Repository: confluentinc/ducktape
Branch: master
Commit: 7b1e2132b337
Files: 193
Total size: 650.7 KB

Directory structure:
gitextract_muj9xuou/

├── .coveragerc
├── .dockerignore
├── .github/
│   └── ISSUE_TEMPLATE/
│       └── bug_report.md
├── .gitignore
├── .readthedocs.yaml
├── .semaphore/
│   └── semaphore.yml
├── CODEOWNERS
├── Dockerfile
├── Jenkinsfile.disabled
├── README.md
├── Vagrantfile
├── docs/
│   ├── Makefile
│   ├── README.md
│   ├── _static/
│   │   └── theme_overrides.css
│   ├── api/
│   │   ├── clusters.rst
│   │   ├── remoteaccount.rst
│   │   ├── services.rst
│   │   ├── templates.rst
│   │   └── test.rst
│   ├── api.rst
│   ├── changelog.rst
│   ├── conf.py
│   ├── debug_tests.rst
│   ├── index.rst
│   ├── install.rst
│   ├── make.bat
│   ├── misc.rst
│   ├── new_services.rst
│   ├── new_tests.rst
│   ├── requirements.txt
│   ├── run_tests.rst
│   └── test_clusters.rst
├── ducktape/
│   ├── __init__.py
│   ├── __main__.py
│   ├── cluster/
│   │   ├── __init__.py
│   │   ├── cluster.py
│   │   ├── cluster_node.py
│   │   ├── cluster_spec.py
│   │   ├── consts.py
│   │   ├── finite_subcluster.py
│   │   ├── json.py
│   │   ├── linux_remoteaccount.py
│   │   ├── localhost.py
│   │   ├── node_container.py
│   │   ├── node_spec.py
│   │   ├── remoteaccount.py
│   │   ├── vagrant.py
│   │   └── windows_remoteaccount.py
│   ├── command_line/
│   │   ├── __init__.py
│   │   ├── defaults.py
│   │   ├── main.py
│   │   └── parse_args.py
│   ├── errors.py
│   ├── json_serializable.py
│   ├── jvm_logging.py
│   ├── mark/
│   │   ├── __init__.py
│   │   ├── _mark.py
│   │   ├── consts.py
│   │   ├── mark_expander.py
│   │   └── resource.py
│   ├── services/
│   │   ├── __init__.py
│   │   ├── background_thread.py
│   │   ├── service.py
│   │   └── service_registry.py
│   ├── template.py
│   ├── templates/
│   │   └── report/
│   │       ├── report.css
│   │       └── report.html
│   ├── tests/
│   │   ├── __init__.py
│   │   ├── event.py
│   │   ├── loader.py
│   │   ├── loggermaker.py
│   │   ├── reporter.py
│   │   ├── result.py
│   │   ├── runner.py
│   │   ├── runner_client.py
│   │   ├── scheduler.py
│   │   ├── serde.py
│   │   ├── session.py
│   │   ├── status.py
│   │   ├── test.py
│   │   └── test_context.py
│   └── utils/
│       ├── __init__.py
│       ├── http_utils.py
│       ├── local_filesystem_utils.py
│       ├── persistence.py
│       ├── terminal_size.py
│       └── util.py
├── requirements-test.txt
├── requirements.txt
├── ruff.toml
├── service.yml
├── setup.cfg
├── setup.py
├── systests/
│   ├── __init__.py
│   └── cluster/
│       ├── __init__.py
│       ├── test_debug.py
│       ├── test_no_cluster.py
│       ├── test_remote_account.py
│       └── test_runner_operations.py
├── tests/
│   ├── __init__.py
│   ├── cluster/
│   │   ├── __init__.py
│   │   ├── check_cluster.py
│   │   ├── check_cluster_spec.py
│   │   ├── check_finite_subcluster.py
│   │   ├── check_json.py
│   │   ├── check_localhost.py
│   │   ├── check_node_container.py
│   │   ├── check_remoteaccount.py
│   │   └── check_vagrant.py
│   ├── command_line/
│   │   ├── __init__.py
│   │   ├── check_main.py
│   │   └── check_parse_args.py
│   ├── ducktape_mock.py
│   ├── loader/
│   │   ├── __init__.py
│   │   ├── check_loader.py
│   │   └── resources/
│   │       ├── __init__.py
│   │       ├── loader_test_directory/
│   │       │   ├── README
│   │       │   ├── __init__.py
│   │       │   ├── invalid_test_suites/
│   │       │   │   ├── empty_file.yml
│   │       │   │   ├── malformed_test_suite.yml
│   │       │   │   ├── not_yaml.yml
│   │       │   │   ├── test_suite_refers_to_non_existent_file.yml
│   │       │   │   ├── test_suite_with_malformed_params.yml
│   │       │   │   └── test_suites_with_no_tests.yml
│   │       │   ├── name_does_not_match_pattern.py
│   │       │   ├── sub_dir_a/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── test_c.py
│   │       │   │   └── test_d.py
│   │       │   ├── sub_dir_no_tests/
│   │       │   │   ├── __init__.py
│   │       │   │   └── just_some_file.py
│   │       │   ├── test_a.py
│   │       │   ├── test_b.py
│   │       │   ├── test_decorated.py
│   │       │   ├── test_suite_cyclic_a.yml
│   │       │   ├── test_suite_cyclic_b.yml
│   │       │   ├── test_suite_decorated.yml
│   │       │   ├── test_suite_import_malformed.yml
│   │       │   ├── test_suite_import_py.yml
│   │       │   ├── test_suite_malformed.yml
│   │       │   ├── test_suite_multiple.yml
│   │       │   ├── test_suite_single.yml
│   │       │   ├── test_suite_with_self_import.yml
│   │       │   ├── test_suite_with_single_import.yml
│   │       │   └── test_suites/
│   │       │       ├── refers_to_parent_dir.yml
│   │       │       ├── sub_dir_a_test_c.yml
│   │       │       ├── sub_dir_a_test_c_via_class.yml
│   │       │       ├── sub_dir_a_with_exclude.yml
│   │       │       ├── sub_dir_test_import.yml
│   │       │       └── test_suite_glob.yml
│   │       └── report.json
│   ├── logger/
│   │   ├── __init__.py
│   │   └── check_logger.py
│   ├── mark/
│   │   ├── __init__.py
│   │   ├── check_cluster_use_metadata.py
│   │   ├── check_env.py
│   │   ├── check_ignore.py
│   │   ├── check_parametrize.py
│   │   └── resources/
│   │       └── __init__.py
│   ├── reporter/
│   │   └── check_symbol_reporter.py
│   ├── runner/
│   │   ├── __init__.py
│   │   ├── check_runner.py
│   │   ├── check_runner_memory.py
│   │   ├── check_sender_receiver.py
│   │   ├── fake_remote_account.py
│   │   └── resources/
│   │       ├── __init__.py
│   │       ├── test_bad_actor.py
│   │       ├── test_failing_tests.py
│   │       ├── test_fails_to_init.py
│   │       ├── test_fails_to_init_in_setup.py
│   │       ├── test_memory_leak.py
│   │       ├── test_thingy.py
│   │       └── test_various_num_nodes.py
│   ├── scheduler/
│   │   ├── __init__.py
│   │   └── check_scheduler.py
│   ├── services/
│   │   ├── __init__.py
│   │   ├── check_background_thread_service.py
│   │   ├── check_jvm_logging.py
│   │   └── check_service.py
│   ├── templates/
│   │   ├── __init__.py
│   │   ├── service/
│   │   │   ├── __init__.py
│   │   │   ├── check_render.py
│   │   │   └── templates/
│   │   │       └── sample
│   │   └── test/
│   │       ├── __init__.py
│   │       ├── check_render.py
│   │       └── templates/
│   │           └── sample
│   ├── test_utils.py
│   ├── tests/
│   │   ├── __init__.py
│   │   ├── check_session.py
│   │   ├── check_test.py
│   │   └── check_test_context.py
│   └── utils/
│       ├── __init__.py
│       └── check_util.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .coveragerc
================================================
[run]
omit =
    .git/*
    .tox/*
    docs/*
    setup.py
    test/*
    tests/*


================================================
FILE: .dockerignore
================================================
*


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---
@confluentinc/devprod-apac-n-frameworks-eng is tagged for visibility

**Describe the bug**
A clear and concise description of what the bug is.


**To Reproduce**
Steps to reproduce the behavior

**Expected behavior**
A clear and concise description of what you expected to happen.

**Additional context**
Add any other context about the problem here.


================================================
FILE: .gitignore
================================================
# Duckape
systests/.ducktape/
systests/results/
results
.ducktape
.vagrant

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
textcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.virtualenvs/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.idea
/.vagrant/
.vscode


================================================
FILE: .readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

build:
  os: ubuntu-20.04
  tools:
    python: "3.13"

sphinx:
  configuration: docs/conf.py

python:
  install:
    - requirements: docs/requirements.txt
    - method: setuptools
      path: .


================================================
FILE: .semaphore/semaphore.yml
================================================
version: v1.0
name: pr-test-job
agent:
  machine:
    type: s1-prod-ubuntu24-04-amd64-1

execution_time_limit:
  hours: 1

global_job_config:
  prologue:
    commands:
      - checkout


blocks:
  - name: Test
    dependencies: []
    task:
      jobs:
        - name: Test Python 3.8
          commands:
            - sem-version python 3.8
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py38.xml'
            - tox -e py38
        - name: Test Python 3.9
          commands:
            - sem-version python 3.9
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py39.xml'
            - tox -e py39
        - name: Test Python 3.10
          commands:
            - sem-version python 3.10
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py310.xml'
            - tox -e py310
        - name: Test Python 3.11
          commands:
            - sem-version python 3.11
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py311.xml'
            - tox -e py311
        - name: Test Python 3.12
          commands:
            - sem-version python 3.12
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py312.xml'
            - tox -e py312
        - name: Test Python 3.13
          commands:
            - sem-version python 3.13
            - pip install tox
            - export PYTESTARGS='--junitxml=test/results-py313.xml'
            - tox


================================================
FILE: CODEOWNERS
================================================
* @confluentinc/cp-test-frameworks-and-readiness


================================================
FILE: Dockerfile
================================================
# An image of ducktape that can be used to setup a Docker cluster where ducktape is run inside the container.

FROM ubuntu:14.04

RUN apt-get update && \
    apt-get install -y libffi-dev libssl-dev openssh-server python-dev python-pip python-virtualenv && \
    virtualenv /opt/ducktape && \
    . /opt/ducktape/bin/activate && \
    pip install -U pip setuptools wheel && \
    pip install bcrypt cryptography==2.2.2 pynacl && \
    mkdir /var/run/sshd && \
    mkdir /root/.ssh && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

ARG DUCKTAPE_VERSION=0.7.3

RUN . /opt/ducktape/bin/activate && \
    pip install ducktape==$DUCKTAPE_VERSION && \
    ln -s /opt/ducktape/bin/ducktape /usr/local/bin/ducktape && \
    deactivate && \
    /usr/local/bin/ducktape --version

EXPOSE 22

CMD    ["/usr/sbin/sshd", "-D"]


================================================
FILE: Jenkinsfile.disabled
================================================
python {
    publish = false  // Release is done manually to PyPI and not supported yet.
}


================================================
FILE: README.md
================================================
[![Documentation Status](https://readthedocs.org/projects/ducktape/badge/?version=latest)](https://ducktape.readthedocs.io/en/latest/?badge=latest)


Distributed System Integration & Performance Testing Library
============================================================

Overview
--------

Ducktape contains tools for running system integration and performance tests. It provides the following features:

* Isolation by default so system tests are as reliable as possible.
* Utilities for pulling up and tearing down services easily in clusters in different environments
  (e.g. local, custom cluster, Vagrant, K8s, Mesos, Docker, cloud providers, etc.)
* Easy to write unit tests for distributed systems
* Trigger special events (e.g. bouncing a service)
* Collect results (e.g. logs, console output)
* Report results (e.g. expected conditions met, performance results, etc.)

Documentation
-------------

For detailed documentation on how to install, run, create new tests please refer to: http://ducktape.readthedocs.io/

Contribute
----------

- Source Code: https://github.com/confluentinc/ducktape
- Issue Tracker: https://github.com/confluentinc/ducktape/issues

License
-------
The project is licensed under the Apache 2 license.


================================================
FILE: Vagrantfile
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- mode: ruby -*-
# vi: set ft=ruby :

require 'socket'

# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
VAGRANTFILE_API_VERSION = "2"

# General config
enable_dns = false
num_workers = 3
ram_megabytes = 300
base_box = "ubuntu/focal64"

local_config_file = File.join(File.dirname(__FILE__), "Vagrantfile.local")
if File.exists?(local_config_file) then
  eval(File.read(local_config_file), binding, "Vagrantfile.local")
end

Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
  config.hostmanager.enabled = true
  config.hostmanager.manage_host = enable_dns
  config.hostmanager.include_offline = false

  ## Provider-specific global configs
  config.vm.provider :virtualbox do |vb,override|
    override.vm.box = base_box

    override.hostmanager.ignore_private_ip = false

    # Brokers started with the standard script currently set Xms and Xmx to 1G,
    # plus we need some extra head room.
    vb.customize ["modifyvm", :id, "--memory", ram_megabytes.to_s]
  end

  ## Cluster definition
  (1..num_workers).each { |i|
    name = "ducktape" + i.to_s
    config.vm.define name do |worker|
      worker.vm.hostname = name
      worker.vm.network :private_network, ip: "192.168.56." + (150 + i).to_s
    end
  }

end


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = ducktape
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

================================================
FILE: docs/README.md
================================================
Ducktape documentation quick start guide
========================================


Build the documentation
-----------------------

To render the pages run::
```shell
tox -e docs
```
    
The rendered pages will be in ``docs/_build/html``


Specify documentation format
----------------------------

Documentation is built using [sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command.
You can select which builder to use using SPHINX_BUILDER command:
```shell
SPHINX_BUILDER=man tox -e docs
```
All available values: https://www.sphinx-doc.org/en/master/man/sphinx-build.html#cmdoption-sphinx-build-M


Pass options to sphinx-build
----------------------------
Any argument after `--` will be passed to the 
[sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command directly:
```shell
tox -e docs -- -E
```


================================================
FILE: docs/_static/theme_overrides.css
================================================
/* override table width restrictions */
@media screen and (min-width: 767px) {

   .wy-table-responsive table td {
      /* !important prevents the common CSS stylesheets from overriding
         this as on RTD they are loaded after this stylesheet */
      white-space: normal !important;
   }

   .wy-table-responsive {
      overflow: visible !important;
   }
}

================================================
FILE: docs/api/clusters.rst
================================================
Clusters
========

.. autoclass:: ducktape.cluster.cluster.Cluster
    :members:

.. autoclass:: ducktape.cluster.vagrant.VagrantCluster
    :members:

.. autoclass:: ducktape.cluster.localhost.LocalhostCluster
    :members:

.. autoclass:: ducktape.cluster.json.JsonCluster
    :members:


================================================
FILE: docs/api/remoteaccount.rst
================================================
Remote Account
==============

.. autoclass:: ducktape.cluster.remoteaccount.RemoteAccount
    :members:

.. autoclass:: ducktape.cluster.remoteaccount.LogMonitor
    :members:

.. autoclass:: ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount
    :members:

.. autoclass:: ducktape.cluster.windows_remoteaccount.WindowsRemoteAccount
    :members:


================================================
FILE: docs/api/services.rst
================================================
Services
========

.. autoclass:: ducktape.services.service.Service
    :members:

.. autoclass:: ducktape.services.background_thread.BackgroundThreadService
    :members:


================================================
FILE: docs/api/templates.rst
================================================
Template
========

.. autoclass:: ducktape.template.TemplateRenderer
    :members:

================================================
FILE: docs/api/test.rst
================================================
Test
====

.. autoclass:: ducktape.tests.test.Test
    :members:

.. autoclass:: ducktape.tests.test.TestContext
    :members:


================================================
FILE: docs/api.rst
================================================
.. _topics-api:

=======
API Doc
=======

.. toctree::
    api/test
    api/services
    api/remoteaccount
    api/clusters
    api/templates


================================================
FILE: docs/changelog.rst
================================================
.. _topics-changelog:

====
Changelog
====

0.14.0
======
Tuesday, March 10th, 2026
-------------------------
- Ensure log collection in case of runner client unresponsive issue
- Enable jvm logging for java based services
- Graceful shutdown and reporting fix for runner client unresponsive issue
- Add heterogeneous cluster support
- Add types to ducktape
- Call JUnitReporter after each test completes
- Update dependency requests to v2.32.4 [security]


0.13.0
======
Monday, June 09th, 2025
-----------------------
- Report expected test count in the summary
- Add python 3.13 support and run pr test job with all supported python versions
- Upgrade PyYAML and fix style error
- Add support for historical report in loader
- Update dependency requests to v2.32.2
- Update dependency pycryptodome to v3.19.1
- Removing generated internal project.yml, public project.yml


0.12.0
======
Friday, October 04th, 2024
--------------------------
- Store summary of previous runs when deflaking
- Runner Client Minor Refactor and Test
- Adding nodes used in test summary for HTML report
- Parse args from config files independently
- add support to python 3.10, 3.11 and 3.12


0.11.4
======
Friday, August 18th, 2023
-------------------------
- Updated `requests` version to 2.31.0

0.11.3
======
Wednesday, November 30th, 2022
------------------------------
- Bugfix: fixed an edge case when BackgroundThread wait() method errors out if start() method has never been called.

0.11.2
======
Wednesday, November 30th, 2022
------------------------------
- Bugfix: fixed an edge case when BackgroundThread wait() method errors out if start() method has never been called.

0.11.1
======
- Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`.
- Bumped `jinja2` to `3.0.x`

0.11.0
======
- Option to fail tests without `@cluster` annotation. Deprecate ``min_cluster_spec()`` method in the ``Test`` class - `#336 <https://github.com/confluentinc/ducktape/pull/336>`_

0.10.3
======
Friday, August 18th, 2023
-------------------------
- Updated `requests` version to 2.31.0

0.10.2
======
- Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`.

0.10.1
======
- Disable health checks for nodes, effectively disabling `#325 <https://github.com/confluentinc/ducktape/pull/325>`_. See github issue for details - `#339 <https://github.com/confluentinc/ducktape/issues/339>`_

0.10.0
======
- **DO NOT USE**, this release has a nasty bug - `#339 <https://github.com/confluentinc/ducktape/issues/339>`_
- Do not schedule tests on unresponsive nodes - `#325 <https://github.com/confluentinc/ducktape/pull/325>`_

0.9.4
=====
Friday, August 18th, 2023
-------------------------
- Updated `requests` version to 2.31.0

0.9.3
=====
- Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`.

0.9.2
=====
- Service release, no ducktape changes, simply fixed readthedocs configs.

0.9.1
=====
- use a generic network device based on the devices found on the remote machine rather than a hardcoded one - `#314 <https://github.com/confluentinc/ducktape/pull/314>`_ and `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
- clean up process properly after an exception during test runner execution - `#323 <https://github.com/confluentinc/ducktape/pull/323>`_
- log ssh errors - `#319 <https://github.com/confluentinc/ducktape/pull/319>`_
- update vagrant tests to use ubuntu20 - `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
- added command to print the total number of nodes the tests run will require - `#320 <https://github.com/confluentinc/ducktape/pull/320>`_
- drop support for python 3.6 and add support for python 3.9 - `#317 <https://github.com/confluentinc/ducktape/pull/317>`_

0.9.0
=====
- Upgrade paramiko version to 2.10.0 - `#312 <https://github.com/confluentinc/ducktape/pull/312>`_
- Support SSH timeout - `#311 <https://github.com/confluentinc/ducktape/pull/311>`_

0.8.18
======
- Updated `requests` version to `2.31.0`

0.8.17
======
- Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`.

0.8.x
=====
- Support test suites
- Easier way to rerun failed tests - generate test suite with all the failed tests and also print them in the log so that user can copy them and paste as ducktape command line arguments
- Python 2 is no longer supported, minimum supported version is 3.6
- Added `--deflake N` flag - if provided, it will attempt to rerun each failed test  up to N times, and if it eventually passes, it will be marked as Flaky - `#299 <https://github.com/confluentinc/ducktape/pull/299>`_
- [backport, also in 0.9.1] - use a generic network device based on the devices found on the remote machine rather than a hardcoded one - `#314 <https://github.com/confluentinc/ducktape/pull/314>`_ and `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
- [backport, also in 0.9.1] - clean up process properly after an exception during test runner execution - `#323 <https://github.com/confluentinc/ducktape/pull/323>`_
- [backport, also in 0.9.1] - log ssh errors - `#319 <https://github.com/confluentinc/ducktape/pull/319>`_
- [backport, also in 0.9.1] - update vagrant tests to use ubuntu20 - `#328 <https://github.com/confluentinc/ducktape/pull/328>`_
- [backport, also in 0.9.1] - added command to print the total number of nodes the tests run will require - `#320 <https://github.com/confluentinc/ducktape/pull/320>`_


================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# ducktape documentation build configuration file, created by
# sphinx-quickstart on Mon Mar 13 14:06:07 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import sphinx_rtd_theme
from ducktape import __version__

sys.path.insert(0, os.path.abspath(".."))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["sphinx.ext.viewcode", "sphinx.ext.autodoc", "sphinxarg.ext"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "Ducktape"
copyright = "2017, Confluent Inc."
author = "Confluent Inc."

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#

# The short X.Y version.
version = __version__
# The full version, including alpha/beta/rc tags.
release = version

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "ducktapedoc"


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, "ducktape.tex", "Ducktape Documentation", "Confluent Inc.", "manual"),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "ducktape", "Ducktape Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "ducktape",
        "Ducktape Documentation",
        author,
        "ducktape",
        "One line description of project.",
        "Miscellaneous",
    ),
]


# ---- Options for Autodoc ------------------------------------------------

autodoc_default_flags = ["show-inheritance"]


def skip(app, what, name, obj, skip, options):
    if name == "__init__":
        return False
    return skip


def setup(app):
    app.connect("autodoc-skip-member", skip)


================================================
FILE: docs/debug_tests.rst
================================================
.. _topics-debug_tests:

===========
Debug Tests
===========

The test results go in ``results/<date>—<test_number>``. For results from a particular test, look for ``results/<date>—<test_number>/test_class_name/<test_method_name>/`` directory. The ``test_log.debug`` file will contain the log output from the python driver, and logs of services used in the test will be in ``service_name/node_name`` sub-directory.

If there is not enough information in the logs, you can re-run the test with ``--no-teardown`` argument.

.. code-block:: bash

    ducktape dir/tests/my_test.py::TestA.test_a --no-teardown


This will run the test but will not kill any running processes or remove log files when the test finishes running. Then, you can examine the state of a running service or the machine when the service process is running by logging into that machine. Suppose you suspect a particular service being the cause of the test failure. You can find out which machine was allocated to that service by either looking at ``test_log.debug`` or at directory names under ``results/<date>—<test_number>/test_class_name/<test_method_name>/service_name/``. It could be useful to add an explicit debug log to ``start_node`` method with a node ID and node’s hostname information for easy debugging:

.. code-block:: python

    def start_node(self, node):
        idx = self.idx(node)
        self.logger.info("Starting ZK node %d on %s", idx, node.account.hostname)

The log statement will look something like this::

    [INFO  - 2017-03-28 22:07:25,222 - zookeeper - start_node - lineno:50]: Starting ZK node 1 on worker1

If you are using Vagrant for example, you can then log into that node via:

.. code-block:: bash

    vagrant ssh worker1


Use Logging
===========

Distributed system tests can be difficult to debug. You want to add a lot of logging for debugging and tracking progress of the test. A good approach would be to log an intention of an operation with some useful information before any operation that can fail. It could be a good idea to use a higher logging level than you would in production so more info is available. For example, make your log levels default to DEBUG instead of INFO. Also, put enough information to a message of ``assert`` to help figure out what went wrong as well as log messages. Consider an example of testing ElasticSearch service:

.. code-block:: python

        res = es.search(index="test-index", body={"query": {"match_all": {}}})
        self.logger.debug("result: %s" % res['hits'])
        assert res['hits']['total'] == 1, "Expected total 1 hit, but got %d" % res['hits']['total']
        for hit in res['hits']['hits']:
            assert 'kimchy’ == hit['_source']['author’], "Expected author kimchy but got %s" % hit['_source']['author']
            assert 'Elasticsearch: cool.' == hit['_source']['text’], "Expected text Elasticsearch: cool. but got %s" % hit['_source']['text’]

First, the tests outputs the result of a search, so that if any of the following assertions fail, we can see the whole result in ``test_log.debug``. Assertion messages help to quickly see the difference in expected and retrieved results. 


Fail early
==========

Try to avoid a situation where a test fails because of an uncaught failure earlier in the test. Suppose we write a ``start_node`` method that does not check if the service starts successfully. The service fails to start, but we get a test failure indication that there was a problem querying the service. It would be much faster to debug the issue if the test failure pointed to the issue with starting the service. So make sure to add checks for operations that may fail, and fail the test earlier than later.


Flaky tests
============

Flaky tests are hard to debug due to their non-determinism, they waste time, and sometimes hide real bugs: developers tend to ignore those failures, and thus could miss real bugs. Flakiness can come from the test itself, the system it is testing, or the environmental issues.

Waiting on Conditions
^^^^^^^^^^^^^^^^^^^^^

A common cause of a flaky test is asynchronous wait on conditions. A test makes an asynchronous call and does not properly wait for the result of the call to become available before using it::

	node.account.kill_process("zookeeper", allow_fail=False)
	time.sleep(2)
	assert not self.alive(node), “Expected Zookeeper service to stop” 

In this example, the test terminates a zookeeper service via ``kill_process`` and then uses ``time.sleep`` to wait for it to stop. If terminating the process takes longer, the test will fail. The test may intermittently fail based on how fast a process terminates. Of course, there should be a timeout for termination to ensure that test does not run indefinitely. You could increase sleep time, but that also increases the test run length. A more explicit way to express this condition is to use :meth:`~ ducktape.utils.util.wait_until` with a timeout::

	node.account.kill_process("zookeeper", allow_fail=False)
	wait_until(lambda: not self.alive(node),
                   timeout_sec=5,
                   err_msg="Timed out waiting for zookeeper to stop.")

The test will progress as soon as condition is met, and timeout ensures that the test does not run indefinitely if termination never ends.

Think carefully about the condition to check. A common source of issues is incorrect choice of condition of successful service start in ``start_node`` implementation. One way to check that a service starts successfully is to wait for some specific log output. However, make sure that this specific log message is always printed after the things run successfully. If there is still a chance that service may fail to start after the log is printed, this may cause race conditions and flaky tests. Sometimes it could be better to check if the service runs successfully by querying a service or checking some metrics if they are available.


Test Order Dependency
^^^^^^^^^^^^^^^^^^^^^

Make sure that your services properly cleanup the state in ``clean_node`` implementation. Failure to properly clean up the state can cause the next run of the test to fail or fail intermittently if other tests happen to clean same directories for example. One of the benefits of isolation that ducktape assumes is that you can assume you have complete control of the machine. It is ok to delete the entire working space. It is also safe to kill all java processes you can find rather than being more targeted. So, clean up aggressively.

Incorrect Assumptions
^^^^^^^^^^^^^^^^^^^^^

It is possible that assumptions about how the system works that we are testing are incorrect. One way to help debug this is to use more detailed comments why certain checks are made.


Tools for Managing Logs
=======================

Analyzing and matching up logs from a distributed service could be time consuming. There are many good tools for working with logs. Examples include http://lnav.org/, http://list.xmodulo.com/multitail.html, and http://glogg.bonnefon.org/.

Validating Ssh Issues
=======================

Ducktape supports running custom validators when an ssh error occurs, allowing you to run your own validation against a host.
this is done simply by running ducktape with the `--ssh-checker-function`, followed by the module path to your function, so for instance::
    
    ducktape my-test.py --ssh-checker-function my.module.validator.validate_ssh

this function will take in the ssh error raised as its first argument, and the remote account object as its second.


================================================
FILE: docs/index.rst
================================================
.. _topics-index:

============================================================
Distributed System Integration & Performance Testing Library
============================================================
Ducktape contains tools for running system integration and performance tests. It provides the following features:

   * Write tests for distributed systems in a simple unit test-like style
   * Isolation by default so system tests are as reliable as possible.
   * Utilities for pulling up and tearing down services easily in clusters in different environments (e.g. local, custom cluster, Vagrant, K8s, Mesos, Docker, cloud providers, etc.)
   * Trigger special events (e.g. bouncing a service)
   * Collect results (e.g. logs, console output)
   * Report results (e.g. expected conditions met, performance results, etc.)

.. toctree::
   install
   test_clusters
   run_tests
   new_tests
   new_services
   debug_tests
   api
   misc
   changelog

Contribute
==========

- Source Code: https://github.com/confluentinc/ducktape
- Issue Tracker: https://github.com/confluentinc/ducktape/issues

License
=======

The project is licensed under the Apache 2 license.


================================================
FILE: docs/install.rst
================================================
.. _topics-install:

=======
Install
=======

1. Ducktape requires python 3.7 or later.

2. Install `cryptography`_ (used by `paramiko` which Ducktape depends on), this may have non-python external requirements

.. _cryptography: https://cryptography.io/en/latest/installation

    * OSX (if needed)::

        brew install openssl

    * Ubuntu::

        sudo apt-get install build-essential libssl-dev libffi-dev python-dev

    * Fedora and RHEL-derivatives::

        sudo yum install gcc libffi-devel python-devel openssl-devel


3. As a general rule, it's recommended to use an isolation tool such as ``virtualenv``

4. Install Ducktape::

    pip install ducktape

.. note::

    On OSX you may need to::

        C_INCLUDE_PATH=/usr/local/opt/openssl/include LIBRARY_PATH=/usr/local/opt/openssl/lib pip install ducktape

    If you are not using a virtualenv and get the error message `failed with error code 1`, you may need to install ducktape to your user directory instead with ::

        pip install --user ducktape


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=ducktape

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd


================================================
FILE: docs/misc.rst
================================================
.. _topics-misc:

====
Misc
====

Developer Install
=================

If you are are a ducktape developer, consider using the develop command instead of install. This allows you to make code changes without constantly reinstalling ducktape (see http://stackoverflow.com/questions/19048732/python-setup-py-develop-vs-install for more information)::

    cd ducktape
    python setup.py develop

To uninstall::

    cd ducktape
    python setup.py develop --uninstall


Unit Tests
==========

You can run the tests with code coverage and style check using `tox <https://tox.readthedocs.io/en/latest/>`_::

    tox

Alternatively, you can activate the virtualenv and run pytest and ruff directly::

    source ~/.virtualenvs/ducktape/bin/activate
    pytest tests
    ruff check
    ruff format --check


System Tests
============

System tests are included under the `systests/` directory. These tests are end to end tests that run across multiple VMs, testing ducktape in an environment similar to how it would be used in practice to test other projects.

The system tests run against virtual machines managed by `Vagrant <https://www.vagrantup.com/>`_. With Vagrant installed, start the VMs (3 by default)::

  vagrant up

From a developer install, running the system tests now looks the same as using ducktape on your own project::

  ducktape systests/

You should see the tests running, and then results and logs will be in the default directory, `results/`. By using a developer install, you can make modifications to the ducktape code and iterate on system tests without having to re-install after each modification.

When you're done running tests, you can destroy the VMs::

  vagrant destroy


Windows
=======

Ducktape support Services that run on Windows, but only in EC2.

When a ``Service`` requires a Windows machine, AWS credentials must be configured on the machine running ducktape.

Ducktape uses the `boto3`_ Python module to connect to AWS. And ``boto3`` support many different `configuration options`_

.. _boto3: https://aws.amazon.com/sdk-for-python/
.. _configuration options: https://boto3.readthedocs.io/en/latest/guide/configuration.html#guide-configuration

Here's an example bare minimum configuration using environment variables::

    export AWS_ACCESS_KEY_ID="ABC123"
    export AWS_SECRET_ACCESS_KEY="secret"
    export AWS_DEFAULT_REGION="us-east-1"

The region can be any AWS region, not just ``us-east-1``.


================================================
FILE: docs/new_services.rst
================================================
.. _topics-new_services:

===================
Create New Services
===================

Writing ducktape services
=============================

``Service`` refers generally to multiple processes, possibly long-running, which you
want to run on the test cluster.

These can be services you would actually deploy (e.g., Kafka brokers, ZK servers, REST proxy) or processes used during testing (e.g. producer/consumer performance processes). Services that are distributed systems can support a variable number of nodes which allow them to handle a variety of tests.

Each service is implemented as a class and should at least implement the following:

    * :meth:`~ducktape.services.service.Service.start_node` - start the service (possibly waiting to ensure it started successfully)

    * :meth:`~ducktape.services.service.Service.stop_node` - kill processes on the given node

    * :meth:`~ducktape.services.service.Service.clean_node` - remove persistent state leftover from testing, e.g. log files

These may block to ensure services start or stop properly, but must *not* block for the full lifetime of the service. If you need to run a blocking process (e.g. run a process via SSH and iterate over its output), this should be done in a background thread. For services that exit after completing a fixed operation (e.g. produce N messages to topic foo), you should also implement ``wait``, which will usually just wait for background worker threads to exit. The ``Service`` base class provides a helper method ``run`` which wraps ``start``, ``wait``, and ``stop`` for tests that need to start a service and wait for it to finish. You can also provide additional helper methods for common test functionality. Normal services might provide a ``bounce`` method.

Most of the code you'll write for a service will just be series of SSH commands and tests of output. You should request the number of nodes you'll need using the ``num_nodes`` or ``cluster_spec`` parameter to the Service base class's constructor. Then, in your Service's methods you'll have access to ``self.nodes`` to access the nodes allocated to your service. Each node has an associated :class:`~ducktape.cluster.remoteaccount.RemoteAccount` instance which lets you easily perform remote operations such as running commands via SSH or creating files. By default, these operations try to hide output (but provide it to you if you need to extract some subset of it) and *checks status codes for errors* so any operations that fail cause an obvious failure of the entire test.

.. _service-example-ref:

New Service Example
===================

Let’s walk through an example of writing a simple Zookeeper service.

.. code-block:: python

    class ZookeeperService(Service):
        PERSISTENT_ROOT = "/mnt"
        LOG_FILE = os.path.join(PERSISTENT_ROOT, "zk.log")
        DATA_DIR = os.path.join(PERSISTENT_ROOT, "zookeeper")
        CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "zookeeper.properties")

        logs = {
            "zk_log": {
                "path": LOG_FILE,
                "collect_default": True},
            "zk_data": {
                "path": DATA_DIR,
                "collect_default": False}
        }

        def __init__(self, context, num_nodes):
            super(ZookeeperService, self).__init__(context, num_nodes)


``logs`` is a member of ``Service`` that provides a mechanism for locating and collecting log files produced by the service on its nodes. ``logs`` is a dict with entries that look like ``log_name: {"path": log_path, "collect_default": boolean}``. In our example, log files will be collected on both successful and failed test runs, while files from the data directory will be collected only on failed test runs. Zookeeper service requests the number of nodes passed to its constructor by passing ``num_nodes`` parameters to the Service base class’s constructor.

.. code-block:: python

        def start_node(self, node):
            idx = self.idx(node)
            self.logger.info("Starting ZK node %d on %s", idx, node.account.hostname)

            node.account.ssh("mkdir -p %s" % self.DATA_DIR)
            node.account.ssh("echo %d > %s/myid" % (idx, self.DATA_DIR))

            prop_file = """\n dataDir=%s\n clientPort=2181""" % self.DATA_DIR
            for idx, node in enumerate(self.nodes):
                prop_file += "\n server.%d=%s:2888:3888" % (idx, node.account.hostname)
            self.logger.info("zookeeper.properties: %s" % prop_file)
            node.account.create_file(self.CONFIG_FILE, prop_file)

            start_cmd = "/opt/kafka/bin/zookeeper-server-start.sh %s 1>> %s 2>> %s &" % \
                    (self.CONFIG_FILE, self.LOG_FILE, self.LOG_FILE)

            with node.account.monitor_log(self.LOG_FILE) as monitor:
                node.account.ssh(start_cmd)
                monitor.wait_until(
                    "binding to port",
                    timeout_sec=100,
                    backoff_sec=7,
                    err_msg="Zookeeper service didn't finish startup"
                )
            self.logger.debug("Zookeeper service is successfully started.")


The ``start_node`` method first creates directories and the config file on the given node, and then invokes the start script to start a Zookeeper service. In this simple example, the config file is created from manually constructed ``prop_file`` string, because it has only a couple of easy to construct lines. More complex config files can be created with templates, as described in :ref:`using-templates-ref`.

A service may take time to start and get to a usable state. Using sleeps to wait for a service to start often leads to a flaky test. The sleep time may be too short, or the service may fail to start altogether. It is useful to verify that the service starts properly before returning from the ``start_node``, and fail the test if the service fails to start. Otherwise, the test will likely fail later, and it would be harder to find the root cause of the failure. One way to check that the service starts successfully is to check whether a service’s process is alive and one additional check that the service is usable such as querying the service or checking some metrics if they are available. Our example checks whether a Zookeeper service is started successfully by searching for a particular output in a log file.

The :class:`~ducktape.cluster.remoteaccount.RemoteAccount` instance associated with each node provides you with :class:`~ducktape.cluster.remoteaccount.LogMonitor` that let you check or wait for a pattern to appear in the log. Our example waits for 100 seconds for “binding to port” string to appear in the ``self.LOG_FILE`` log file, and raises an exception if it does not.

.. code-block:: python

    def pids(self, node):
        try:
            cmd = "ps ax | grep -i zookeeper | grep java | grep -v grep | awk '{print $1}'"
            pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)]
            return pid_arr
        except (RemoteCommandError, ValueError) as e:
            return []

    def alive(self, node):
        return len(self.pids(node)) > 0

    def stop_node(self, node):
        idx = self.idx(node)
        self.logger.info("Stopping %s node %d on %s" % (type(self).__name__, idx, node.account.hostname))
        node.account.kill_process("zookeeper", allow_fail=False)

    def clean_node(self, node):
        self.logger.info("Cleaning Zookeeper node %d on %s", self.idx(node), node.account.hostname)
        if self.alive(node):
            self.logger.warn("%s %s was still alive at cleanup time. Killing forcefully..." %
                             (self.__class__.__name__, node.account))
        node.account.kill_process("zookeeper", clean_shutdown=False, allow_fail=True)
        node.account.ssh("rm -rf /mnt/zookeeper /mnt/zookeeper.properties /mnt/zk.log",
                         allow_fail=False)


The ``stop_node`` method uses :meth:`~ducktape.cluster.remoteaccount.RemoteAccount.kill_process` to terminate the service process on the given node. If the remote command to terminate the process fails, :meth:`~ducktape.cluster.remoteaccount.RemoteAccount.kill_process` will raise an ``RemoteCommandError`` exception.

The ``clean_node`` method forcefully kills the process if it is still alive, and then removes persistent state leftover from testing. Make sure to properly cleanup the state to avoid test order dependency and flaky tests. You can assume complete control of the machine, so it is safe to delete an entire temporary working space and kill all java processes, etc.

.. _using-templates-ref:


Using Templates
===============

Both ``Service`` and ``Test`` subclass :class:`~ducktape.template.TemplateRenderer` that lets you render templates directly from strings or from files loaded from *templates/* directory relative to the class. A template contains variables and/or expressions, which are replaced with values when a template is rendered. :class:`~ducktape.template.TemplateRenderer` renders templates using `Jinja2 <http://jinja.pocoo.org/docs/2.9/>`_ template engine. A good use-case for templates is a properties file that needs to be passed to a service process. In :ref:`service-example-ref`, the properties file is created by building a string and using it as contents as follows::

        prop_file = """\n dataDir=%s\n clientPort=2181""" % self.DATA_DIR
        for idx, node in enumerate(self.nodes):
            prop_file += "\n server.%d=%s:2888:3888" % (idx, node.account.hostname)
        node.account.create_file(self.CONFIG_FILE, prop_file)

A template approach is to add a properties file in *templates/* directory relative to the ZookeeperService class:

.. code-block:: rst

    dataDir={{ DATA_DIR }}
    clientPort=2181
    {% for node in nodes %}
    server.{{ loop.index }}={{ node.account.hostname }}:2888:3888
    {% endfor %}


Suppose we named the file zookeeper.properties. The creation of the config file will look like this:

.. code-block:: python

        prop_file = self.render('zookeeper.properties')
        node.account.create_file(self.CONFIG_FILE, prop_file)


================================================
FILE: docs/new_tests.rst
================================================
.. _topics-new_tests:

================
Create New Tests
================

Writing ducktape Tests
======================

Subclass :class:`~ducktape.tests.test.Test` and implement as many ``test`` methods as you
want. The name of each test method must start or end with ``test``,
e.g. ``test_functionality`` or ``example_test``. Typically, a test will
start a few services, collect and/or validate some data, and then finish.

If the test method finishes with no exceptions, the test is recorded as successful, otherwise it is recorded as a failure.


Here is an example of a test that just starts a Zookeeper cluster with 2 nodes, and a
Kafka cluster with 3 nodes::

    class StartServicesTest(Test):
        """Make sure we can start Kafka and Zookeeper services."""
        def __init__(self, test_context):
            super(StartServicesTest, self).__init__(test_context=test_context)
            self.zk = ZookeeperService(test_context, num_nodes=2)
            self.kafka = KafkaService(test_context, num_nodes=3, self.zk)

        def test_services_start(self):
            self.zk.start()
            self.kafka.start()

Test Parameters
===============

Use test decorators to parametrize tests, examples are provided below

.. autofunction:: ducktape.mark.parametrize
.. autofunction:: ducktape.mark.matrix
.. autofunction:: ducktape.mark.resource.cluster
.. autofunction:: ducktape.mark.ignore

Logging
=======

The :class:`~ducktape.tests.test.Test` base class sets up logger you can use which is tagged by class name,
so adding some logging for debugging or to track the progress of tests is easy::

    self.logger.debug("End-to-end latency %d: %s", idx, line.strip())

These types of tests can be difficult to debug, so err toward more rather than
less logging.

.. note:: Logs are collected a multiple log levels, and only higher log levels are displayed to the console while the test runs. Make sure you log at the appropriate level.

JVM Logging
-----------

For Java-based services, ducktape can automatically collect JVM diagnostic logs without requiring any code changes to services or tests. Enable it with the ``--enable-jvm-logs`` flag::

    ducktape --enable-jvm-logs <test_path>

When enabled, ducktape wraps the service's ``start_node`` and ``clean_node`` methods to:

- Create a log directory (``/mnt/jvm_logs``) on each worker node before the service starts.
- Prepend ``JDK_JAVA_OPTIONS`` with the JVM logging flags to every SSH command sent to the node, so the options are inherited by any Java process the service launches.
- Remove the log directory after ``clean_node`` runs and restore the original SSH methods.

The following JVM options are injected automatically:

.. list-table::
   :header-rows: 1
   :widths: 40 60

   * - Option
     - Purpose
   * - ``-Xlog:disable``
     - Suppress default JVM console output to avoid polluting test logs
   * - ``-Xlog:gc*:file=<log_dir>/gc.log``
     - GC activity with timestamps, uptime, level, and tags
   * - ``-XX:+HeapDumpOnOutOfMemoryError``
     - Generate a heap dump when an OOM error occurs
   * - ``-XX:HeapDumpPath=<log_dir>/heap_dump.hprof``
     - Location for the heap dump file
   * - ``-Xlog:safepoint=info:file=<log_dir>/jvm.log``
     - Safepoint pause events
   * - ``-Xlog:class+load=info:file=<log_dir>/jvm.log``
     - Class loading events
   * - ``-XX:ErrorFile=<log_dir>/hs_err_pid%p.log``
     - Fatal error log (JVM crashes)
   * - ``-XX:NativeMemoryTracking=summary``
     - Native memory usage tracking
   * - ``-Xlog:jit+compilation=info:file=<log_dir>/jvm.log``
     - JIT compilation events

The following log files are collected from each node:

.. list-table::
   :header-rows: 1
   :widths: 30 20 50

   * - File
     - Collected by default
     - Contents
   * - ``gc.log``
     - Yes
     - Garbage collection activity
   * - ``jvm.log``
     - Yes
     - Safepoint, class loading, and JIT compilation events
   * - ``heap_dump.hprof``
     - No (failure only)
     - Heap dump generated on OutOfMemoryError

.. note:: If a service or test injects its own ``-Xlog`` options as part of the command, those options will override the ones injected by JVM logging, since ducktape prepends ``JDK_JAVA_OPTIONS`` before the command. In practice, services should behave as expected.

New test example
================

Lets expand on the StartServicesTest example. The test starts a Zookeeper cluster with 2 nodes, and a
Kafka cluster with 3 nodes, and then bounces a kafka broker node which is either a special controller node or a non-controller node, depending on the `bounce_controller_broker` test parameter.

.. code-block:: python

    class StartServicesTest(Test):
        def __init__(self, test_context):
            super(StartServicesTest, self).__init__(test_context=test_context)
            self.zk = ZookeeperService(test_context, num_nodes=2)
            self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk)

        def setUp(self):
            self.zk.start()
            self.kafka.start()

        @matrix(bounce_controller_broker=[True, False])
        def test_broker_bounce(self, bounce_controller_broker=False):
            controller_node = self.kafka.controller()
            self.logger.debug("Found controller broker %s", controller_node.account)
            if bounce_controller_broker:
                bounce_node = controller_node
            else:
                bounce_node = self.kafka.nodes[(self.kafka.idx(controller_node) + 1) % self.kafka.num_nodes]

            self.logger.debug("Will hard kill broker %s", bounce_node.account)
            self.kafka.signal_node(bounce_node, sig=signal.SIGKILL)

            wait_until(lambda: not self.kafka.is_registered(bounce_node),
                       timeout_sec=self.kafka.zk_session_timeout + 5,
                       err_msg="Failed to see timely deregistration of hard-killed broker %s"
                               % bounce_node.account)

            self.kafka.start_node(bounce_node)

This will run two tests, one with ‘bounce_controller_broker’: False and another with 'bounce_controller_broker': True arguments. We moved start of Zookeeper and Kafka services to :meth:`~ducktape.tests.test.Test.setUp`, which is called before every test run.

The test finds which of Kafka broker nodes is a special controller node via provided ``controller`` method in KafkaService. The ``controller`` method in KafkaService will raise an exception if the controller node is not found. Make sure to check the behavior of methods provided by a service or other helper classes and fail the test as soon as an issue is found. That way, it will be much easier to find the cause of the test failure.

The test then finds the node to bounce based on `bounce_controller_broker` test parameter and then forcefully terminates the service process on that node via ``signal_node`` method of KafkaService. This method just sends a signal to forcefully kill the process, and does not do any further check. Thus, our test needs to check that the hard killed kafka broker is not part of the Kafka cluster anymore, before restarting the killed broker process. We do this by waiting on ``is_registered`` method provided by KafkaService to return False with a timeout, since de-registering the broker may take some time. Notice the use of ``wait_until`` method instead of a check after ``time.sleep``. This allows the test to continue as soon as de-registration happens.

We don’t check if the restarted broker is registered, because this is already done in KafkaService  ``start_node`` implementation, which will raise an exception if the service is not started successfully on a given node.


================================================
FILE: docs/requirements.txt
================================================
Sphinx~=8.2.3
sphinx-argparse~=0.5.2
sphinx-rtd-theme~=3.0.2
boto3==1.33.13
pycryptodome==3.23.0
pywinrm==0.4.3
jinja2~=3.1.6
MarkupSafe~=2.1.5


================================================
FILE: docs/run_tests.rst
================================================
.. _topics-run_tests:

=========
Run Tests
=========

Running Tests
=============

ducktape discovers and runs tests in the path(s) provided.
You can specify a folder with tests (all tests in Python modules named with "test\_" prefix or "_test" suffix will be
run), a specific test file (with any name) or even a specific class or test method, via absolute or relative paths.
You can optionally specify a specific set of parameters for tests with ``@parametrize`` or ``@matrix`` annotations::

    ducktape <relative_path_to_testdirectory>                   # e.g. ducktape dir/tests
    ducktape <relative_path_to_file>                            # e.g. ducktape dir/tests/my_test.py
    ducktape <path_to_test>[::SomeTestClass]                    # e.g. ducktape dir/tests/my_test.py::TestA
    ducktape <path_to_test>[::SomeTestClass[.test_method]]      # e.g. ducktape dir/tests/my_test.py::TestA.test_a
    ducktape <path_to_test>[::TestClass[.method[@params_json]]] # e.g. ducktape 'dir/tests/my_test.py::TestA.test_a@{"x": 100}'


Excluding Tests
===============

Pass ``--exclude`` flag to exclude certain test(s) from the run, using the same syntax::

    ducktape ./my_tests_dir --exclude ./my_tests_dir/test_a.py ./my_tests_dir/test_b.py::TestB.test_b


Test Suites
===========

Test suite is a collection of tests to run, optionally also specifying which tests to exclude. Test suites are specified
via YAML file

.. code-block:: yaml

    # list all tests that are part of the suite under the test suite name:
    my_test_suite:
        - ./my_tests_dir/  # paths are relative to the test suite file location
        - ./another_tests_dir/test_file.py::TestClass.test_method  # same syntax as passing tests directly to ducktape
        - './another_tests_dir/test_file.py::TestClass.parametrized_method@{"x": 100}'  # params are supported too
        - ./third_tests_dir/prefix_*.py  # basic globs are supported (* and ? characters)

    # each YAML file can contain one or more test suites:
    another_test_suite:
        # you can optionally specify excluded tests in the suite as well using the following syntax:
        included:
            - ./some_tests_dir/
        excluded:
            - ./some_tests_dir/*_large_test.py


Running Test Suites
===================

Tests suites are run in the same fashion as separate tests.

Run a single test suite::

    ducktape ./path/to/test_suite.yml

Run multiple test suites::

    ducktape ./path/to/test_suite_1.yml ./test_suite_2.yml

You can specify both tests and test suites at the same time::

    ducktape ./my_test.py ./my_test_suite.yml ./another_test.py::TestClass.test_method

If the same test method is effectively specified more than once, it will only be executed once.

For example, if ``test_suite.yml`` lists ``test_a.py`` then running the following command
will execute ``test_a.py`` only once::

    ducktape test_suite.yml test_a.py

If you specify a folder, all tests (ie python files) under that folder will be discovered, but test suites will be not.

For example, if ``test_dir`` contains ``my_test.py`` and ``my_test_suite.yml``, then running::

    ducktape ./test_dir

will execute ``my_test.py`` but skip ``my_test_suite.yml``.

To execute both ``my_test.py`` and ``my_test_suite.yml`` you need to specify test suite path explicitly::

    ducktape ./test_dir/ ./test_dir/my_test_suite.yml


Exclude and Test Suites
=======================

Exclude section in the test suite applies only to that test suite. ``--exclude`` parameter passed to ducktape applies
to all loaded tests and test suites.

For example, if ``test_dir`` contains ``test_a.py``, ``test_b.py`` and ``test_c.py``, and ``test_suite.yml`` is:

.. code-block:: yaml

    suite_one:
        included:
            - ./test_dir/*.py
        excluded:
            - ./test_dir/test_a.py
    suite_two:
        included:
            - ./test_dir/
        excluded:
            - ./test_dir/test_b.py

Then running::

    ducktape test_suite.yml
runs each of ``test_a.py``, ``test_b.py`` and ``test_c.py`` once


But running::

    ducktape test_suite.yml --exclude test_dir/test_a.py
runs only ``test_b.py`` and ``test_c.py`` once, and skips ``test_a.py``.


Options
=======

To see a complete listing of options run::

    ducktape --help

.. argparse::
   :module: ducktape.command_line.parse_args
   :func: create_ducktape_parser
   :prog: ducktape

Configuration File
==================

You can configure options in three locations: on the command line (highest priority), in a user configuration file in
``~/.ducktape/config``, and in a project-specific configuration ``<project_dir>/.ducktape/config`` (lowest priority).
Configuration files use the same syntax as command line arguments and may split arguments across multiple lines::

    --debug
    --exit-first
    --cluster=ducktape.cluster.json.JsonCluster

Output
======

Test results go in ``results/<session_id>.<session_id>`` which looks like ``<date>--<test_number>``. For example: ``results/2015-03-28--002``

ducktape does its best to group test results and log files in a sensible way. The output directory is
structured like so::

    <session_id>
        session_log.info
        session_log.debug
        report.txt   # Summary report of all tests run in this session
        report.html  # Open this to see summary report in a browser
        report.css

        <test_class_name>
            <test_method_name>
                test_log.info
                test_log.debug
                report.txt   # Report on this single test
                [data.json]  # Present if the test returns data

                <service_1>
                    <node_1>
                        some_logs
                    <node_2>
                        some_logs
        ...


To see an example of the output structure, go `here`_ and click on one of the details links.

.. _here: http://testing.confluent.io/confluent-kafka-system-test-results/


================================================
FILE: docs/test_clusters.rst
================================================
.. _topics-test_clusters:

===================
Test Clusters
===================

Ducktape runs on a test cluster with several nodes.  Ducktape will take ownership of the nodes and handle starting, stopping, and running services on them.

Many test environments are possible.  The nodes may be local nodes, running inside Docker.  Or they could be virtual machines running on a public cloud.

Cluster Specifications
======================

A cluster specification-- also called a ClusterSpec-- describes a particular
cluster configuration.  Originally, cluster specifications could only express the
number of nodes of each operating system. Now, with heterogeneous cluster support,
specifications can also include node types (e.g., "small", "large") for more
fine-grained resource allocation. See `Heterogeneous Clusters`_ for more details.

Cluster specifications give us a vocabulary to express what a particular
service or test needs to run.  For example, a service might require a cluster
with three Linux nodes and one Windows node.  We could express that with a
ClusterSpec containing three Linux NodeSpec objects and one Windows NodeSpec
object.

Heterogeneous Clusters
======================

Ducktape supports heterogeneous clusters where nodes can have different types
(e.g., "small", "large", "arm64"). This allows tests to request specific node
types while maintaining backward compatibility with existing tests.

Using Node Types in Tests
-------------------------

Use the ``@cluster`` decorator with ``node_type`` to request specific node types::

    from ducktape.mark.resource import cluster

    @cluster(num_nodes=3, node_type="large")
    def test_with_large_nodes(self):
        # This test requires 3 large nodes
        pass

    @cluster(num_nodes=5)
    def test_any_nodes(self):
        # This test accepts any 5 nodes (backward compatible)
        pass

Cluster Configuration
---------------------

Node types are defined in your ``cluster.json`` file::

    {
      "nodes": [
        {
          "ssh_config": {"host": "worker1", ...},
          "node_type": "small"
        },
        {
          "ssh_config": {"host": "worker2", ...},
          "node_type": "large"
        }
      ]
    }

Backward Compatibility
----------------------

- Tests without ``node_type`` will match **any available node**
- Existing tests and cluster configurations continue to work unchanged
- Node type is optional in both test annotations and cluster configuration


================================================
FILE: ducktape/__init__.py
================================================
__version__ = "0.14.0"


================================================
FILE: ducktape/__main__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.command_line import main

if __name__ == "__main__":
    main.main()


================================================
FILE: ducktape/cluster/__init__.py
================================================
from .json import JsonCluster  # NOQA
from .localhost import LocalhostCluster  # NOQA
from .vagrant import VagrantCluster  # NOQA


================================================
FILE: ducktape/cluster/cluster.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
from typing import Iterable, List, Union

from ducktape.cluster.cluster_node import ClusterNode
from ducktape.cluster.cluster_spec import ClusterSpec


class Cluster(object):
    """Interface for a cluster -- a collection of nodes with login credentials.
    This interface doesn't define any mapping of roles/services to nodes. It only interacts with some underlying
    system that can describe available resources and mediates reservations of those resources.
    """

    def __init__(self):
        self.max_used_nodes = 0

    def __len__(self) -> int:
        """Size of this cluster object. I.e. number of 'nodes' in the cluster."""
        return self.available().size() + self.used().size()

    def alloc(self, cluster_spec) -> Union[ClusterNode, List[ClusterNode], "Cluster"]:
        """
        Allocate some nodes.

        :param cluster_spec:                    A ClusterSpec describing the nodes to be allocated.
        :throws InsufficientResources:          If the nodes cannot be allocated.
        :return:                                Allocated nodes spec
        """
        allocated = self.do_alloc(cluster_spec)
        self.max_used_nodes = max(self.max_used_nodes, len(self.used()))
        return allocated

    def do_alloc(self, cluster_spec) -> Union[ClusterNode, List[ClusterNode], "Cluster"]:
        """
        Subclasses should implement actual allocation here.

        :param cluster_spec:                    A ClusterSpec describing the nodes to be allocated.
        :throws InsufficientResources:          If the nodes cannot be allocated.
        :return:                                Allocated nodes spec
        """
        raise NotImplementedError

    def free(self, nodes: Union[Iterable[ClusterNode], ClusterNode]) -> None:
        """Free the given node or list of nodes"""
        if isinstance(nodes, collections.abc.Iterable):
            for s in nodes:
                self.free_single(s)
        else:
            self.free_single(nodes)

    def free_single(self, node: ClusterNode) -> None:
        raise NotImplementedError()

    def __eq__(self, other):
        return other is not None and self.__dict__ == other.__dict__

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))

    def num_available_nodes(self) -> int:
        return self.available().size()

    def available(self) -> ClusterSpec:
        """
        Return a ClusterSpec object describing the currently available nodes.
        """
        raise NotImplementedError

    def used(self) -> ClusterSpec:
        """
        Return a ClusterSpec object describing the currently in use nodes.
        """
        raise NotImplementedError

    def max_used(self) -> int:
        return self.max_used_nodes

    def all(self):
        """
        Return a ClusterSpec object describing all nodes.
        """
        return self.available().clone().add(self.used())


================================================
FILE: ducktape/cluster/cluster_node.py
================================================
from typing import Optional

from ducktape.cluster.remoteaccount import RemoteAccount


class ClusterNode(object):
    def __init__(self, account: RemoteAccount, **kwargs):
        self.account = account
        for k, v in kwargs.items():
            setattr(self, k, v)

    @property
    def name(self) -> Optional[str]:
        return self.account.hostname

    @property
    def operating_system(self) -> Optional[str]:
        return self.account.operating_system


================================================
FILE: ducktape/cluster/cluster_spec.py
================================================
# Copyright 2017 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import json
import typing

from ducktape.cluster.node_container import NodeContainer

from .consts import LINUX
from .node_spec import NodeSpec


class ClusterSpec(object):
    """
    The specification for a ducktape cluster.
    """

    nodes: typing.Optional[NodeContainer] = None

    @staticmethod
    def empty():
        return ClusterSpec([])

    @staticmethod
    def simple_linux(num_nodes, node_type=None):
        """
        Create a ClusterSpec for Linux nodes, optionally of a specific type.

        Examples:
            ClusterSpec.simple_linux(5)              # 5 nodes, any type
            ClusterSpec.simple_linux(3, "large")     # 3 large nodes

        :param num_nodes: Number of Linux nodes
        :param node_type: Optional node type label (e.g., "large", "small")
        """
        node_specs = [NodeSpec(LINUX, node_type)] * num_nodes
        return ClusterSpec(node_specs)

    @staticmethod
    def from_nodes(nodes):
        """
        Create a ClusterSpec describing a list of nodes.
        """
        return ClusterSpec([NodeSpec(node.operating_system, getattr(node, "node_type", None)) for node in nodes])

    def __init__(self, nodes=None):
        """
        Initialize the ClusterSpec.

        :param nodes:           A collection of NodeSpecs, or None to create an empty cluster spec.
        """
        self.nodes = NodeContainer(nodes)

    def __len__(self):
        return self.size()

    def __iter__(self):
        return self.nodes.elements()

    def size(self):
        """Return the total size of this cluster spec, including all types of nodes."""
        return self.nodes.size()

    def add(self, other):
        """
        Add another ClusterSpec to this one.

        :param node_spec:       The other cluster spec.  This will not be modified.
        :return:                This ClusterSpec.
        """
        for node_spec in other.nodes:
            self.nodes.add_node(node_spec)
        return self

    def clone(self):
        """
        Returns a deep copy of this object.
        """
        return ClusterSpec(self.nodes.clone())

    def __str__(self):
        node_spec_to_num = {}
        for node_spec in self.nodes.elements():
            node_spec_str = str(node_spec)
            node_spec_to_num[node_spec_str] = node_spec_to_num.get(node_spec_str, 0) + 1
        rval = []
        for node_spec_str in sorted(node_spec_to_num.keys()):
            node_spec = json.loads(node_spec_str)
            node_spec["num_nodes"] = node_spec_to_num[node_spec_str]
            rval.append(node_spec)
        return json.dumps(rval, sort_keys=True)


================================================
FILE: ducktape/cluster/consts.py
================================================
# Copyright 2017 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

LINUX = "linux"

WINDOWS = "windows"

SUPPORTED_OS_TYPES = [LINUX, WINDOWS]


================================================
FILE: ducktape/cluster/finite_subcluster.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing

from ducktape.cluster.cluster import Cluster, ClusterNode
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.node_container import NodeContainer


class FiniteSubcluster(Cluster):
    """This cluster class gives us a mechanism for allocating finite blocks of nodes from another cluster."""

    def __init__(self, nodes: typing.Iterable[ClusterNode]):
        super(FiniteSubcluster, self).__init__()
        self.nodes = nodes
        self._available_nodes = NodeContainer(nodes)
        self._in_use_nodes = NodeContainer()

    def do_alloc(self, cluster_spec) -> typing.List[ClusterNode]:
        # there cannot be any bad nodes here,
        # since FiniteSubcluster operates on ClusterNode objects,
        # which are not checked for health by NodeContainer.remove_spec
        # however there could be an error, specifically if a test decides to alloc more nodes than are available
        # in a previous ducktape version this exception was raised by remove_spec
        # in this one, for consistency, we let the cluster itself deal with allocation errors
        good_nodes, bad_nodes = self._available_nodes.remove_spec(cluster_spec)
        self._in_use_nodes.add_nodes(good_nodes)
        return good_nodes

    def free_single(self, node):
        self._in_use_nodes.remove_node(node)
        self._available_nodes.add_node(node)

    def available(self):
        return ClusterSpec.from_nodes(self._available_nodes)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)


================================================
FILE: ducktape/cluster/json.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import json
import os
import traceback

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.consts import WINDOWS
from ducktape.cluster.linux_remoteaccount import LinuxRemoteAccount
from ducktape.cluster.node_container import InsufficientHealthyNodesError, NodeContainer
from ducktape.cluster.windows_remoteaccount import WindowsRemoteAccount
from ducktape.command_line.defaults import ConsoleDefaults

from .cluster import Cluster
from .cluster_node import ClusterNode
from .remoteaccount import RemoteAccountSSHConfig


def make_remote_account(ssh_config, *args, **kwargs):
    """Factory function for creating the correct RemoteAccount implementation."""

    if ssh_config.host and WINDOWS in ssh_config.host:
        return WindowsRemoteAccount(ssh_config, *args, **kwargs)
    else:
        return LinuxRemoteAccount(ssh_config, *args, **kwargs)


class JsonCluster(Cluster):
    """An implementation of Cluster that uses static settings specified in a cluster file or json-serializeable dict"""

    def __init__(
        self,
        cluster_json=None,
        *args,
        make_remote_account_func=make_remote_account,
        **kwargs,
    ):
        """Initialize JsonCluster

        JsonCluster can be initialized from:
            - a json-serializeable dict
            - a "cluster_file" containing json

        :param cluster_json: a json-serializeable dict containing node information. If ``cluster_json`` is None,
               load from file
        :param cluster_file (optional): Overrides the default location of the json cluster file

        Example json with a local Vagrant cluster::

            {
              "nodes": [
                {
                  "externally_routable_ip": "192.168.50.151",

                  "ssh_config": {
                    "host": "worker1",
                    "hostname": "127.0.0.1",
                    "identityfile": "/path/to/private_key",
                    "password": null,
                    "port": 2222,
                    "user": "vagrant"
                  }
                },
                {
                  "externally_routable_ip": "192.168.50.151",

                  "ssh_config": {
                    "host": "worker2",
                    "hostname": "127.0.0.1",
                    "identityfile": "/path/to/private_key",
                    "password": null,
                    "port": 2223,
                    "user": "vagrant"
                  }
                }
              ]
            }

        """
        super(JsonCluster, self).__init__()
        self._available_accounts: NodeContainer = NodeContainer()
        self._bad_accounts: NodeContainer = NodeContainer()
        self._in_use_nodes: NodeContainer = NodeContainer()
        if cluster_json is None:
            # This is a directly instantiation of JsonCluster rather than from a subclass (e.g. VagrantCluster)
            cluster_file = kwargs.get("cluster_file")
            if cluster_file is None:
                cluster_file = ConsoleDefaults.CLUSTER_FILE
            cluster_json = json.load(open(os.path.abspath(cluster_file)))
        try:
            for ninfo in cluster_json["nodes"]:
                ssh_config_dict = ninfo.get("ssh_config")
                assert ssh_config_dict is not None, (
                    "Cluster json has a node without a ssh_config field: %s\n Cluster json: %s" % (ninfo, cluster_json)
                )

                ssh_config = RemoteAccountSSHConfig(**ninfo.get("ssh_config", {}))

                # Extract node_type from JSON (optional field)
                node_type = ninfo.get("node_type")

                remote_account = make_remote_account_func(
                    ssh_config,
                    ninfo.get("externally_routable_ip"),
                    node_type=node_type,
                    ssh_exception_checks=kwargs.get("ssh_exception_checks"),
                )
                if remote_account.externally_routable_ip is None:
                    remote_account.externally_routable_ip = self._externally_routable_ip(remote_account)
                self._available_accounts.add_node(remote_account)
        except BaseException as e:
            msg = "JSON cluster definition invalid: %s: %s" % (
                e,
                traceback.format_exc(limit=16),
            )
            raise ValueError(msg)
        self._id_supplier = 0

    def do_alloc(self, cluster_spec):
        try:
            good_nodes, bad_nodes = self._available_accounts.remove_spec(cluster_spec)
        except InsufficientHealthyNodesError as e:
            self._bad_accounts.add_nodes(e.bad_nodes)
            raise e

        # even in case of no exceptions, we can still run into bad nodes, so let's track them
        if bad_nodes:
            self._bad_accounts.add_nodes(bad_nodes)

        # now let's gather all the good ones and convert them into ClusterNode objects
        allocated_nodes = []
        for account in good_nodes:
            allocated_nodes.append(ClusterNode(account, slot_id=self._id_supplier))
            self._id_supplier += 1
        self._in_use_nodes.add_nodes(allocated_nodes)

        return allocated_nodes

    def free_single(self, node):
        self._in_use_nodes.remove_node(node)
        self._available_accounts.add_node(node.account)
        node.account.close()

    def _externally_routable_ip(self, account):
        return None

    def available(self):
        return ClusterSpec.from_nodes(self._available_accounts)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)


================================================
FILE: ducktape/cluster/linux_remoteaccount.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from paramiko import SFTPClient, SSHClient

from ducktape.cluster.consts import LINUX
from ducktape.cluster.remoteaccount import RemoteAccount, RemoteAccountError


class LinuxRemoteAccount(RemoteAccount):
    def __init__(self, *args, **kwargs):
        super(LinuxRemoteAccount, self).__init__(*args, **kwargs)
        self._ssh_client: Optional[SSHClient] = None
        self._sftp_client: Optional[SFTPClient] = None
        self.os = LINUX

    @property
    def local(self):
        """Returns True if this 'remote' account is probably local.
        This is an imperfect heuristic, but should work for simple local testing."""
        return self.hostname == "localhost" and self.user is None and self.ssh_config is None

    def get_network_devices(self):
        """
        Utility to get all network devices on a linux account
        """
        return [device for device in self.sftp_client.listdir("/sys/class/net")]

    def get_external_accessible_network_devices(self):
        """
        gets the subset of devices accessible through an external conenction
        """
        return [
            device
            for device in self.get_network_devices()
            if device != "lo"  # do not include local device
            and (device.startswith("en") or device.startswith("eth"))  # filter out other devices; "en" means ethernet
            # eth0 can also sometimes happen, see https://unix.stackexchange.com/q/134483
        ]

    # deprecated, please use the self.externally_routable_ip that is set in your cluster,
    # not explicitly deprecating it as it's used by vagrant cluster
    def fetch_externally_routable_ip(self, is_aws=None):
        if is_aws is not None:
            self.logger.warning("fetch_externally_routable_ip: is_aws is a deprecated flag, and does nothing")

        devices = self.get_external_accessible_network_devices()

        self.logger.debug("found devices: {}".format(devices))

        if not devices:
            raise RemoteAccountError(self, "Couldn't find any network devices")

        fmt_cmd = (
            "/sbin/ifconfig {device} | " "grep 'inet ' | " "tail -n 1 | " r"egrep -o '[0-9\.]+' | " "head -n 1 2>&1"
        )

        ips = ["".join(self.ssh_capture(fmt_cmd.format(device=device))).strip() for device in devices]
        self.logger.debug("found ips: {}".format(ips))
        self.logger.debug("returning the first ip found")
        return next(iter(ips))


================================================
FILE: ducktape/cluster/localhost.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.node_container import NodeContainer

from .cluster import Cluster
from .cluster_node import ClusterNode
from .linux_remoteaccount import LinuxRemoteAccount
from .remoteaccount import RemoteAccountSSHConfig


class LocalhostCluster(Cluster):
    """
    A "cluster" that runs entirely on localhost using default credentials. This doesn't require any user
    configuration and is equivalent to the old defaults in cluster_config.json. There are no constraints
    on the resources available.
    """

    def __init__(self, *args, **kwargs):
        super(LocalhostCluster, self).__init__()
        num_nodes = kwargs.get("num_nodes", 1000)
        self._available_nodes = NodeContainer()
        for i in range(num_nodes):
            ssh_config = RemoteAccountSSHConfig("localhost%d" % i, hostname="localhost", port=22)
            self._available_nodes.add_node(
                ClusterNode(
                    LinuxRemoteAccount(
                        ssh_config,
                        ssh_exception_checks=kwargs.get("ssh_exception_checks"),
                    )
                )
            )
        self._in_use_nodes = NodeContainer()

    def do_alloc(self, cluster_spec):
        # there shouldn't be any bad nodes in localhost cluster
        # since ClusterNode object does not implement `available()` method
        good_nodes, bad_nodes = self._available_nodes.remove_spec(cluster_spec)
        self._in_use_nodes.add_nodes(good_nodes)
        return good_nodes

    def free_single(self, node):
        self._in_use_nodes.remove_node(node)
        self._available_nodes.add_node(node)
        node.account.close()

    def available(self):
        return ClusterSpec.from_nodes(self._available_nodes)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)


================================================
FILE: ducktape/cluster/node_container.py
================================================
# Copyright 2017 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)

from ducktape.cluster.cluster_node import ClusterNode
from ducktape.cluster.remoteaccount import RemoteAccount

if TYPE_CHECKING:
    from ducktape.cluster.cluster_spec import ClusterSpec

NodeType = Union[ClusterNode, RemoteAccount]
# Key for node grouping: (operating_system, node_type)
NodeGroupKey = Tuple[Optional[str], Optional[str]]


class NodeNotPresentError(Exception):
    pass


class InsufficientResourcesError(Exception):
    pass


class InsufficientHealthyNodesError(InsufficientResourcesError):
    def __init__(self, bad_nodes: List, *args):
        self.bad_nodes = bad_nodes
        super().__init__(*args)


def _get_node_key(node: NodeType) -> NodeGroupKey:
    """Extract the (os, node_type) key from a node."""
    os = getattr(node, "operating_system", None)
    node_type = getattr(node, "node_type", None)
    return (os, node_type)


class NodeContainer(object):
    """
    Container for cluster nodes, grouped by (operating_system, node_type).

    This enables efficient lookup and allocation of nodes matching specific
    requirements. Nodes with node_type=None are grouped under (os, None) and
    can match any request when no specific type is required.
    """

    # Key: (os, node_type) tuple, Value: list of nodes
    node_groups: Dict[NodeGroupKey, List[NodeType]]

    def __init__(self, nodes: Optional[Iterable[NodeType]] = None) -> None:
        """
        Create a NodeContainer with the given nodes.

        Node objects should implement at least an operating_system property,
        and optionally a node_type property.

        :param nodes:           A collection of node objects to add, or None to add nothing.
        """
        self.node_groups = {}
        if nodes is not None:
            for node in nodes:
                self.add_node(node)

    def size(self) -> int:
        """
        Returns the total number of nodes in the container.
        """
        return sum([len(val) for val in self.node_groups.values()])

    def __len__(self):
        return self.size()

    def __iter__(self) -> Iterator[Any]:
        return self.elements()

    def elements(self, operating_system: Optional[str] = None, node_type: Optional[str] = None) -> Iterator[NodeType]:
        """
        Yield the elements in this container.

        :param operating_system:    If this is non-None, we will iterate only over elements
                                    which have this operating system.
        :param node_type:           If this is non-None, we will iterate only over elements
                                    which have this node type.
        """
        for (os, nt), node_list in self.node_groups.items():
            # Filter by OS if specified
            if operating_system is not None and os != operating_system:
                continue
            # Filter by node_type if specified
            if node_type is not None and nt != node_type:
                continue
            for node in node_list:
                yield node

    def grouped_by_os_and_type(self) -> Dict[Tuple[Optional[str], Optional[str]], int]:
        """
        Returns nodes grouped by (operating_system, node_type) with counts.

        This is a pure data method that groups nodes without any ordering.
        The caller is responsible for determining processing order.

        :return: Dictionary mapping (os, node_type) tuples to counts
        """
        result: Dict[Tuple[Optional[str], Optional[str]], int] = {}
        for node in self.elements():
            key = (getattr(node, "operating_system", None), getattr(node, "node_type", None))
            result[key] = result.get(key, 0) + 1
        return result

    def add_node(self, node: Union[ClusterNode, RemoteAccount]) -> None:
        """
        Add a node to this collection, grouping by (os, node_type).

        :param node:                        The node to add.
        """
        key = _get_node_key(node)
        self.node_groups.setdefault(key, []).append(node)

    def add_nodes(self, nodes):
        """
        Add a collection of nodes to this collection.

        :param nodes:                       The nodes to add.
        """
        for node in nodes:
            self.add_node(node)

    def remove_node(self, node):
        """
        Removes a node from this collection.

        :param node:                        The node to remove.
        :returns:                           The node which has been removed.
        :throws NodeNotPresentError:        If the node is not in the collection.
        """
        key = _get_node_key(node)
        try:
            return self.node_groups.get(key, []).remove(node)
        except ValueError:
            raise NodeNotPresentError

    def remove_nodes(self, nodes):
        """
        Remove a collection of nodes from this collection.

        :param nodes:                       The nodes to remove.
        """
        for node in nodes:
            self.remove_node(node)

    def _find_matching_nodes(
        self, required_os: str, required_node_type: Optional[str], num_needed: int
    ) -> Tuple[List[NodeType], List[NodeType], int]:
        """
        Find nodes that match the required OS and node_type.

        Matching rules:
            - OS must match exactly
            - If required_node_type is None, match nodes of ANY type for this OS
            - If required_node_type is specified, match only nodes with that exact type

        :param required_os: The required operating system
        :param required_node_type: The required node type (None means any)
        :param num_needed: Number of nodes needed
        :return: Tuple of (good_nodes, bad_nodes, shortfall) where shortfall is how many more we need
        """
        good_nodes: List[NodeType] = []
        bad_nodes: List[NodeType] = []

        # Collect candidate keys - keys in node_groups that can satisfy this requirement
        candidate_keys: List[NodeGroupKey] = []
        for os, nt in self.node_groups.keys():
            if os != required_os:
                continue
            # If no specific type required, any node of this OS matches
            # If specific type required, only exact match
            if required_node_type is None or nt == required_node_type:
                candidate_keys.append((os, nt))

        # Try to allocate from candidate pools
        for key in candidate_keys:
            if len(good_nodes) >= num_needed:
                break

            avail_nodes = self.node_groups.get(key, [])
            while avail_nodes and len(good_nodes) < num_needed:
                node = avail_nodes.pop(0)
                if isinstance(node, RemoteAccount):
                    if node.available():
                        good_nodes.append(node)
                    else:
                        bad_nodes.append(node)
                else:
                    good_nodes.append(node)

        shortfall = max(0, num_needed - len(good_nodes))
        return good_nodes, bad_nodes, shortfall

    def remove_spec(self, cluster_spec: ClusterSpec) -> Tuple[List[NodeType], List[NodeType]]:
        """
        Remove nodes matching a ClusterSpec from this NodeContainer.

        Allocation strategy:
            - Specific node_type requirements are allocated BEFORE any-type (None) requirements
            - For each (os, node_type) in the spec:
                - If node_type is specified, allocate from that exact pool
                - If node_type is None, allocate from any pool matching the OS

        :param cluster_spec:                    The cluster spec.  This will not be modified.
        :returns:                               Tuple of (good_nodes, bad_nodes).
        :raises:                                InsufficientResourcesError when there aren't enough total nodes
                                                InsufficientHealthyNodesError when there aren't enough healthy nodes
        """
        err = self.attempt_remove_spec(cluster_spec)
        if err:
            raise InsufficientResourcesError(err)

        good_nodes: List[NodeType] = []
        bad_nodes: List[NodeType] = []
        msg = ""

        # Get requirements grouped by (os, node_type) with counts
        grouped_counts = cluster_spec.nodes.grouped_by_os_and_type()

        # Sort so specific types (node_type != None) are allocated before any-type (node_type == None)
        # This prevents any-type requests from "stealing" nodes needed by specific types
        def allocation_order(item: Tuple[Tuple[str, Optional[str]], int]) -> Tuple[int, str, str]:
            (os, node_type), _ = item
            # Specific types first (0), any-type last (1)
            type_order = 1 if node_type is None else 0
            return (type_order, os or "", node_type or "")

        sorted_requirements = sorted(grouped_counts.items(), key=allocation_order)

        for (os, node_type), num_needed in sorted_requirements:
            found_good, found_bad, shortfall = self._find_matching_nodes(os, node_type, num_needed)

            good_nodes.extend(found_good)
            bad_nodes.extend(found_bad)

            if shortfall > 0:
                type_desc = f"{os}" if node_type is None else f"{os}/{node_type}"
                msg += f"{type_desc} nodes requested: {num_needed}. Healthy nodes available: {len(found_good)}. "

        if msg:
            # Return good nodes back to the container
            for node in good_nodes:
                self.add_node(node)
            raise InsufficientHealthyNodesError(bad_nodes, msg)

        return good_nodes, bad_nodes

    def can_remove_spec(self, cluster_spec: ClusterSpec) -> bool:
        """
        Determine if we can remove nodes matching a ClusterSpec from this NodeContainer.
        This container will not be modified.

        :param cluster_spec:                    The cluster spec.  This will not be modified.
        :returns:                               True if we could remove the nodes; false otherwise
        """
        msg = self.attempt_remove_spec(cluster_spec)
        return len(msg) == 0

    def _count_nodes_by_os(self, target_os: str) -> int:
        """
        Count total nodes available for a given OS (regardless of node_type).

        :param target_os: The operating system to count nodes for
        :return: Total number of nodes with the given OS
        """
        count = 0
        for (os, _), nodes in self.node_groups.items():
            if os == target_os:
                count += len(nodes)
        return count

    def _count_nodes_by_os_and_type(self, target_os: str, target_type: str) -> int:
        """
        Count nodes available for a specific (os, node_type) combination.

        :param target_os: The operating system
        :param target_type: The specific node type (not None)
        :return: Number of nodes matching both OS and type
        """
        return len(self.node_groups.get((target_os, target_type), []))

    def attempt_remove_spec(self, cluster_spec: ClusterSpec) -> str:
        """
        Attempt to remove a cluster_spec from this node container.

        Uses holistic per-OS validation to correctly handle mixed typed and untyped
        requirements without double-counting shared capacity.

        Validation strategy:
            1. Check total OS capacity >= total OS demand
            2. Check each specific type has enough nodes
            3. Check remaining capacity (after specific types) >= any-type demand

        :param cluster_spec:                    The cluster spec.  This will not be modified.
        :returns:                               An empty string if we can remove the nodes;
                                                an error string otherwise.
        """
        # if cluster_spec is None this means the test cannot be run at all
        # e.g. users didn't specify `@cluster` annotation on it but the session context has a flag to fail
        # on such tests or any other state where the test deems its cluster spec incorrect.
        if cluster_spec is None:
            return "Invalid or missing cluster spec"
        # cluster spec may be empty and that's ok, shortcut to returning no error messages
        elif len(cluster_spec) == 0 or cluster_spec.nodes is None:
            return ""

        msg = ""

        # Build requirements_by_os: {os -> {node_type -> count}} in a single pass
        requirements_by_os: Dict[str, Dict[Optional[str], int]] = {}
        for node_spec in cluster_spec.nodes.elements():
            os = node_spec.operating_system
            node_type = node_spec.node_type
            requirements_by_os.setdefault(os, {})
            requirements_by_os[os][node_type] = requirements_by_os[os].get(node_type, 0) + 1

        # Validate each OS holistically
        for os, type_requirements in requirements_by_os.items():
            total_available = self._count_nodes_by_os(os)
            total_required = sum(type_requirements.values())

            # Check 1: Total capacity for this OS
            if total_available < total_required:
                msg += f"{os} nodes requested: {total_required}. {os} nodes available: {total_available}. "
                continue  # Already failed, no need for detailed checks

            # Check 2: Each specific type has enough nodes
            for node_type, count_needed in type_requirements.items():
                if node_type is None:
                    continue  # Handle any-type separately
                type_available = self._count_nodes_by_os_and_type(os, node_type)
                if type_available < count_needed:
                    msg += f"{os}/{node_type} nodes requested: {count_needed}. {os}/{node_type} nodes available: {type_available}. "

            # Check 3: After reserving specific types, is there capacity for any-type?
            any_type_demand = type_requirements.get(None, 0)
            if any_type_demand > 0:
                specific_demand = sum(c for t, c in type_requirements.items() if t is not None)
                remaining_capacity = total_available - specific_demand
                if remaining_capacity < any_type_demand:
                    msg += (
                        f"{os} (any type) nodes requested: {any_type_demand}. "
                        f"{os} nodes remaining after typed allocations: {remaining_capacity}. "
                    )

        return msg

    def clone(self) -> "NodeContainer":
        """
        Returns a deep copy of this object.
        """
        container = NodeContainer()
        for key, nodes in self.node_groups.items():
            for node in nodes:
                container.node_groups.setdefault(key, []).append(node)
        return container


================================================
FILE: ducktape/cluster/node_spec.py
================================================
# Copyright 2017 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import json
from typing import Optional

from .consts import LINUX, SUPPORTED_OS_TYPES


class NodeSpec(object):
    """
    Specification for a single cluster node.

    The node_type field is a generic label that can represent size, architecture,
    or any classification scheme defined by the cluster configuration. When None,
    it matches any available node (backward compatible behavior).

    :param operating_system:    The operating system of the node.
    :param node_type:           Node type label (e.g., "large", "small"). None means "match any".
    """

    def __init__(self, operating_system: str = LINUX, node_type: Optional[str] = None):
        self.operating_system = operating_system
        self.node_type = node_type
        if self.operating_system not in SUPPORTED_OS_TYPES:
            raise RuntimeError("Unsupported os type %s" % self.operating_system)

    def matches(self, available_node_spec: "NodeSpec") -> bool:
        """
        Check if this requirement can be satisfied by an available node.

        Matching rules:
            - OS must match exactly
            - If requested node_type is None, match any type
            - If requested node_type is specified, must match exactly

        :param available_node_spec: The specification of an available node
        :return: True if this requirement matches the available node
        """
        if self.operating_system != available_node_spec.operating_system:
            return False
        if self.node_type is None:
            return True  # Requestor doesn't care about type
        return self.node_type == available_node_spec.node_type

    def __str__(self):
        d = {"os": self.operating_system}
        if self.node_type is not None:
            d["node_type"] = self.node_type
        return json.dumps(d, sort_keys=True)

    def __eq__(self, other):
        if not isinstance(other, NodeSpec):
            return False
        return self.operating_system == other.operating_system and self.node_type == other.node_type

    def __hash__(self):
        return hash((self.operating_system, self.node_type))


================================================
FILE: ducktape/cluster/remoteaccount.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import shutil
import signal
import socket
import stat
import tempfile
import warnings
from contextlib import contextmanager
from typing import Callable, List, Optional, Union

from paramiko import MissingHostKeyPolicy, SFTPClient, SSHClient, SSHConfig
from paramiko.ssh_exception import NoValidConnectionsError, SSHException

from ducktape.errors import DucktapeError
from ducktape.utils.http_utils import HttpMixin
from ducktape.utils.util import wait_until


def check_ssh(method: Callable) -> Callable:
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        except (SSHException, NoValidConnectionsError, socket.error) as e:
            if self._custom_ssh_exception_checks:
                self._log(logging.DEBUG, "caught ssh error", exc_info=True)
                self._log(logging.DEBUG, "starting ssh checks:")
                self._log(
                    logging.DEBUG,
                    "\n".join(repr(f) for f in self._custom_ssh_exception_checks),
                )
                for func in self._custom_ssh_exception_checks:
                    func(e, self)
            raise

    return wrapper


class RemoteAccountSSHConfig(object):
    def __init__(
        self,
        host: Optional[str] = None,
        hostname: Optional[str] = None,
        user: Optional[str] = None,
        port: Optional[int] = None,
        password: Optional[str] = None,
        identityfile: Optional[str] = None,
        connecttimeout: Optional[Union[int, float]] = None,
        **kwargs,
    ) -> None:
        """Wrapper for ssh configs used by ducktape to connect to remote machines.

        The fields in this class are lowercase versions of a small selection of ssh config properties
        (see man page: "man ssh_config")
        """
        self.host = host
        self.hostname: str = hostname or "localhost"
        self.user = user
        self.port = port or 22
        self.port = int(self.port)
        self.password = password
        self.identityfile = identityfile
        # None is default, and it means default TCP timeout will be used.
        self.connecttimeout = int(connecttimeout) if connecttimeout is not None else None

    @staticmethod
    def from_string(config_str):
        """Construct RemoteAccountSSHConfig object from a string that looks like

        Host the-host
            Hostname the-hostname
            Port 22
            User ubuntu
            IdentityFile /path/to/key
        """
        config = SSHConfig()
        config.parse(config_str.split("\n"))

        hostnames = config.get_hostnames()
        if "*" in hostnames:
            hostnames.remove("*")
        assert len(hostnames) == 1, "Expected hostnames to have single entry: %s" % hostnames
        host = hostnames.pop()

        config_dict = config.lookup(host)
        if config_dict.get("identityfile") is not None:
            # paramiko.SSHConfig parses this in as a list, but we only want a single string
            config_dict["identityfile"] = config_dict["identityfile"][0]

        return RemoteAccountSSHConfig(host, **config_dict)

    def to_json(self):
        return self.__dict__

    def __repr__(self):
        return str(self.to_json())

    def __eq__(self, other):
        return other and other.__dict__ == self.__dict__

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))


class RemoteAccountError(DucktapeError):
    """This exception is raised when an attempted action on a remote node fails."""

    def __init__(self, account, msg):
        self.account_str = str(account)
        self.msg = msg

    def __str__(self):
        return "%s: %s" % (self.account_str, self.msg)


class RemoteCommandError(RemoteAccountError):
    """This exception is raised when a process run by ssh*() returns a non-zero exit status."""

    def __init__(self, account, cmd, exit_status, msg):
        self.account_str = str(account)
        self.exit_status = exit_status
        self.cmd = cmd
        self.msg = msg

    def __str__(self):
        msg = "%s: Command '%s' returned non-zero exit status %d." % (
            self.account_str,
            self.cmd,
            self.exit_status,
        )
        if self.msg:
            msg += " Remote error message: %s" % self.msg
        return msg


class RemoteAccount(HttpMixin):
    """RemoteAccount is the heart of interaction with cluster nodes,
    and every allocated cluster node has a reference to an instance of RemoteAccount.

    It wraps metadata such as ssh configs, and provides methods for file system manipulation and shell commands.

    Each operating system has its own RemoteAccount implementation.

    The node_type attribute stores the classification label from the cluster
    configuration, enabling type-aware node allocation.
    """

    def __init__(
        self,
        ssh_config: RemoteAccountSSHConfig,
        externally_routable_ip: Optional[str] = None,
        node_type: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
        ssh_exception_checks: List[Callable] = [],
    ) -> None:
        # Instance of RemoteAccountSSHConfig - use this instead of a dict, because we need the entire object to
        # be hashable
        self.ssh_config = ssh_config

        # We don't want to rely on the hostname (e.g. 'worker1') having been added to the driver host's /etc/hosts file.
        # But that means we need to distinguish between the hostname and the value of hostname we use for SSH commands.
        # We try to satisfy all use cases and keep things simple by
        #   a) storing the hostname the user probably expects (the "Host" value in .ssh/config)
        #   b) saving the real value we use for running the SSH command
        self.hostname = ssh_config.host
        self.ssh_hostname = ssh_config.hostname

        self.user = ssh_config.user
        self.externally_routable_ip = externally_routable_ip
        self.node_type = node_type  # Node type label (e.g., "large", "small")
        self._logger = logger
        self.os: Optional[str] = None
        self._ssh_client: Optional[SSHClient] = None
        self._sftp_client: Optional[SFTPClient] = None
        self._custom_ssh_exception_checks = ssh_exception_checks

    @property
    def operating_system(self) -> Optional[str]:
        return self.os

    @property
    def logger(self):
        if self._logger:
            return self._logger
        else:
            return logging.getLogger(__name__)

    @logger.setter
    def logger(self, logger):
        self._logger = logger

    def _log(self, level, msg, *args, **kwargs):
        msg = "%s: %s" % (str(self), msg)
        self.logger.log(level, msg, *args, **kwargs)

    @check_ssh
    def _set_ssh_client(self):
        client = SSHClient()
        client.set_missing_host_key_policy(IgnoreMissingHostKeyPolicy())

        self._log(logging.DEBUG, "ssh_config: %s" % str(self.ssh_config))

        client.connect(
            hostname=self.ssh_config.hostname,
            port=self.ssh_config.port,
            username=self.ssh_config.user,
            password=self.ssh_config.password,
            key_filename=self.ssh_config.identityfile,
            look_for_keys=False,
            timeout=self.ssh_config.connecttimeout,
        )

        if self._ssh_client:
            self._ssh_client.close()
        self._ssh_client = client
        self._set_sftp_client()

    @property
    def ssh_client(self):
        if self._ssh_client and self._ssh_client.get_transport() and self._ssh_client.get_transport().is_active():
            try:
                transport = self._ssh_client.get_transport()
                transport.send_ignore()
            except Exception as e:
                self._log(
                    logging.DEBUG,
                    "exception getting ssh_client (creating new client): %s" % str(e),
                )
                self._set_ssh_client()
        else:
            self._set_ssh_client()

        return self._ssh_client

    def _set_sftp_client(self):
        if self._sftp_client:
            self._sftp_client.close()
        self._sftp_client = self.ssh_client.open_sftp()

    @property
    def sftp_client(self):
        if not self._sftp_client:
            self._set_sftp_client()
        else:
            self.ssh_client  # test connection

        return self._sftp_client

    def close(self) -> None:
        """Close/release any outstanding network connections to remote account."""

        if self._ssh_client:
            self._ssh_client.close()
            self._ssh_client = None
        if self._sftp_client:
            self._sftp_client.close()
            self._sftp_client = None

    def __str__(self):
        r = ""
        if self.user:
            r += self.user + "@"
        r += self.hostname
        return r

    def __repr__(self):
        return str(self.__dict__)

    def __eq__(self, other):
        return other is not None and self.__dict__ == other.__dict__

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))

    def wait_for_http_service(self, port, headers, timeout=20, path="/"):
        """Wait until this service node is available/awake."""
        url = "http://%s:%s%s" % (self.externally_routable_ip, str(port), path)

        err_msg = (
            "Timed out trying to contact service on %s. " % url
            + "Either the service failed to start, or there is a problem with the url."
        )
        wait_until(
            lambda: self._can_ping_url(url, headers),
            timeout_sec=timeout,
            backoff_sec=0.25,
            err_msg=err_msg,
        )

    def _can_ping_url(self, url, headers):
        """See if we can successfully issue a GET request to the given url."""
        try:
            self.http_request(url, "GET", None, headers, timeout=0.75)
            return True
        except Exception:
            return False

    def available(self):
        # TODO: https://github.com/confluentinc/ducktape/issues/339
        # try:
        #     self.ssh_client
        # except Exception:
        #     return False
        # else:
        #     return True
        # finally:
        #     self.close()
        return True

    @check_ssh
    def ssh(self, cmd, allow_fail=False):
        """Run the given command on the remote host, and block until the command has finished running.

        :param cmd: The remote ssh command
        :param allow_fail: If True, ignore nonzero exit status of the remote command,
               else raise an ``RemoteCommandError``

        :return: The exit status of the command.
        :raise RemoteCommandError: If allow_fail is False and the command returns a non-zero exit status
        """
        self._log(logging.DEBUG, "Running ssh command: %s" % cmd)

        client = self.ssh_client
        stdin, stdout, stderr = client.exec_command(cmd)

        # Unfortunately we need to read over the channel to ensure that recv_exit_status won't hang. See:
        # http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.recv_exit_status
        stdout.read()
        exit_status = stdout.channel.recv_exit_status()
        try:
            if exit_status != 0:
                if not allow_fail:
                    raise RemoteCommandError(self, cmd, exit_status, stderr.read())
                else:
                    self._log(
                        logging.DEBUG,
                        "Running ssh command '%s' exited with status %d and message: %s"
                        % (cmd, exit_status, stderr.read()),
                    )
        finally:
            stdin.close()
            stdout.close()
            stderr.close()

        return exit_status

    @check_ssh
    def ssh_capture(
        self,
        cmd,
        allow_fail=False,
        callback=None,
        combine_stderr=True,
        timeout_sec=None,
    ):
        """Run the given command asynchronously via ssh, and return an SSHOutputIter object.

        Does *not* block

        :param cmd: The remote ssh command
        :param allow_fail: If True, ignore nonzero exit status of the remote command,
               else raise an ``RemoteCommandError``
        :param callback: If set, the iterator returns ``callback(line)``
               for each line of output instead of the raw output
        :param combine_stderr: If True, return output from both stderr and stdout of the remote process.
        :param timeout_sec: Set timeout on blocking reads/writes. Default None. For more details see
            http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.settimeout

        :return SSHOutputIter: object which allows iteration through each line of output.
        :raise RemoteCommandError: If ``allow_fail`` is False and the command returns a non-zero exit status
        """
        self._log(logging.DEBUG, "Running ssh command: %s" % cmd)

        client = self.ssh_client
        chan = client.get_transport().open_session(timeout=timeout_sec)

        chan.settimeout(timeout_sec)
        chan.exec_command(cmd)
        chan.set_combine_stderr(combine_stderr)

        stdin = chan.makefile("wb", -1)  # set bufsize to -1
        stdout = chan.makefile("r", -1)
        stderr = chan.makefile_stderr("r", -1)

        def output_generator():
            for line in iter(stdout.readline, ""):
                if callback is None:
                    yield line
                else:
                    yield callback(line)
            try:
                exit_status = stdout.channel.recv_exit_status()
                if exit_status != 0:
                    if not allow_fail:
                        raise RemoteCommandError(self, cmd, exit_status, stderr.read())
                    else:
                        self._log(
                            logging.DEBUG,
                            "Running ssh command '%s' exited with status %d and message: %s"
                            % (cmd, exit_status, stderr.read()),
                        )
            finally:
                stdin.close()
                stdout.close()
                stderr.close()

        return SSHOutputIter(output_generator, stdout)

    @check_ssh
    def ssh_output(self, cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
        """Runs the command via SSH and captures the output, returning it as a string.

        :param cmd: The remote ssh command.
        :param allow_fail: If True, ignore nonzero exit status of the remote command,
               else raise an ``RemoteCommandError``
        :param combine_stderr: If True, return output from both stderr and stdout of the remote process.
        :param timeout_sec: Set timeout on blocking reads/writes. Default None. For more details see
            http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.settimeout

        :return: The stdout output from the ssh command.
        :raise RemoteCommandError: If ``allow_fail`` is False and the command returns a non-zero exit status
        """
        self._log(logging.DEBUG, "Running ssh command: %s" % cmd)

        client = self.ssh_client
        chan = client.get_transport().open_session(timeout=timeout_sec)

        chan.settimeout(timeout_sec)
        chan.exec_command(cmd)
        chan.set_combine_stderr(combine_stderr)

        stdin = chan.makefile("wb", -1)  # set bufsize to -1
        stdout = chan.makefile("r", -1)
        stderr = chan.makefile_stderr("r", -1)

        try:
            stdoutdata = stdout.read()
            exit_status = stdin.channel.recv_exit_status()
            if exit_status != 0:
                if not allow_fail:
                    raise RemoteCommandError(self, cmd, exit_status, stderr.read())
                else:
                    self._log(
                        logging.DEBUG,
                        "Running ssh command '%s' exited with status %d and message: %s"
                        % (cmd, exit_status, stderr.read()),
                    )
        finally:
            stdin.close()
            stdout.close()
            stderr.close()
        self._log(logging.DEBUG, "Returning ssh command output:\n%s" % stdoutdata)
        return stdoutdata

    def alive(self, pid):
        """Return True if and only if process with given pid is alive."""
        try:
            self.ssh("kill -0 %s" % str(pid), allow_fail=False)
            return True
        except Exception:
            return False

    def signal(self, pid, sig, allow_fail=False):
        cmd = "kill -%d %s" % (int(sig), str(pid))
        self.ssh(cmd, allow_fail=allow_fail)

    def kill_process(self, process_grep_str, clean_shutdown=True, allow_fail=False):
        cmd = """ps ax | grep -i """ + process_grep_str + """ | grep -v grep | awk '{print $1}'"""
        pids = [pid for pid in self.ssh_capture(cmd, allow_fail=True)]

        if clean_shutdown:
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        for pid in pids:
            self.signal(pid, sig, allow_fail=allow_fail)

    def java_pids(self, match):
        """
        Get all the Java process IDs matching 'match'.

        :param match:               The AWK expression to match
        """
        cmd = """jcmd | awk '/%s/ { print $1 }'""" % match
        return [int(pid) for pid in self.ssh_capture(cmd, allow_fail=True)]

    def kill_java_processes(self, match, clean_shutdown=True, allow_fail=False):
        """
        Kill all the java processes matching 'match'.

        :param match:               The AWK expression to match
        :param clean_shutdown:      True if we should shut down cleanly with SIGTERM;
                                    false if we should shut down with SIGKILL.
        :param allow_fail:          True if we should throw exceptions if the ssh commands fail.
        """
        cmd = """jcmd | awk '/%s/ { print $1 }'""" % match
        pids = [pid for pid in self.ssh_capture(cmd, allow_fail=True)]

        if clean_shutdown:
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        for pid in pids:
            self.signal(pid, sig, allow_fail=allow_fail)

    def copy_between(self, src, dest, dest_node):
        """Copy src to dest on dest_node

        :param src: Path to the file or directory we want to copy
        :param dest: The destination path
        :param dest_node: The node to which we want to copy the file/directory

        Note that if src is a directory, this will automatically copy recursively.

        """
        # TODO: if dest is an existing file, what is the behavior?

        temp_dir = tempfile.mkdtemp()

        try:
            # TODO: deal with very unlikely case that src_name matches temp_dir name?
            # TODO: I think this actually works
            local_dest = self._re_anchor_basename(src, temp_dir)

            self.copy_from(src, local_dest)

            dest_node.account.copy_to(local_dest, dest)

        finally:
            if os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir)

    def scp_from(self, src, dest, recursive=False):
        warnings.warn("scp_from is now deprecated. Please use copy_from")
        self.copy_from(src, dest)

    def _re_anchor_basename(self, path, directory):
        """Anchor the basename of path onto the given directory

        Helper for the various copy_* methods.

        :param path: Path to a file or directory. Could be on the driver machine or a worker machine.
        :param directory: Path to a directory. Could be on the driver machine or a worker machine.

        Example::

            path/to/the_basename, another/path/ -> another/path/the_basename
        """
        path_basename = path

        # trim off path separator from end of path
        # this is necessary because os.path.basename of a path ending in a separator is an empty string
        # For example:
        #   os.path.basename("the/path/") == ""
        #   os.path.basename("the/path") == "path"
        if path_basename.endswith(os.path.sep):
            path_basename = path_basename[: -len(os.path.sep)]
        path_basename = os.path.basename(path_basename)

        return os.path.join(directory, path_basename)

    @check_ssh
    def copy_from(self, src, dest):
        if os.path.isdir(dest):
            # dest is an existing directory, so assuming src looks like path/to/src_name,
            # in this case we'll copy as:
            #   path/to/src_name -> dest/src_name
            dest = self._re_anchor_basename(src, dest)

        if self.isfile(src):
            self.sftp_client.get(src, dest)
        elif self.isdir(src):
            # we can now assume dest path looks like: path_that_exists/new_directory
            os.mkdir(dest)

            # for obj in `ls src`, if it's a file, copy with copy_file_from, elif its a directory, call again
            for obj in self.sftp_client.listdir(src):
                obj_path = os.path.join(src, obj)
                if self.isfile(obj_path) or self.isdir(obj_path):
                    self.copy_from(obj_path, dest)
                else:
                    # TODO what about uncopyable file types?
                    pass

    def scp_to(self, src, dest, recursive=False):
        warnings.warn("scp_to is now deprecated. Please use copy_to")
        self.copy_to(src, dest)

    @check_ssh
    def copy_to(self, src, dest):
        if self.isdir(dest):
            # dest is an existing directory, so assuming src looks like path/to/src_name,
            # in this case we'll copy as:
            #   path/to/src_name -> dest/src_name
            dest = self._re_anchor_basename(src, dest)

        if os.path.isfile(src):
            # local to remote
            self.sftp_client.put(src, dest)
        elif os.path.isdir(src):
            # we can now assume dest path looks like: path_that_exists/new_directory
            self.mkdir(dest)

            # for obj in `ls src`, if it's a file, copy with copy_file_from, elif its a directory, call again
            for obj in os.listdir(src):
                obj_path = os.path.join(src, obj)
                if os.path.isfile(obj_path) or os.path.isdir(obj_path):
                    self.copy_to(obj_path, dest)
                else:
                    # TODO what about uncopyable file types?
                    pass

    @check_ssh
    def islink(self, path):
        try:
            # stat should follow symlinks
            path_stat = self.sftp_client.lstat(path)
            return stat.S_ISLNK(path_stat.st_mode)
        except Exception:
            return False

    @check_ssh
    def isdir(self, path):
        try:
            # stat should follow symlinks
            path_stat = self.sftp_client.stat(path)
            return stat.S_ISDIR(path_stat.st_mode)
        except Exception:
            return False

    @check_ssh
    def exists(self, path):
        """Test that the path exists, but don't follow symlinks."""
        try:
            # stat follows symlinks and tries to stat the actual file
            self.sftp_client.lstat(path)
            return True
        except IOError:
            return False

    @check_ssh
    def isfile(self, path):
        """Imitates semantics of os.path.isfile

        :param path: Path to the thing to check
        :return: True if path is a file or a symlink to a file, else False. Note False can mean path does not exist.
        """
        try:
            # stat should follow symlinks
            path_stat = self.sftp_client.stat(path)
            return stat.S_ISREG(path_stat.st_mode)
        except Exception:
            return False

    def open(self, path, mode="r"):
        return self.sftp_client.open(path, mode)

    @check_ssh
    def create_file(self, path, contents):
        """Create file at path, with the given contents.

        If the path already exists, it will be overwritten.
        """
        # TODO: what should semantics be if path exists? what actually happens if it already exists?
        # TODO: what happens if the base part of the path does not exist?

        with self.sftp_client.open(path, "w") as f:
            f.write(contents)

    _DEFAULT_PERMISSIONS = int("755", 8)

    @check_ssh
    def mkdir(self, path, mode=_DEFAULT_PERMISSIONS):
        self.sftp_client.mkdir(path, mode)

    def mkdirs(self, path, mode=_DEFAULT_PERMISSIONS):
        self.ssh("mkdir -p %s && chmod %o %s" % (path, mode, path))

    def remove(self, path, allow_fail=False):
        """Remove the given file or directory"""

        if allow_fail:
            cmd = "rm -rf %s" % path
        else:
            cmd = "rm -r %s" % path

        self.ssh(cmd, allow_fail=allow_fail)

    @contextmanager
    def monitor_log(self, log):
        """
        Context manager that returns an object that helps you wait for events to
        occur in a log. This checks the size of the log at the beginning of the
        block and makes a helper object available with convenience methods for
        checking or waiting for a pattern to appear in the log. This will commonly
        be used to start a process, then wait for a log message indicating the
        process is in a ready state.

        See ``LogMonitor`` for more usage information.
        """
        try:
            offset = int(self.ssh_output("wc -c %s" % log).split()[0])
        except Exception:
            offset = 0
        yield LogMonitor(self, log, offset)


class SSHOutputIter(object):
    """Helper class that wraps around an iterable object to provide has_next() in addition to next()"""

    def __init__(self, iter_obj_func, channel_file=None):
        """
        :param iter_obj_func: A generator that returns an iterator over stdout from the remote process
        :param channel_file: A paramiko ``ChannelFile`` object
        """
        self.iter_obj_func = iter_obj_func
        self.iter_obj = iter_obj_func()
        self.channel_file = channel_file

        # sentinel is used as an indicator that there is currently nothing cached
        # If self.cached is self.sentinel, then next object from ier_obj is not yet cached.
        self.sentinel = object()
        self.cached = self.sentinel

    def __iter__(self):
        return self

    def next(self):
        if self.cached is self.sentinel:
            return next(self.iter_obj)
        next_obj = self.cached
        self.cached = self.sentinel
        return next_obj

    __next__ = next

    def has_next(self, timeout_sec=None):
        """Return True if next(iter_obj) would return another object within timeout_sec, else False.

        If timeout_sec is None, next(iter_obj) may block indefinitely.
        """
        assert timeout_sec is None or self.channel_file is not None, "should have descriptor to enforce timeout"

        prev_timeout = None
        if self.cached is self.sentinel:
            if self.channel_file is not None:
                prev_timeout = self.channel_file.channel.gettimeout()

                # when timeout_sec is None, next(iter_obj) will block indefinitely
                self.channel_file.channel.settimeout(timeout_sec)
            try:
                self.cached = next(self.iter_obj, self.sentinel)
            except socket.timeout:
                self.iter_obj = self.iter_obj_func()
                self.cached = self.sentinel
            finally:
                if self.channel_file is not None:
                    # restore preexisting timeout
                    self.channel_file.channel.settimeout(prev_timeout)

        return self.cached is not self.sentinel


class LogMonitor(object):
    """
    Helper class returned by monitor_log. Should be used as::

        with remote_account.monitor_log("/path/to/log") as monitor:
            remote_account.ssh("/command/to/start")
            monitor.wait_until("pattern.*to.*grep.*for", timeout_sec=5)

    to run the command and then wait for the pattern to appear in the log.
    """

    def __init__(self, acct, log, offset):
        self.acct = acct
        self.log = log
        self.offset = offset

    def wait_until(self, pattern, **kwargs):
        """
        Wait until the specified pattern is found in the log, after the initial
        offset recorded when the LogMonitor was created. Additional keyword args
        are passed directly to ``ducktape.utils.util.wait_until``
        """
        return wait_until(
            lambda: self.acct.ssh(
                "tail -c +%d %s | grep '%s'" % (self.offset + 1, self.log, pattern),
                allow_fail=True,
            )
            == 0,
            **kwargs,
        )


class IgnoreMissingHostKeyPolicy(MissingHostKeyPolicy):
    """Policy for ignoring missing host keys.
    Many examples show use of AutoAddPolicy, but this clutters up the known_hosts file unnecessarily.
    """

    def missing_host_key(self, client, hostname, key):
        return


================================================
FILE: ducktape/cluster/vagrant.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import json
import os
import subprocess

from ducktape.json_serializable import DucktapeJSONEncoder

from .json import JsonCluster, make_remote_account
from .remoteaccount import RemoteAccountSSHConfig


class VagrantCluster(JsonCluster):
    """
    An implementation of Cluster that uses a set of VMs created by Vagrant. Because we need hostnames that can be
    advertised, this assumes that the Vagrant VM's name is a routeable hostname on all the hosts.

    - If cluster_file is specified in the constructor's kwargs (i.e. passed via command line argument --cluster-file)
      - If cluster_file exists on the filesystem, read cluster info from the file
      - Otherwise, retrieve cluster info via "vagrant ssh-config" from vagrant and write cluster info to cluster_file
    - Otherwise, retrieve cluster info via "vagrant ssh-config" from vagrant
    """

    def __init__(self, *args, make_remote_account_func=make_remote_account, **kwargs) -> None:
        is_read_from_file = False
        self.ssh_exception_checks = kwargs.get("ssh_exception_checks")
        cluster_file = kwargs.get("cluster_file")
        if cluster_file is not None:
            try:
                cluster_json = json.load(open(os.path.abspath(cluster_file)))
                is_read_from_file = True
            except IOError:
                # It is OK if file is not found. Call vagrant ssh-info to read the cluster info.
                pass

        if not is_read_from_file:
            cluster_json = {"nodes": self._get_nodes_from_vagrant(make_remote_account_func)}

        super(VagrantCluster, self).__init__(
            cluster_json,
            *args,
            make_remote_account_func=make_remote_account_func,
            **kwargs,
        )

        # If cluster file is specified but the cluster info is not read from it, write the cluster info into the file
        if not is_read_from_file and cluster_file is not None:
            nodes = [
                {
                    "ssh_config": node_account.ssh_config,
                    "externally_routable_ip": node_account.externally_routable_ip,
                }
                for node_account in self._available_accounts
            ]
            cluster_json["nodes"] = nodes
            with open(cluster_file, "w+") as fd:
                json.dump(
                    cluster_json,
                    fd,
                    cls=DucktapeJSONEncoder,
                    indent=2,
                    separators=(",", ": "),
                    sort_keys=True,
                )

        # Release any ssh clients used in querying the nodes for metadata
        for node_account in self._available_accounts:
            node_account.close()

    def _get_nodes_from_vagrant(self, make_remote_account_func):
        ssh_config_info, error = self._vagrant_ssh_config()

        nodes = []
        node_info_arr = ssh_config_info.split("\n\n")
        node_info_arr = [ninfo.strip() for ninfo in node_info_arr if ninfo.strip()]

        for ninfo in node_info_arr:
            ssh_config = RemoteAccountSSHConfig.from_string(ninfo)

            account = None
            try:
                account = make_remote_account_func(ssh_config, ssh_exception_checks=self.ssh_exception_checks)
                externally_routable_ip = account.fetch_externally_routable_ip()
            finally:
                if account:
                    account.close()
                    del account

            nodes.append(
                {
                    "ssh_config": ssh_config.to_json(),
                    "externally_routable_ip": externally_routable_ip,
                }
            )

        return nodes

    def _vagrant_ssh_config(self):
        ssh_config_info, error = subprocess.Popen(
            "vagrant ssh-config",
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=True,
            # Force to text mode in py2/3 compatible way
            universal_newlines=True,
        ).communicate()
        return ssh_config_info, error


================================================
FILE: ducktape/cluster/windows_remoteaccount.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import logging
import os

import boto3
import winrm
from botocore.exceptions import ClientError
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA

from ducktape.cluster.consts import WINDOWS
from ducktape.cluster.remoteaccount import RemoteAccount, RemoteCommandError


class WindowsRemoteAccount(RemoteAccount):
    """
    Windows remote accounts are currently only supported in EC2. See ``_setup_winrm()`` for how the WinRM password
    is fetched, which is currently specific to AWS.

    The Windows AMI needs to also have an SSH server running to support SSH commands, SCP, and rsync.
    """

    WINRM_USERNAME = "Administrator"

    def __init__(self, *args, **kwargs):
        super(WindowsRemoteAccount, self).__init__(*args, **kwargs)
        self.os = WINDOWS
        self._winrm_client = None

    @property
    def winrm_client(self):
        # TODO: currently this only works in AWS EC2 provisioned by Vagrant. Add support for other environments.

        # check if winrm has already been setup. If yes, return immediately.
        if self._winrm_client:
            return self._winrm_client

        # first get the instance ID of this machine from Vagrant's metadata.
        ec2_instance_id_path = os.path.join(os.getcwd(), ".vagrant", "machines", self.ssh_config.host, "aws", "id")
        instance_id_file = None
        try:
            instance_id_file = open(ec2_instance_id_path, "r")
            ec2_instance_id = instance_id_file.read().strip()
            if not ec2_instance_id or ec2_instance_id == "":
                raise Exception
        except Exception:
            raise Exception("Could not extract EC2 instance ID from local file: %s" % ec2_instance_id_path)
        finally:
            if instance_id_file:
                instance_id_file.close()

        self._log(logging.INFO, "Found EC2 instance id: %s" % ec2_instance_id)

        # then get the encrypted password.
        client = boto3.client("ec2")
        try:
            response = client.get_password_data(InstanceId=ec2_instance_id)
        except ClientError as ce:
            if "InvalidInstanceID.NotFound" in str(ce):
                raise Exception(
                    "The instance id '%s' couldn't be found. Is the correct AWS region configured?" % ec2_instance_id
                )
            else:
                raise ce

        self._log(
            logging.INFO,
            "Fetched encrypted winrm password and will decrypt with private key: %s" % self.ssh_config.identityfile,
        )

        # then decrypt the password using the private key.
        key_file = None
        try:
            key_file = open(self.ssh_config.identityfile, "r")
            key = key_file.read()
            rsa_key = RSA.importKey(key)
            cipher = PKCS1_v1_5.new(rsa_key)
            winrm_password = cipher.decrypt(base64.b64decode(response["PasswordData"]), None)
            self._winrm_client = winrm.Session(
                self.ssh_config.hostname,
                auth=(WindowsRemoteAccount.WINRM_USERNAME, winrm_password),
            )
        finally:
            if key_file:
                key_file.close()

        return self._winrm_client

    def fetch_externally_routable_ip(self, is_aws=None):
        # EC2 windows machines aren't given an externally routable IP. Use the hostname instead.
        return self.ssh_config.hostname

    def run_winrm_command(self, cmd, allow_fail=False):
        self._log(logging.DEBUG, "Running winrm command: %s" % cmd)
        result = self.winrm_client.run_cmd(cmd)
        if not allow_fail and result.status_code != 0:
            raise RemoteCommandError(self, cmd, result.status_code, result.std_err)
        return result.status_code


================================================
FILE: ducktape/command_line/__init__.py
================================================


================================================
FILE: ducktape/command_line/defaults.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os


class ConsoleDefaults(object):
    # Directory for project-specific ducktape configs and runtime data
    DUCKTAPE_DIR = ".ducktape"

    # Store various bookkeeping data here
    METADATA_DIR = os.path.join(DUCKTAPE_DIR, "metadata")

    # Default path, relative to current project directory, to the project's ducktape config file
    PROJECT_CONFIG_FILE = os.path.join(DUCKTAPE_DIR, "config")

    # Default path to the user-specific config file
    USER_CONFIG_FILE = os.path.join("~", DUCKTAPE_DIR, "config")

    # Default cluster implementation
    CLUSTER_TYPE = "ducktape.cluster.vagrant.VagrantCluster"

    # Default path, relative to current project directory, to the cluster file
    CLUSTER_FILE = os.path.join(DUCKTAPE_DIR, "cluster.json")

    # Track the last-used session_id here
    SESSION_ID_FILE = os.path.join(METADATA_DIR, "session_id")

    # Folders with test reports, logs, etc all are created in this directory
    RESULTS_ROOT_DIRECTORY = "./results"

    SESSION_LOG_FORMATTER = "[%(levelname)s:%(asctime)s]: %(message)s"
    TEST_LOG_FORMATTER = "[%(levelname)-5s - %(asctime)s - %(module)s - %(funcName)s - lineno:%(lineno)s]: %(message)s"

    # Log this to indicate a test is misbehaving to help end user find which test is at fault
    BAD_TEST_MESSAGE = "BAD_TEST"

    # Ducktape will try to pick a random open port in the range [TEST_DRIVER_PORT_MIN, TEST_DRIVER_PORT_MAX]
    # this range is *inclusive*
    TEST_DRIVER_MIN_PORT = 5556
    TEST_DRIVER_MAX_PORT = 5656


================================================
FILE: ducktape/command_line/main.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import importlib
import json
import os
import random
import sys
import traceback

from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.command_line.parse_args import parse_args
from ducktape.tests.loader import LoaderException, TestLoader
from ducktape.tests.loggermaker import close_logger
from ducktape.tests.reporter import (
    FailedTestSymbolReporter,
    HTMLSummaryReporter,
    JSONReporter,
    JUnitReporter,
    SimpleFileSummaryReporter,
    SimpleStdoutSummaryReporter,
)
from ducktape.tests.runner import TestRunner
from ducktape.tests.session import (
    SessionContext,
    SessionLoggerMaker,
    generate_results_dir,
    generate_session_id,
)
from ducktape.utils import persistence
from ducktape.utils.local_filesystem_utils import mkdir_p
from ducktape.utils.util import load_function


def get_user_defined_globals(globals_str):
    """Parse user-defined globals into an immutable dict using globals_str

    :param globals_str Either a file, in which case, attempt to open the file and parse the contents as JSON,
        or a JSON string representing a JSON object. The parsed JSON must represent a collection of key-value pairs,
        i.e. a python dict.
    :return dict containing user-defined global variables
    """
    if globals_str is None:
        return persistence.make_dict()

    from_file = False
    if os.path.isfile(globals_str):
        # The string appears to be a file, so try loading JSON from file
        # This may raise an IOError if the file can't be read or a ValueError if the contents of the file
        # cannot be parsed.
        user_globals = json.loads(open(globals_str, "r").read())
        from_file = True
    else:
        try:
            # try parsing directly as json if it doesn't seem to be a file
            user_globals = json.loads(globals_str)
        except ValueError as ve:
            message = str(ve)
            message += "\nglobals parameter %s is neither valid JSON nor a valid path to a JSON file." % globals_str
            raise ValueError(message)

    # Now check that the parsed JSON is a dictionary
    if not isinstance(user_globals, dict):
        if from_file:
            message = "The JSON contained in file %s must parse to a dict. " % globals_str
        else:
            message = "JSON string referred to by globals parameter must parse to a dict. "
        message += "I.e. the contents of the JSON must be an object, not an array or primitive. "
        message += "Instead found %s, which parsed to %s" % (
            str(user_globals),
            type(user_globals),
        )

        raise ValueError(message)

    # create the immutable dict
    return persistence.make_dict(**user_globals)


def setup_results_directory(new_results_dir):
    """Make directory in which results will be stored"""
    if os.path.exists(new_results_dir):
        raise Exception("A file or directory at %s already exists. Exiting without overwriting." % new_results_dir)
    mkdir_p(new_results_dir)


def update_latest_symlink(results_root, new_results_dir):
    """Create or update symlink "latest" which points to the new test results directory"""
    latest_test_dir = os.path.join(results_root, "latest")
    if os.path.islink(latest_test_dir):
        os.unlink(latest_test_dir)
    os.symlink(new_results_dir, latest_test_dir)


def main():
    """Ducktape entry point. This contains top level logic for ducktape command-line program which does the following:

    Discover tests
    Initialize cluster for distributed services
    Run tests
    Report a summary of all results
    """
    args_dict = parse_args(sys.argv[1:])

    injected_args = None
    if args_dict["parameters"]:
        try:
            injected_args = json.loads(args_dict["parameters"])
        except ValueError as e:
            print("parameters are not valid json: " + str(e))
            sys.exit(1)

    args_dict["globals"] = get_user_defined_globals(args_dict.get("globals"))

    # Make .ducktape directory where metadata such as the last used session_id is stored
    if not os.path.isdir(ConsoleDefaults.METADATA_DIR):
        os.makedirs(ConsoleDefaults.METADATA_DIR)

    # Generate a shared 'global' identifier for this test run and create the directory
    # in which all test results will be stored
    session_id = generate_session_id(ConsoleDefaults.SESSION_ID_FILE)
    results_dir = generate_results_dir(args_dict["results_root"], session_id)
    setup_results_directory(results_dir)

    session_context = SessionContext(session_id=session_id, results_dir=results_dir, **args_dict)
    session_logger = SessionLoggerMaker(session_context).logger
    for k, v in args_dict.items():
        session_logger.debug("Configuration: %s=%s", k, v)

    # Discover and load tests to be run
    loader = TestLoader(
        session_context,
        session_logger,
        repeat=args_dict["repeat"],
        injected_args=injected_args,
        subset=args_dict["subset"],
        subsets=args_dict["subsets"],
        historical_report=args_dict["historical_report"],
    )
    try:
        tests = loader.load(args_dict["test_path"], excluded_test_symbols=args_dict["exclude"])
    except LoaderException as e:
        print("Failed while trying to discover tests: {}".format(e))
        sys.exit(1)

    expected_test_count = len(tests)
    session_logger.info(f"Discovered {expected_test_count} tests to run")

    if args_dict["collect_only"]:
        print("Collected %d tests:" % expected_test_count)
        for test in tests:
            print("    " + str(test))
        sys.exit(0)

    if args_dict["collect_num_nodes"]:
        total_nodes = sum(test.expected_num_nodes for test in tests)
        print(total_nodes)
        sys.exit(0)

    if args_dict["sample"]:
        print("Running a sample of %d tests" % args_dict["sample"])
        try:
            tests = random.sample(tests, args_dict["sample"])
        except ValueError as e:
            if args_dict["sample"] > len(tests):
                print(
                    "sample size %d greater than number of tests %d; running all tests"
                    % (args_dict["sample"], len(tests))
                )
            else:
                print("invalid sample size (%s), running all tests" % e)

    # Initializing the cluster is slow, so do so only if
    # tests are sure to be run
    try:
        (cluster_mod_name, cluster_class_name) = args_dict["cluster"].rsplit(".", 1)
        cluster_mod = importlib.import_module(cluster_mod_name)
        cluster_class = getattr(cluster_mod, cluster_class_name)

        cluster_kwargs = {"cluster_file": args_dict["cluster_file"]}
        checker_function_names = args_dict["ssh_checker_function"]
        if checker_function_names:
            checkers = [load_function(func_path) for func_path in checker_function_names]
            if checkers:
                cluster_kwargs["ssh_exception_checks"] = checkers
        cluster = cluster_class(**cluster_kwargs)
        for ctx in tests:
            # Note that we're attaching a reference to cluster
            # only after test context objects have been instantiated
            ctx.cluster = cluster
    except Exception:
        print("Failed to load cluster: ", str(sys.exc_info()[0]))
        print(traceback.format_exc(limit=16))
        sys.exit(1)

    # Run the tests
    deflake_num = args_dict["deflake"]
    if deflake_num < 1:
        session_logger.warning("specified number of deflake runs specified to be less than 1, running without deflake.")
    deflake_num = max(1, deflake_num)
    runner = TestRunner(cluster, session_context, session_logger, tests, deflake_num)
    test_results = runner.run_all_tests()

    # Report results
    reporters = [
        SimpleStdoutSummaryReporter(test_results),
        SimpleFileSummaryReporter(test_results),
        HTMLSummaryReporter(test_results, expected_test_count),
        JSONReporter(test_results),
        JUnitReporter(test_results),
        FailedTestSymbolReporter(test_results),
    ]

    for r in reporters:
        r.report()

    update_latest_symlink(args_dict["results_root"], results_dir)

    if len(test_results) < expected_test_count:
        session_logger.warning(
            f"All tests were NOT run. Expected {expected_test_count} tests, only {len(test_results)} were run."
        )
        close_logger(session_logger)
        sys.exit(1)

    close_logger(session_logger)
    if not test_results.get_aggregate_success():
        # Non-zero exit if at least one test failed
        sys.exit(1)


================================================
FILE: ducktape/command_line/parse_args.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import argparse
import itertools
import os
import sys
from typing import Dict, List, Optional, Union

from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.utils.util import ducktape_version


def create_ducktape_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Discover and run your tests")
    parser.add_argument(
        "test_path",
        metavar="test_path",
        type=str,
        nargs="*",
        default=[os.getcwd()],
        help="One or more test identifiers or test suite paths to execute",
    )
    parser.add_argument(
        "--exclude",
        type=str,
        nargs="*",
        default=None,
        help="one or more space-delimited strings indicating which tests to exclude",
    )
    parser.add_argument(
        "--collect-only",
        action="store_true",
        help="display collected tests, but do not run.",
    )
    parser.add_argument(
        "--collect-num-nodes",
        action="store_true",
        help="display total number of nodes requested by all tests, but do not run anything.",
    )
    parser.add_argument("--debug", action="store_true", help="pipe more verbose test output to stdout.")
    parser.add_argument(
        "--config-file",
        action="store",
        default=ConsoleDefaults.USER_CONFIG_FILE,
        help="path to project-specific configuration file.",
    )
    parser.add_argument(
        "--compress",
        action="store_true",
        help="compress remote logs before collection.",
    )
    parser.add_argument(
        "--cluster",
        action="store",
        default=ConsoleDefaults.CLUSTER_TYPE,
        help="cluster class to use to allocate nodes for tests.",
    )
    parser.add_argument(
        "--default-num-nodes",
        action="store",
        type=int,
        default=None,
        help="Global hint for cluster usage. A test without the @cluster annotation will "
        "default to this value for expected cluster usage.",
    )
    parser.add_argument(
        "--cluster-file",
        action="store",
        default=None,
        help="path to a json file which provides information needed to initialize a json cluster. "
        "The file is used to read/write cached cluster info if "
        "cluster is ducktape.cluster.vagrant.VagrantCluster.",
    )
    parser.add_argument(
        "--results-root",
        action="store",
        default=ConsoleDefaults.RESULTS_ROOT_DIRECTORY,
        help="path to custom root results directory. Running ducktape with this root "
        "specified will result in new test results being stored in a subdirectory of "
        "this root directory.",
    )
    parser.add_argument("--exit-first", action="store_true", help="exit after first failure")
    parser.add_argument(
        "--no-teardown",
        action="store_true",
        help="don't kill running processes or remove log files when a test has finished running. "
        "This is primarily useful for test developers who want to interact with running "
        "services after a test has run.",
    )
    parser.add_argument("--version", action="store_true", help="display version")
    parser.add_argument(
        "--parameters",
        action="store",
        help="inject these arguments into the specified test(s). Specify parameters as a JSON string.",
    )
    parser.add_argument(
        "--globals",
        action="store",
        help="user-defined globals go here. "
        "This can be a file containing a JSON object, or a string representing a JSON object.",
    )
    parser.add_argument(
        "--max-parallel",
        action="store",
        type=int,
        default=1,
        help="Upper bound on number of tests run simultaneously.",
    )
    parser.add_argument(
        "--repeat",
        action="store",
        type=int,
        default=1,
        help="Use this flag to repeat all discovered tests the given number of times.",
    )
    parser.add_argument(
        "--subsets",
        action="store",
        type=int,
        default=1,
        help="Number of subsets of tests to statically break the tests into to allow for parallel "
        "execution without coordination between test runner processes.",
    )
    parser.add_argument(
        "--subset",
        action="store",
        type=int,
        default=0,
        help="Which subset of the tests to run, based on the breakdown using the parameter for --subsets",
    )
    parser.add_argument(
        "--historical-report",
        action="store",
        type=str,
        help="URL of a JSON report file containing stats from a previous test run. If specified, "
        "this will be used when creating subsets of tests to divide evenly by total run time "
        "instead of by number of tests.",
    )
    parser.add_argument(
        "--skip-nodes-allocation",
        action="store_true",
        help="Use this flag to skip allocating "
        "nodes for services. Can be used when running specific tests on a running platform",
    )
    parser.add_argument(
        "--sample",
        action="store",
        type=int,
        help="The size of a random test sample to run",
    )
    parser.add_argument(
        "--fail-bad-cluster-utilization",
        action="store_true",
        help="Fail a test if the test declared that it needs more nodes than it actually used. "
        "E.g. if the test had `@cluster(num_nodes=10)` annotation, "
        "but never used more than 5 nodes during its execution.",
    )
    parser.add_argument(
        "--fail-greedy-tests",
        action="store_true",
        help="Fail a test if it has no @cluster annotation "
        "or if @cluster annotation is empty. "
        "You can still specify 0-sized cluster explicitly using either num_nodes=0 "
        "or cluster_spec=ClusterSpec.empty()",
    )
    parser.add_argument(
        "--test-runner-timeout",
        action="store",
        type=int,
        default=1800000,
        help="Amount of time in milliseconds between test communicating between the test runner"
        " before a timeout error occurs. Default is 30 minutes",
    )
    (
        parser.add_argument(
            "--ssh-checker-function",
            action="store",
            type=str,
            nargs="+",
            help="Python module path(s) to a function that takes an exception and a remote account"
            " that will be called when an ssh error occurs, this can give some "
            "validation or better logging when an ssh error occurs. Specify any "
            "number of module paths after this flag to be called.",
        ),
    )
    parser.add_argument(
        "--deflake",
        action="store",
        type=int,
        default=1,
        help="the number of times a failed test should be ran in total (including its initial run) "
        "to determine flakyness. When not present, deflake will not be used, "
        "and a test will be marked as either passed or failed. "
        "When enabled tests will be marked as flaky if it passes on any of the reruns",
    )
    parser.add_argument(
        "--enable-jvm-logs",
        action="store_true",
        help="Enable automatic JVM log collection for Java-based services (Kafka, ZooKeeper, Connect, etc.)",
    )
    return parser


def get_user_config_file(args: List[str]) -> str:
    """Helper function to get specified (or default) user config file.
    :return Filename which is the path to the config file.
    """
    parser = create_ducktape_parser()
    config_file = vars(parser.parse_args(args))["config_file"]
    assert config_file is not None
    return os.path.expanduser(config_file)


def config_file_to_args_list(config_file):
    """Parse in contents of config file, and return a list of command-line options parseable by the ducktape parser.

    Skip whitespace lines and comments (lines prefixed by "#")
    """
    if config_file is None:
        raise RuntimeError("config_file is None")

    # Read in configuration, but ignore empty lines and comments
    config_lines = [
        line for line in open(config_file).readlines() if (len(line.strip()) > 0 and line.lstrip()[0] != "#")
    ]

    return list(itertools.chain(*[line.split() for line in config_lines]))


def parse_non_default_args(parser: argparse.ArgumentParser, defaults: dict, args: list) -> dict:
    """
    Parse and remove default args from a list of args, and return the dict of the parsed args.
    """
    parsed_args = vars(parser.parse_args(args))

    # remove defaults
    for key, value in defaults.items():
        if parsed_args[key] == value:
            del parsed_args[key]

    return parsed_args


def parse_args(
    args: List[str],
) -> Dict[str, Optional[Union[List[str], bool, str, int]]]:
    """Parse in command-line and config file options.

    Command line arguments have the highest priority, then user configs specified in ~/.ducktape/config, and finally
    project configs specified in <ducktape_dir>/config.
    """

    parser = create_ducktape_parser()

    if len(args) == 0:
        # Show help if there are no arguments
        parser.print_help()
        sys.exit(0)

    # Collect arguments from project config file, user config file, and command line
    # later arguments supersede earlier arguments
    parsed_args_list = []

    # First collect all the default values
    defaults = vars(parser.parse_args([]))

    project_config_file = ConsoleDefaults.PROJECT_CONFIG_FILE
    # Load all non-default args from project config file.
    if os.path.exists(project_config_file):
        parsed_args_list.append(parse_non_default_args(parser, defaults, config_file_to_args_list(project_config_file)))

    # Load all non-default args from user config file.
    user_config_file = get_user_config_file(args)
    if os.path.exists(user_config_file):
        parsed_args_list.append(parse_non_default_args(parser, defaults, config_file_to_args_list(user_config_file)))

    # Load all non-default args from the command line.
    parsed_args_list.append(parse_non_default_args(parser, defaults, args))

    # Don't need to copy, done with the defaults dict.
    # Start with the default args, and layer on changes.
    parsed_args_dict = defaults
    for parsed_args in parsed_args_list:
        parsed_args_dict.update(parsed_args)

    if parsed_args_dict["version"]:
        print(ducktape_version())
        sys.exit(0)
    return parsed_args_dict


================================================
FILE: ducktape/errors.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class DucktapeError(RuntimeError):
    pass


class TimeoutError(DucktapeError):
    pass


================================================
FILE: ducktape/json_serializable.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from json import JSONEncoder


class DucktapeJSONEncoder(JSONEncoder):
    def default(self, obj):
        if hasattr(obj, "to_json"):
            # to_json may return a dict or array or other naturally json serializable object
            # this allows serialization to work correctly on nested items
            return obj.to_json()
        else:
            # Let the base class default method raise the TypeError
            return JSONEncoder.default(self, obj)


================================================
FILE: ducktape/jvm_logging.py
================================================
# Copyright 2024 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
JVM logging support for Ducktape.

This module provides automatic JVM log collection for Java-based services without requiring any
code changes to services or tests.

Please Note: We are prepending JVM options to the SSH cmd. If any option is injected again as part of cmd, it will
override the option injected from this module.
For example, if a test or service injects its own -Xlog options, it may override the GC logging options injected
by this module. In practice, Services should work as expected.
"""

import os
import types


class JVMLogger:
    """Handles JVM logging configuration and enablement for services."""

    def __init__(self, log_dir="/mnt/jvm_logs"):
        """
        Initialize JVM logger.

        :param log_dir: Directory for JVM logs on worker nodes
        """
        self.log_dir = log_dir

    def enable_for_service(self, service):
        """
        Enable JVM logging for a service instance. Adds JVM log definitions and helper methods to the service.
        Wraps start_node so that when nodes are started, JVM logging is set up and JVM options are injected via SSH,
        and wraps clean_node to clean up JVM logs after node cleanup.
        :param service: Service instance to enable JVM logging for
        """
        # Store reference to JVMLogger instance for use in closures
        jvm_logger = self

        # Add JVM log definitions
        jvm_logs = {
            "jvm_gc_log": {"path": os.path.join(jvm_logger.log_dir, "gc.log"), "collect_default": True},
            "jvm_stdout_stderr": {"path": os.path.join(jvm_logger.log_dir, "jvm.log"), "collect_default": True},
            "jvm_heap_dump": {
                "path": os.path.join(jvm_logger.log_dir, "heap_dump.hprof"),
                "collect_default": False,  # Only on failure
            },
        }

        # Initialize logs dict if needed
        if not hasattr(service, "logs") or service.logs is None:
            service.logs = {}

        # Merge with existing logs
        service.logs.update(jvm_logs)

        # Add helper methods
        service.JVM_LOG_DIR = jvm_logger.log_dir
        service.jvm_options = lambda node: jvm_logger._get_jvm_options()
        service.setup_jvm_logging = lambda node: jvm_logger._setup_on_node(node)
        service.clean_jvm_logs = lambda node: jvm_logger._cleanup_on_node(node)

        # Wrap start_node to automatically setup JVM logging and wrap SSH
        original_start_node = service.start_node

        def wrapped_start_node(self, node, *args, **kwargs):
            jvm_logger._setup_on_node(node)  # Setup JVM log directory

            # Wrap all SSH methods to inject JDK_JAVA_OPTIONS, wrap once and keep active for entire service lifecycle
            if not hasattr(node.account, "original_ssh"):
                original_ssh = node.account.ssh
                original_ssh_capture = node.account.ssh_capture
                original_ssh_output = node.account.ssh_output

                node.account.original_ssh = original_ssh
                node.account.original_ssh_capture = original_ssh_capture
                node.account.original_ssh_output = original_ssh_output

                jvm_opts = jvm_logger._get_jvm_options()
                # Use env command and append to existing JDK_JAVA_OPTIONS
                env_prefix = f'env JDK_JAVA_OPTIONS="${{JDK_JAVA_OPTIONS:-}} {jvm_opts}" '

                def wrapped_ssh(cmd, allow_fail=False):
                    return original_ssh(env_prefix + cmd, allow_fail=allow_fail)

                def wrapped_ssh_capture(cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None):
                    return original_ssh_capture(
                        env_prefix + cmd,
                        allow_fail=allow_fail,
                        callback=callback,
                        combine_stderr=combine_stderr,
                        timeout_sec=timeout_sec,
                    )

                def wrapped_ssh_output(cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
                    return original_ssh_output(
                        env_prefix + cmd, allow_fail=allow_fail, combine_stderr=combine_stderr, timeout_sec=timeout_sec
                    )

                node.account.ssh = wrapped_ssh
                node.account.ssh_capture = wrapped_ssh_capture
                node.account.ssh_output = wrapped_ssh_output

            return original_start_node(node, *args, **kwargs)

        # Bind the wrapper function to the service object
        service.start_node = types.MethodType(wrapped_start_node, service)

        # Wrap clean_node to cleanup JVM logs and restore SSH methods
        original_clean_node = service.clean_node

        def wrapped_clean_node(self, node, *args, **kwargs):
            result = original_clean_node(node, *args, **kwargs)
            jvm_logger._cleanup_on_node(node)

            # Restore original SSH methods
            if hasattr(node.account, "original_ssh"):
                node.account.ssh = node.account.original_ssh
                node.account.ssh_capture = node.account.original_ssh_capture
                node.account.ssh_output = node.account.original_ssh_output
                del node.account.original_ssh
                del node.account.original_ssh_capture
                del node.account.original_ssh_output

            return result

        # Bind the wrapper function to the service instance
        service.clean_node = types.MethodType(wrapped_clean_node, service)

    def _get_jvm_options(self):
        """Generate JVM options string for logging."""
        gc_log = os.path.join(self.log_dir, "gc.log")
        heap_dump = os.path.join(self.log_dir, "heap_dump.hprof")
        error_log = os.path.join(self.log_dir, "hs_err_pid%p.log")
        jvm_log = os.path.join(self.log_dir, "jvm.log")

        jvm_logging_opts = [
            "-Xlog:disable",  # Suppress all default JVM console logging to prevent output pollution
            f"-Xlog:gc*:file={gc_log}:time,uptime,level,tags",  # GC activity with timestamps
            "-XX:+HeapDumpOnOutOfMemoryError",  # Generate heap dump on OOM
            f"-XX:HeapDumpPath={heap_dump}",  # Heap dump file location
            f"-Xlog:safepoint=info:file={jvm_log}:time,uptime,level,tags",  # Safepoint pause events
            f"-Xlog:class+load=info:file={jvm_log}:time,uptime,level,tags",  # Class loading events
            f"-XX:ErrorFile={error_log}",  # Fatal error log location (JVM crashes)
            "-XX:NativeMemoryTracking=summary",  # Track native memory usage
            f"-Xlog:jit+compilation=info:file={jvm_log}:time,uptime,level,tags",  # JIT compilation events
        ]

        return " ".join(jvm_logging_opts)

    def _setup_on_node(self, node):
        """Create JVM log directory on worker node."""
        node.account.ssh(f"mkdir -p {self.log_dir}")
        node.account.ssh(f"chmod 755 {self.log_dir}")

    def _cleanup_on_node(self, node):
        """Clean JVM logs from worker node."""
        node.account.ssh(f"rm -rf {self.log_dir}", allow_fail=True)


================================================
FILE: ducktape/mark/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ._mark import matrix  # NOQA
from ._mark import defaults, env, ignore, ignored, is_env, parametrize, parametrized

__all__ = [
    "matrix",
    "defaults",
    "env",
    "ignore",
    "ignored",
    "is_env",
    "parametrize",
    "parametrized",
]


================================================
FILE: ducktape/mark/_mark.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import functools
import itertools
import os

from ducktape.errors import DucktapeError


class Mark(object):
    """Common base class for "marks" which may be applied to test functions/methods."""

    @staticmethod
    def mark(fun, mark):
        """Attach a tag indicating that fun has been marked with the given mark

        Marking fun updates it with two attributes:

        - marks:      a list of mark objects applied to the function. These may be strings or objects subclassing Mark
                      we use a list because in some cases, it is useful to preserve ordering.
        - mark_names: a set of names of marks applied to the function
        """
        # Update fun.marks
        if hasattr(fun, "marks"):
            fun.marks.append(mark)
        else:
            fun.__dict__["marks"] = [mark]

        # Update fun.mark_names
        if hasattr(fun, "mark_names"):
            fun.mark_names.add(mark.name)
        else:
            fun.__dict__["mark_names"] = {mark.name}

    @staticmethod
    def marked(f, mark):
        if f is None:
            return False

        if not hasattr(f, "mark_names"):
            return False

        return mark.name in f.mark_names

    @staticmethod
    def clear_marks(f):
        if not hasattr(f, "marks"):
            return

        del f.__dict__["marks"]
        del f.__dict__["mark_names"]

    @property
    def name(self):
        return "MARK"

    def apply(self, seed_context, context_list):
        raise NotImplementedError("Subclasses should implement apply")

    def __eq__(self, other):
        if not isinstance(self, type(other)):
            return False

        return self.name == other.name


class Ignore(Mark):
    """Ignore a specific parametrization of test."""

    def __init__(self, **kwargs):
        # Ignore tests with injected_args matching self.injected_args
        self.injected_args = kwargs

    @property
    def name(self):
        return "IGNORE"

    def apply(self, seed_context, context_list):
        assert len(context_list) > 0, "ignore annotation is not being applied to any test cases"
        for ctx in context_list:
            ctx.ignore = ctx.ignore or self.injected_args is None or self.injected_args == ctx.injected_args
        return context_list

    def __eq__(self, other):
        return super(Ignore, self).__eq__(other) and self.injected_args == other.injected_args


class IgnoreAll(Ignore):
    """This mark signals to ignore all parametrizations of a test."""

    def __init__(self):
        super(IgnoreAll, self).__init__()
        self.injected_args = None


class Matrix(Mark):
    """Parametrize with a matrix of arguments.
    Assume each values in self.injected_args is iterable
    """

    def __init__(self, **kwargs):
        self.injected_args = kwargs
        for k in self.injected_args:
            try:
                iter(self.injected_args[k])
            except TypeError as te:
                raise DucktapeError("Expected all values in @matrix decorator to be iterable: " + str(te))

    @property
    def name(self):
        return "MATRIX"

    def apply(self, seed_context, context_list):
        for injected_args in cartesian_product_dict(self.injected_args):
            injected_fun = _inject(**injected_args)(seed_context.function)
            context_list.insert(0, seed_context.copy(function=injected_fun, injected_args=injected_args))

        return context_list

    def __eq__(self, other):
        return super(Matrix, self).__eq__(other) and self.injected_args == other.injected_args


class Defaults(Mark):
    """Parametrize with a default matrix of arguments on existing parametrizations.
    Assume each values in self.injected_args is iterable
    """

    def __init__(self, **kwargs):
        self.injected_args = kwargs
        for k in self.injected_args:
            try:
                iter(self.injected_args[k])
            except TypeError as te:
                raise DucktapeError("Expected all values in @defaults decorator to be iterable: " + str(te))

    @property
    def name(self):
        return "DEFAULTS"

    def apply(self, seed_context, context_list):
        new_context_list = []
        if context_list:
            for ctx in context_list:
                for injected_args in cartesian_product_dict(
                    {arg: self.injected_args[arg] for arg in self.injected_args if arg not in ctx.injected_args}
                ):
                    injected_args.update(ctx.injected_args)
                    injected_fun = _inject(**injected_args)(seed_context.function)
                    new_context = seed_context.copy(
                        function=injected_fun,
                        injected_args=injected_args,
                        cluster_use_metadata=ctx.cluster_use_metadata,
                    )
                    new_context_list.insert(0, new_context)
        else:
            for injected_args in cartesian_product_dict(self.injected_args):
                injected_fun = _inject(**injected_args)(seed_context.function)
                new_context_list.insert(
                    0,
                    seed_context.copy(function=injected_fun, injected_args=injected_args),
                )

        return new_context_list

    def __eq__(self, other):
        return super(Defaults, self).__eq__(other) and self.injected_args == other.injected_args


class Parametrize(Mark):
    """Parametrize a test function"""

    def __init__(self, **kwargs):
        self.injected_args = kwargs

    @property
    def name(self):
        return "PARAMETRIZE"

    def apply(self, seed_context, context_list):
        injected_fun = _inject(**self.injected_args)(seed_context.function)
        context_list.insert(
            0,
            seed_context.copy(function=injected_fun, injected_args=self.injected_args),
        )
        return context_list

    def __eq__(self, other):
        return super(Parametrize, self).__eq__(other) and self.injected_args == other.injected_args


class Env(Mark):
    def __init__(self, **kwargs):
        self.injected_args = kwargs
        self.should_ignore = any(os.environ.get(key) != value for key, value in kwargs.items())

    @property
    def name(self):
        return "ENV"

    def apply(self, seed_context, context_list):
        for ctx in context_list:
            ctx.ignore = ctx.ignore or self.should_ignore

        return context_list

    def __eq__(self, other):
        return super(Env, self).__eq__(other) and self.injected_args == other.injected_args


PARAMETRIZED = Parametrize()
MATRIX = Matrix()
DEFAULTS = Defaults()
IGNORE = Ignore()
ENV = Env()


def _is_parametrize_mark(m):
    return m.name == PARAMETRIZED.name or m.name == MATRIX.name or m.name == DEFAULTS.name


def parametrized(f):
    """Is this function or object decorated with @parametrize or @matrix?"""
    return Mark.marked(f, PARAMETRIZED) or Mark.marked(f, MATRIX) or Mark.marked(f, DEFAULTS)


def ignored(f):
    """Is this function or object decorated with @ignore?"""
    return Mark.marked(f, IGNORE)


def is_env(f):
    return Mark.marked(f, ENV)


def cartesian_product_dict(d):
    """Return the "cartesian product" of this dictionary's values.
    d is assumed to be a dictionary, where each value in the dict is a list of values

    Example::

        {
            "x": [1, 2],
            "y": ["a", "b"]
        }

        expand this into a list of dictionaries like so:

        [
            {
                "x": 1,
                "y": "a"
            },
            {
                "x": 1,
                "y": "b"
            },
            {
                "x": 2,
                "y": "a"
            },
            {
                "x": 2,
                "y", "b"
            }
        ]
    """
    # Establish an ordering of the keys
    key_list = [k for k in d.keys()]

    expanded = []
    values_list = [d[k] for k in key_list]  # list of lists
    for v in itertools.product(*values_list):
        # Iterate through the cartesian product of the lists of values
        # One dictionary per element in this cartesian product
        new_dict = {}
        for i in range(len(key_list)):
            new_dict[key_list[i]] = v[i]
        expanded.append(new_dict)
    return expanded


def matrix(**kwargs):
    """Function decorator used to parametrize with a matrix of values.
    Decorating a function or method with ``@matrix`` marks it with the Matrix mark. When expanded using the
    ``MarkedFunctionExpander``, it yields a list of TestContext objects, one for every possible combination
    of arguments.

    Example::

        @matrix(x=[1, 2], y=[-1, -2])
        def g(x, y):
            print "x = %s, y = %s" % (x, y)

        for ctx in MarkedFunctionExpander(..., function=g, ...).expand():
            ctx.function()

        # output:
        # x = 1, y = -1
        # x = 1, y = -2
        # x = 2, y = -1
        # x = 2, y = -2
    """

    def parametrizer(f):
        Mark.mark(f, Matrix(**kwargs))
        return f

    return parametrizer


def defaults(**kwargs):
    """Function decorator used to parametrize with a default matrix of values.
    Decorating a function or method with ``@defaults`` marks it with the Defaults mark. When expanded using the
    ``MarkedFunctionExpander``, it yields a list of TestContext objects, one for every possible combination
    of defaults combined with ``@matrix`` and ``@parametrize``. If there are overlap between defaults
    and parametrization, defaults will not be applied.

    Example::

        @defaults(z=[1, 2])
        @matrix(x=[1], y=[1, 2])
        @parametrize(x=3, y=4)
        @parametrize(x=3, y=4, z=999)
        def g(x, y, z):
            print "x = %s, y = %s" % (x, y)

        for ctx in MarkedFunctionExpander(..., function=g, ...).expand():
            ctx.function()

        # output:
        # x = 1, y = 1, z = 1
        # x = 1, y = 1, z = 2
        # x = 1, y = 2, z = 1
        # x = 1, y = 2, z = 2
        # x = 3, y = 4, z = 1
        # x = 3, y = 4, z = 2
        # x = 3, y = 4, z = 999
    """

    def parametrizer(f):
        Mark.mark(f, Defaults(**kwargs))
        return f

    return parametrizer


def parametrize(**kwargs):
    """Function decorator used to parametrize its arguments.
    Decorating a function or method with ``@parametrize`` marks it with the Parametrize mark.

    Example::

        @parametrize(x=1, y=2 z=-1)
        @parametrize(x=3, y=4, z=5)
        def g(x, y, z):
            print "x = %s, y = %s, z = %s" % (x, y, z)

        for ctx in MarkedFunctionExpander(..., function=g, ...).expand():
            ctx.function()

        # output:
        # x = 1, y = 2, z = -1
        # x = 3, y = 4, z = 5
    """

    def parametrizer(f):
        Mark.mark(f, Parametrize(**kwargs))
        return f

    return parametrizer


def ignore(*args, **kwargs):
    """
    Test method decorator which signals to the test runner to ignore a given test.

    Example::

        When no parameters are provided to the @ignore decorator, ignore all parametrizations of the test function

        @ignore  # Ignore all parametrizations
        @parametrize(x=1, y=0)
        @parametrize(x=2, y=3)
        def the_test(...):
            ...

    Example::

        If parameters are supplied to the @ignore decorator, only ignore the parametrization with matching parameter(s)

        @ignore(x=2, y=3)
        @parametrize(x=1, y=0)  # This test will run as usual
        @parametrize(x=2, y=3)  # This test will be ignored
        def the_test(...):
            ...
    """
    if len(args) == 1 and len(kwargs) == 0:
        # this corresponds to the usage of the decorator with no arguments
        # @ignore
        # def test_function:
        #   ...
        Mark.mark(args[0], IgnoreAll())
        return args[0]

    # this corresponds to usage of @ignore with arguments
    def ignorer(f):
        Mark.mark(f, Ignore(**kwargs))
        return f

    return ignorer


def env(**kwargs):
    def environment(f):
        Mark.mark(f, Env(**kwargs))
        return f

    return environment


def _inject(*args, **kwargs):
    """Inject variables into the arguments of a function or method.
    This is almost identical to decorating with functools.partial, except we also propagate the wrapped
    function's __name__.
    """

    def injector(f):
        assert callable(f)

        @functools.wraps(f)
        def wrapper(*w_args, **w_kwargs):
            return functools.partial(f, *args, **kwargs)(*w_args, **w_kwargs)

        wrapper.args = args
        wrapper.kwargs = kwargs
        wrapper.function = f

        return wrapper

    return injector


================================================
FILE: ducktape/mark/consts.py
================================================
CLUSTER_SPEC_KEYWORD = "cluster_spec"
CLUSTER_SIZE_KEYWORD = "num_nodes"
CLUSTER_NODE_TYPE_KEYWORD = "node_type"


================================================
FILE: ducktape/mark/mark_expander.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ducktape.tests.test_context import TestContext

from ._mark import Parametrize, _is_parametrize_mark, parametrized


class MarkedFunctionExpander(object):
    """This class helps expand decorated/marked functions into a list of test context objects."""

    def __init__(
        self,
        session_context=None,
        module=None,
        cls=None,
        function=None,
        file=None,
        cluster=None,
    ):
        self.seed_context = TestContext(
            session_context=session_context,
            module=module,
            cls=cls,
            function=function,
            file=file,
            cluster=cluster,
        )

        if parametrized(function):
            self.context_list = []
        else:
            self.context_list = [self.seed_context]

    def expand(self, test_parameters=None):
        """Inspect self.function for marks, and expand into a list of test context objects useable by the test runner."""
        f = self.seed_context.function

        # If the user has specified that they want to run tests with specific parameters, apply the parameters first,
        # then subsequently strip any parametrization decorators. Otherwise, everything gets applied normally.
        if test_parameters is not None:
            self.context_list = Parametrize(**test_parameters).apply(self.seed_context, self.context_list)

        for m in getattr(f, "marks", []):
            if test_parameters is None or not _is_parametrize_mark(m):
                self.context_list = m.apply(self.seed_context, self.context_list)

        return self.context_list


================================================
FILE: ducktape/mark/resource.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from typing import Callable, List

from ducktape.mark._mark import Mark
from ducktape.tests.test_context import TestContext


class ClusterUseMetadata(Mark):
    """Provide a hint about how a given test will use the cluster."""

    def __init__(self, **kwargs) -> None:
        # shallow copy
        self.metadata = copy.copy(kwargs)

    @property
    def name(self) -> str:
        return "RESOURCE_HINT_CLUSTER_USE"

    def apply(self, seed_context: TestContext, context_list: List[TestContext]) -> List[TestContext]:
        assert len(context_list) > 0, "cluster use annotation is not being applied to any test cases"

        for ctx in context_list:
            if not ctx.cluster_use_metadata:
                # only update if non-None and non-empty
                ctx.cluster_use_metadata = self.metadata
        return context_list


def cluster(**kwargs) -> Callable:
    """Test method decorator used to provide hints about how the test will use the given cluster.

    If this decorator is not provided, the test will either claim all cluster resources or fail immediately,
    depending on the flags passed to ducktape.


    :Keywords used by ducktape:

        - ``num_nodes`` provide hint about how many nodes the test will consume
        - ``node_type`` provide hint about what type of nodes the test needs (e.g., "large", "small")
        - ``cluster_spec`` provide hint about how many nodes of each type the test will consume


    Example::

        # basic usage with num_nodes
        @cluster(num_nodes=10)
        def the_test(...):
            ...

        # usage with num_nodes and node_type
        @cluster(num_nodes=5, node_type="large")
        def the_test(...):
            ...

        # basic usage with cluster_spec
        @cluster(cluster_spec=ClusterSpec.simple_linux(10))
        def the_test(...):
            ...

        # parametrized test:
        # both test cases will be marked with cluster_size of 200
        @cluster(num_nodes=200)
        @parametrize(x=1)
        @parametrize(x=2)
        def the_test(x):
            ...

        # test case {'x': 1} has cluster size 100, test case {'x': 2} has cluster size 200
        @cluster(num_nodes=100)
        @parametrize(x=1)
        @cluster(num_nodes=200)
        @parametrize(x=2)
        def the_test(x):
            ...

    """

    def cluster_use_metadata_adder(f):
        Mark.mark(f, ClusterUseMetadata(**kwargs))
        return f

    return cluster_use_metadata_adder


================================================
FILE: ducktape/services/__init__.py
================================================


================================================
FILE: ducktape/services/background_thread.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
import traceback

from ducktape.services.service import Service


class BackgroundThreadService(Service):
    def __init__(self, context, num_nodes=None, cluster_spec=None, *args, **kwargs):
        super(BackgroundThreadService, self).__init__(context, num_nodes, cluster_spec, *args, **kwargs)
        self.worker_threads = {}
        self.worker_errors = {}
        self.errors = ""
        self.lock = threading.RLock()

    def _protected_worker(self, idx, node):
        """Protected worker captures exceptions and makes them available to the main thread.

        This gives us the ability to propagate exceptions thrown in background threads, if desired.
        """
        try:
            self._worker(idx, node)
        except BaseException:
            with self.lock:
                self.logger.info("BackgroundThreadService threw exception: ")
                tb = traceback.format_exc()
                self.logger.info(tb)
                self.worker_errors[threading.currentThread().name] = tb
                if self.errors:
                    self.errors += "\n"
                self.errors += "%s: %s" % (threading.currentThread().name, tb)

            raise

    def start_node(self, node):
        idx = self.idx(node)

        if idx in self.worker_threads and self.worker_threads[idx].is_alive():
            raise RuntimeError("Cannot restart node since previous thread is still alive")

        self.logger.info("Running %s node %d on %s", self.service_id, idx, node.account.hostname)
        worker = threading.Thread(
            name=self.service_id + "-worker-" + str(idx),
            target=self._protected_worker,
            args=(idx, node),
        )
        worker.daemon = True
        worker.start()
        self.worker_threads[idx] = worker

    def wait(self, timeout_sec=600):
        """Wait no more than timeout_sec for all worker threads to finish.

        raise TimeoutException if all worker threads do not finish within timeout_sec
        """
        super(BackgroundThreadService, self).wait(timeout_sec)

        self._propagate_exceptions()

    def stop(self):
        alive_workers = [worker for worker in self.worker_threads.values() if worker.is_alive()]
        if len(alive_workers) > 0:
            self.logger.debug("Called stop with at least one worker thread is still running: " + str(alive_workers))

            self.logger.debug("%s" % str(self.worker_threads))

        super(BackgroundThreadService, self).stop()

        self._propagate_exceptions()

    def wait_node(self, node, timeout_sec=600):
        idx = self.idx(node)
        worker_thread = self.worker_threads.get(idx)
        # worker thread can be absent if this node has never been started
        if worker_thread:
            worker_thread.join(timeout_sec)
            return not (worker_thread.is_alive())
        else:
            self.logger.debug(f"Worker thread not found for {self.who_am_i(node)}")
            return True

    def _propagate_exceptions(self):
        """
        Propagate exceptions thrown in background threads
        """
        with self.lock:
            if len(self.worker_errors) > 0:
                raise Exception(self.errors)


================================================
FILE: ducktape/services/service.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
import tempfile
import time
from typing import Any, Dict

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.errors import TimeoutError
from ducktape.template import TemplateRenderer


class ServiceIdFactory:
    def generate_service_id(self, service):
        return "{service_name}-{service_number}-{service_id}".format(
            service_name=service.__class__.__name__,
            service_number=service._order,
            service_id=id(service),
        )


class MultiRunServiceIdFactory:
    def __init__(self, run_number=1):
        self.run_number = run_number

    def generate_service_id(self, service):
        return "{run_number}-{service_name}-{service_number}-{service_id}".format(
            run_number=self.run_number,
            service_name=service.__class__.__name__,
            service_number=service._order,
            service_id=id(service),
        )


service_id_factory = ServiceIdFactory()


class Service(TemplateRenderer):
    """Service classes know how to deploy a service onto a set of nodes and then clean up after themselves.

    They request the necessary resources from the cluster,
    configure each node, and bring up/tear down the service.

    They also expose
    information about the service so that other services or test scripts can
    easily be configured to work with them. Finally, they may be able to collect
    and check logs/output from the service, which can be helpful in writing tests
    or benchmarks.

    Services should generally be written to support an arbitrary number of nodes,
    even if instances are independent of each other. They should be able to assume
    that there won't be resource conflicts: the cluster tests are being run on
    should be large enough to use one instance per service instance.
    """

    # Provides a mechanism for locating and collecting log files produced by the service on its nodes.
    # logs is a dict with entries that look like log_name: {"path": log_path, "collect_default": boolean}
    #
    # For example, zookeeper service might have self.logs like this:
    # self.logs = {
    #    "zk_log": {"path": "/mnt/zk.log",
    #               "collect_default": True}
    # }
    logs: Dict[str, Dict[str, Any]] = {}

    def __init__(self, context, num_nodes=None, cluster_spec=None, *args, **kwargs):
        """
        Initialize the Service.

        Note: only one of (num_nodes, cluster_spec) may be set.

        :param context:         An object which has at minimum 'cluster' and 'logger' attributes. In tests, this
                                is always a TestContext object.
        :param num_nodes:       An integer representing the number of Linux nodes to allocate.
        :param cluster_spec:    A ClusterSpec object representing the minimum cluster specification needed.
        """
        super(Service, self).__init__(*args, **kwargs)
        # Keep track of significant events in the lifetime of this service
        self._init_time = time.time()
        self._start_time = -1
        self._start_duration_seconds = -1
        self._stop_time = -1
        self._stop_duration_seconds = -1
        self._clean_time = -1

        self._initialized = False
        self.service_id_factory = service_id_factory
        self.cluster_spec = Service.setup_cluster_spec(num_nodes=num_nodes, cluster_spec=cluster_spec)
        self.context = context

        self.nodes = []
        self.skip_nodes_allocation = kwargs.get("skip_nodes_allocation", False)
        if not self.skip_nodes_allocation:
            self.allocate_nodes()

        # Keep track of which nodes nodes were allocated to this service, even after nodes are freed
        # Note: only keep references to representations of the nodes, not the actual node objects themselves
        self._nodes_formerly_allocated = [str(node.account) for node in self.nodes]

        # Every time a service instance is created, it registers itself with its
        # context object. This makes it possible for external mechanisms to clean up
        # after the service if something goes wrong.
        #
        # Note: Allocate nodes *before* registering self with the service registry
        self.context.services.append(self)

        # Each service instance has its own local scratch directory on the test driver
        self._local_scratch_dir = None
        self._initialized = True

    @staticmethod
    def setup_cluster_spec(num_nodes=None, cluster_spec=None):
        if num_nodes is None:
            if cluster_spec is None:
                raise RuntimeError("You must set either num_nodes or cluster_spec.")
            else:
                return cluster_spec
        else:
            if cluster_spec is not None:
                raise RuntimeError("You must set only one of (num_nodes, cluster_spec)")
            return ClusterSpec.simple_linux(num_nodes)

    def __repr__(self):
        return "<%s: %s>" % (
            self.who_am_i(),
            "num_nodes: %d, nodes: %s" % (len(self.nodes), [n.account.hostname for n in self.nodes]),
        )

    @property
    def num_nodes(self):
        return len(self.nodes)

    @property
    def local_scratch_dir(self):
        """This local scratch directory is created/destroyed on the test driver before/after each test is run."""
        if not self._local_scratch_dir:
            self._local_scratch_dir = tempfile.mkdtemp()
        return self._local_scratch_dir

    @property
    def service_id(self):
        """Human-readable identifier (almost certainly) unique within a test run."""
        return self.service_id_factory.generate_service_id(self)

    @property
    def _order(self):
        """Index of this service instance with respect to other services of the same type registered with self.context.
        When used with a test_context, this lets the user know

        Example::

            suppose the services registered with the same context looks like
                context.services == [Zookeeper, Kafka, Zookeeper, Kafka, MirrorMaker]
            then:
                context.services[0]._order == 0  # "0th" Zookeeper instance
                context.services[2]._order == 0  # "0th" Kafka instance
                context.services[1]._order == 1  # "1st" Zookeeper instance
                context.services[3]._order == 1  # "1st" Kafka instance
                context.services[4]._order == 0  # "0th" MirrorMaker instance

        """
        if hasattr(self.context, "services"):
            same_services = [id(s) for s in self.context.services if isinstance(self, type(s))]

            if self not in self.context.services and not self._initialized:
                # It's possible that _order will be invoked in the constructor *before* self has been registered with
                # the service registry (aka self.context.services).
                return len(same_services)

            # Note: index raises ValueError if the item is not in the list
            index = same_services.index(id(self))
            return index
        else:
            return 0

    @property
    def logger(self):
        """The logger instance for this service."""
        return self.context.logger

    @property
    def cluster(self):
        """The cluster object from which this service instance gets its nodes."""
        return self.context.cluster

    @property
    def allocated(self):
        """Return True iff nodes have been allocated to this service instance."""
        return len(self.nodes) > 0

    def who_am_i(self, node=None):
        """Human-readable identifier useful for log messages."""
        if node is None:
            return self.service_id
        else:
            return "%s node %d on %s" % (
                self.service_id,
                self.idx(node),
                node.account.hostname,
            )

    def allocate_nodes(self):
        """Request resources from the cluster."""
        if self.allocated:
            raise Exception("Requesting nodes for a service that has already been allocated nodes.")

        self.logger.debug("Requesting nodes from the cluster: %s" % self.cluster_spec)

        try:
            self.nodes = self.cluster.alloc(self.cluster_spec)
        except RuntimeError as e:
            msg = str(e)
            if hasattr(self.context, "services"):
                msg += " Currently registered services: " + str(self.context.services)
            raise RuntimeError(msg)

        for idx, node in enumerate(self.nodes, 1):
            # Remote accounts utilities should log where this service logs
            if node.account._logger is not None:
                # This log message help test-writer identify which test and/or service didn't clean up after itself
                node.account.logger.critical(ConsoleDefaults.BAD_TEST_MESSAGE)
                raise RuntimeError(
                    "logger was not None on service start. There may be a concurrency issue, "
                    "or some service which isn't properly cleaning up after itself. "
                    "Service: %s, node.account: %s" % (self.__class__.__name__, str(node.account))
                )
            node.account.logger = self.logger

        self.logger.debug("Successfully allocated %d nodes to %s" % (len(self.nodes), self.who_am_i()))

    def start(self, **kwargs):
        """Start the service on all nodes."""
        self.logger.info("%s: starting service" % self.who_am_i())
        if self._start_time < 0:
            # Set self._start_time only the first time self.start is invoked
            self._start_time = time.time()

        self.logger.debug(self.who_am_i() + ": killing processes and attempting to clean up before starting")
        for node in self.nodes:
            # Added precaution - kill running processes, clean persistent files (if 'clean'=False flag passed,
            # skip cleaning), try/except for each step, since each of these steps may fail if there
            # are no processes to kill or no files to remove

            try:
                self.stop_node(node)
            except Exception:
                pass

            try:
                if kwargs.get("clean", True):
                    self.clean_node(node)
                else:
                    self.logger.debug("%s: skip cleaning node" % self.who_am_i(node))
            except Exception:
                pass

        for node in self.nodes:
            self.logger.debug("%s: starting node" % self.who_am_i(node))
            self.start_node(node, **kwargs)

        if self._start_duration_seconds < 0:
            self._start_duration_seconds = time.time() - self._start_time

    def start_node(self, node, **kwargs):
        """Start service process(es) on the given node."""
        pass

    def wait(self, timeout_sec=600):
        """Wait for the service to finish.
        This only makes sense for tasks with a fixed amount of work to do. For services that generate
        output, it is only guaranteed to be available after this call returns.
        """
        unfinished_nodes = []
        start = time.time()
        end = start + timeout_sec
        for node in self.nodes:
            now = time.time()
            node_name = self.who_am_i(node)
            if end > now:
                self.logger.debug("%s: waiting for node", node_name)
                if not self.wait_node(node, end - now):
                    unfinished_nodes.append(node_name)
            else:
                unfinished_nodes.append(node_name)

        if unfinished_nodes:
            raise TimeoutError(
                "Timed out waiting %s seconds for service nodes to finish. " % str(timeout_sec)
                + "These nodes are still alive: "
                + str(unfinished_nodes)
            )

    def wait_node(self, node, timeout_sec=None):
        """Wait for the service on the given node to finish.
        Return True if the node finished shutdown, False otherwise.
        """
        pass

    def stop(self, **kwargs):
        """Stop service processes on each node in this service.
        Subclasses must override stop_node.
        """
        self._stop_time = time.time()  # The last time stop is invoked
        self.logger.info("%s: stopping service" % self.who_am_i())
        for node in self.nodes:
            self.logger.info("%s: stopping node" % self.who_am_i(node))
            self.stop_node(node, **kwargs)

        self._stop_duration_seconds = time.time() - self._stop_time

    def stop_node(self, node, **kwargs):
        """Halt service process(es) on this node."""
        pass

    def clean(self, **kwargs):
        """Clean up persistent state on each node - e.g. logs, config files etc.
        Subclasses must override clean_node.
        """
        self._clean_time = time.time()
        self.logger.info("%s: cleaning service" % self.who_am_i())
        for node in self.nodes:
            self.logger.info("%s: cleaning node" % self.who_am_i(node))
            self.clean_node(node, **kwargs)

    def clean_node(self, node, **kwargs):
        """Clean up persistent state on this node - e.g. service logs, configuration files etc."""
        self.logger.warn(
            "%s: clean_node has not been overriden. "
            "This may be fine if the service leaves no persistent state." % self.who_am_i()
        )

    def free(self):
        """Free each node. This 'deallocates' the nodes so the cluster can assign them to other services."""
        while self.nodes:
            node = self.nodes.pop()
            self.logger.info("%s: freeing node" % self.who_am_i(node))
            node.account.logger = None
            self.cluster.free(node)

    def run(self):
        """Helper that executes run(), wait(), and stop() in sequence."""
        self.start()
        self.wait()
        self.stop()

    def get_node(self, idx):
        """ids presented externally are indexed from 1, so we provide a helper method to avoid confusion."""
        return self.nodes[idx - 1]

    def idx(self, node):
        """Return id of the given node. Return -1 if node does not belong to this service.

        idx identifies the node within this service instance (not globally).
        """
        for idx, n in enumerate(self.nodes, 1):
            if self.get_node(idx) == node:
                return idx
        return -1

    def close(self):
        """Release resources."""
        # Remove local scratch directory
        if self._local_scratch_dir and os.path.exists(self._local_scratch_dir):
            shutil.rmtree(self._local_scratch_dir)

    @staticmethod
    def run_parallel(*args):
        """Helper to run a set of services in parallel. This is useful if you want
        multiple services of different types to run concurrently, e.g. a
        producer + consumer pair.
        """
        for svc in args:
            svc.start()
        for svc in args:
            svc.wait()
        for svc in args:
            svc.stop()

    def to_json(self):
        return {
            "cls_name": self.__class__.__name__,
            "module_name": self.__module__,
            "lifecycle": {
                "init_time": self._init_time,
                "start_time": self._start_time,
                "start_duration_seconds": self._start_duration_seconds,
                "stop_time": self._stop_time,
                "stop_duration_seconds": self._stop_duration_seconds,
                "clean_time": self._clean_time,
            },
            "service_id": self.service_id,
            "nodes": self._nodes_formerly_allocated,
        }


================================================
FILE: ducktape/services/service_registry.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from collections import OrderedDict

from ducktape.jvm_logging import JVMLogger


class ServiceRegistry(object):
    def __init__(self, enable_jvm_logs=False) -> None:
        self._services: OrderedDict = OrderedDict()
        self._nodes: dict = {}

        # Initialize JVM logging if enabled
        if enable_jvm_logs:
            self._jvm_logger = JVMLogger()
        else:
            self._jvm_logger = None

    def __contains__(self, item):
        return id(item) in self._services

    def __iter__(self):
        return iter(self._services.values())

    def __repr__(self):
        return str(self._services.values())

    def append(self, service):
        # Auto-enable JVM logging for Java-based services
        if self._jvm_logger:
            self._jvm_logger.enable_for_service(service)

        self._services[id(service)] = service
        self._nodes[id(service)] = [str(n.account) for n in service.nodes]

    def to_json(self):
        return [service.to_json() for service in self._services.values()]

    def stop_all(self):
        """Stop all currently registered services in the reverse of the order in which they were added.

        Note that this does not clean up persistent state or free the nodes back to the cluster.
        """
        keyboard_interrupt = None
        for service in reversed(self._services.values()):
            try:
                service.stop()
            except BaseException as e:
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error stopping service %s: %s", service, e)

        if keyboard_interrupt is not None:
            raise keyboard_interrupt

    def clean_all(self):
        """Clean all services. This should only be called after services are stopped."""
        keyboard_interrupt = None
        for service in self._services.values():
            try:
                service.clean()
            except BaseException as e:
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error cleaning service %s: %s" % (service, e))

        if keyboard_interrupt is not None:
            raise keyboard_interrupt

    def free_all(self):
        """Release nodes back to the cluster."""
        keyboard_interrupt = None
        for service in self._services.values():
            try:
                service.free()
            except BaseException as e:
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error freeing service %s: %s" % (service, e))

        if keyboard_interrupt is not None:
            raise keyboard_interrupt
        self._services.clear()
        self._nodes.clear()

    def errors(self):
        """
        Gets a printable string containing any errors produced by the services.
        """
        return "\n\n".join(
            "{}: {}".format(service.who_am_i(), service.error)
            for service in self._services.values()
            if hasattr(service, "error") and service.error
        )


================================================
FILE: ducktape/template.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import os.path

from jinja2 import ChoiceLoader, Environment, FileSystemLoader, PackageLoader, Template

from ducktape.utils.util import package_is_installed


class TemplateRenderer(object):
    def _get_ctx(self):
        ctx = {k: getattr(self.__class__, k) for k in dir(self.__class__)}
        ctx.update(self.__dict__)
        return ctx

    def render_template(self, template, **kwargs):
        """
        Render a template using the context of the current object, optionally with overrides.

        :param template: the template to render, a Template or a str
        :param kwargs: optional override parameters
        :return: the rendered template
        """
        if not hasattr(template, "render"):
            template = Template(template)
        ctx = self._get_ctx()
        return template.render(ctx, **kwargs)

    @staticmethod
    def _package_search_path(module_name):
        """
        :param module_name: Name of a module
        :return: (package, package_search_path) where package is the package containing the module,
            and package_search_path is a path relative to the package in which to search for templates.
        """
        module_parts = module_name.split(".")
        package = module_parts[0]

        # Construct path relative to package under which "templates" would be found
        directory = ""
        for d in module_parts[1:-1]:
            directory = os.path.join(directory, d)
        return package, os.path.join(directory, "templates")

    def render(self, path, **kwargs):
        """
        Render a template loaded from a file.
        template files referenced in file f should be in a sibling directory of f called "templates".

        :param path: path, relative to the search paths, to the template file
        :param kwargs: optional override parameters
        :return: the rendered template
        """
        if not hasattr(self, "template_loader"):
            class_dir = os.path.dirname(inspect.getfile(self.__class__))

            module_name = self.__class__.__module__
            package, package_search_path = self._package_search_path(module_name)

            loaders = []
            msg = ""
            if os.path.isdir(class_dir):
                # FileSystemLoader overrides PackageLoader if the path containing this directory
                # is a valid directory. FileSystemLoader throws an error from which ChoiceLoader
                # doesn't recover if the directory is invalid
                loaders.append(FileSystemLoader(os.path.join(class_dir, "templates")))
            else:
                msg += "Will not search in %s for template files since it is not a valid directory. " % class_dir

            if package_is_installed(package):
                loaders.append(PackageLoader(package, package_search_path))
            else:
                msg += "Will not search in package %s for template files because it cannot be imported."

            if len(loaders) == 0:
                # Expect at least one of FileSystemLoader and PackageLoader to be present
                raise EnvironmentError(msg)

            self.template_loader = ChoiceLoader(loaders)
            self.template_env = Environment(loader=self.template_loader, trim_blocks=True, lstrip_blocks=True)

        template = self.template_env.get_template(path)
        return self.render_template(template, **kwargs)


================================================
FILE: ducktape/templates/report/report.css
================================================
body { 
    font-family: verdana, arial, helvetica, sans-serif; 
    font-size: 80%; 
}

table { 
    font-size: 100%;
}

h1, h2, h3, h4, h5, h6 {
    color: gray;
}

#summary_header_row {
    font-weight: bold;
    color: white;
    background-color: #777;
}

#summary_header_row th {
    border: 1px solid #777;
    padding: 2px; 
}

#summary_report_table {
    width: 80%;
    border-collapse: collapse;
    border: 1px solid #777;
}

#summary_report_table td {
    border: 1px solid #777;
    padding: 2px;
}

.pre_stack_trace {
    white-space: pre-wrap;       /* Since CSS 2.1 */
    white-space: -moz-pre-wrap;  /* Mozilla, since 1999 */
    white-space: -o-pre-wrap;    /* Opera 7 */
    word-wrap: break-word;       /* Internet Explorer 5.5+ */
}

.header_row {
    font-weight: bold;
    color: white;
    background-color: #777;
}

.header_row th {
    border: 1px solid #777;
    padding: 2px; 
}

.report_table {
    width: 80%;
    border-collapse: collapse;
    border: 1px solid #777;
}

.report_table td {
    border: 1px solid #777;
    padding: 2px;
}

.pass {
    background-color: #6c6;
}

.flaky {
    background-color: #dd2;
}

.fail {
    background-color: #c60; 
}

.ignore {
    background-color: #555;
}

.testcase { 
    margin-left: 2em;
}


================================================
FILE: ducktape/templates/report/report.html
================================================
<!DOCTYPE html>
<html>
  <head>
    <script src="https://fb.me/react-0.13.1.min.js"></script>
    <script src="https://fb.me/JSXTransformer-0.13.1.js"></script>
  </head>
  <link rel="stylesheet" href="report.css" type="text/css">
  <body>
    <div id="heading"></div>
    <div id="summary_panel"></div>
    <div id="color_key_panel"></div>
    <div id="failed_test_panel"></div>
    <div id="ignored_test_panel"></div>
    <div id="flaky_test_panel"></div>
    <div id="passed_test_panel"></div>
    <script type="text/jsx">
      /* This small block makes it possible to use React dev tools in the Chrome browser */
      if (typeof window !== 'undefined') {
        window.React = React;
      }

      var Heading = React.createClass({
        render: function() {
          return (
            <div>
              <h1>
                System Test Report
              </h1>
              <p>Test Session: {this.props.heading.session}</p>
              <p>Ducktape Version: {this.props.heading.ducktape_version}</p>
            </div>
          );
        }
      });
      
      var SummaryRow = React.createClass({
        render: function() {
          return (
            <tr>
              <td colSpan='5' align='center'>{this.props.summary_prop.expected_test_count}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.tests_run}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.passes}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.flaky}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.failures}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.ignored}</td>
              <td colSpan='5' align='center'>{this.props.summary_prop.run_time}</td>
            </tr>
          );
        }
      })
      
      var SummaryTable = React.createClass({
        render: function() {
          return (
            <table id="summary_report_table">
              <thead>
                <tr id="summary_header_row">
                  <th colSpan='5' align='center'>Tests Expected</th>
                  <th colSpan='5' align='center'>Tests Run</th>
                  <th colSpan='5' align='center'>Passes</th>
                  <th colSpan='5' align='center'>Flaky</th>
                  <th colSpan='5' align='center'>Failures</th>
                  <th colSpan='5' align='center'>Ignored</th>
                  <th colSpan='5' align='center'>Time</th>
                </tr>
              </thead>
              <tbody>
                {this.props.summary_props.map(function(summary_prop){
                  return (
                    <SummaryRow summary_prop={summary_prop} />
                  );
                }, this)}
              </tbody>
            </table>
          );
        }
      });
      
      var TestRow = React.createClass({
        render: function() {
          var className = this.props.test.test_result;
          var detailCol;
          if (className !== "ignore") {
            detailCol = <td colSpan='5' align='center'><pre><a href={this.props.test.test_log}>Detail</a></pre></td>
          } else {
            detailCol = <td colSpan='5' align='center'></td>
          }

          return (
            <tr className={className}>
              <td colSpan='5' align='center'><pre>{this.props.test.test_name}</pre></td>
              <td colSpan='5' align='center'><pre>{this.props.test.description}</pre></td>
              <td colSpan='5' align='center'><pre>{this.props.test.run_time}</pre></td>
              <td colSpan='5' align='center'><pre>{this.props.test.data}</pre></td>
              <td colSpan='5' align='center'><pre className="pre_stack_trace">{this.props.test.summary}</pre></td>
              {detailCol}
            </tr>
          );
        }
      });


      var TestTable = React.createClass({
        render: function() {
          return (
            <table className="report_table">
              <thead>
                <tr className="header_row">
                  <th colSpan='5' align='center'>Test</th>
                  <th colSpan='5' align='center'>Description</th>
                  <th colSpan='5' align='center'>Time</th>
                  <th colSpan='5' align='center'>Data</th>
                  <th colSpan='5' align='center'>Summary</th>
                  <th colSpan='5' align='center'>Detail</th>
                </tr>
              </thead>
              <tbody>
                {this.props.tests.map(function(test) {
                  return (
                    <TestRow test={test} /> 
                  );
                }, this)}
              </tbody>
            </table>
          );
        }
      });

      /* A key which shows how colors map to different test statuses. E.g. red -> fail, green -> pass, etc */
      var ColorKeyTable = React.createClass({
        render: function() {
          return (
            <table id="color_key_table">
              <tbody>
                {this.props.test_status_names.map(function(status_name) {
                  return (
                    <th colSpan='5' align='center' className={status_name}>{status_name}</th>
                  );
                }, this)}
              </tbody>
            </table>
          );
        }
      });


      ColorKeyPanel = React.createClass({
        render: function() {
          return (
            <div>
              <h3>Color Key</h3>
              <ColorKeyTable test_status_names={this.props.test_status_names}/>
            </div>
          );

        }
      });

      SummaryPanel = React.createClass({
        render: function() {
          return (
            <div>
              <h2>Summary</h2>
              <SummaryTable summary_props={this.props.summary_props}/>
            </div>
          );
        }
      });

      TestPanel = React.createClass({
        render: function() {
          return (
            <div>
              <h2>{this.props.title}</h2>
              <TestTable tests={this.props.tests}/>
            </div>
          );
        }
      });

      SUMMARY=[{
        "expected_test_count": %(expected_test_count)d,
        "tests_run": %(num_tests_run)d,
        "passes": %(num_passes)d,
        "flaky": %(num_flaky)d,
        "failures": %(num_failures)d,
        "ignored": %(num_ignored)d,
        "run_time": '%(run_time)s'
      }];
      
      HEADING={
        "ducktape_version": '%(ducktape_version)s',
        "session": '%(session)s'
      };

      COLOR_KEYS=[%(test_status_names)s];

      PASSED_TESTS=[%(passed_tests)s];
      FLAKY_TESTS=[%(flaky_tests)s];
      FAILED_TESTS=[%(failed_tests)s];
      IGNORED_TESTS=[%(ignored_tests)s];

      React.render(<Heading heading={HEADING}/>, document.getElementById('heading'));
      React.render(<ColorKeyPanel test_status_names={COLOR_KEYS}/>, document.getElementById('color_key_panel'));
      React.render(<SummaryPanel summary_props={SUMMARY}/>, document.getElementById('summary_panel'));
      React.render(<TestPanel title="Failed Tests" tests={FAILED_TESTS}/>, document.getElementById('failed_test_panel'));
      React.render(<TestPanel title="Ignored Tests" tests={IGNORED_TESTS}/>, document.getElementById('ignored_test_panel'));
      React.render(<TestPanel title="Flaky Tests" tests={FLAKY_TESTS}/>, document.getElementById('flaky_test_panel'));
      React.render(<TestPanel title="Passed Tests" tests={PASSED_TESTS}/>, document.getElementById('passed_test_panel'));
    </script>
  </body>
</html>


================================================
FILE: ducktape/tests/__init__.py
================================================
from .test import Test
from .test_context import TestContext

__all__ = ["Test", "TestContext"]


================================================
FILE: ducktape/tests/event.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import os
import time


class ClientEventFactory(object):
    """Used by test runner clients to generate events."""

    READY = "READY"  # reply: {test_metadata, cluster, session_context}
    SETTING_UP = "SETTING_UP"
    RUNNING = "RUNNING"
    TEARING_DOWN = "TEARING_DOWN"
    FINISHED = "FINISHED"
    LOG = "LOG"

    # Types of messages available
    TYPES = {READY, SETTING_UP, RUNNING, TEARING_DOWN, FINISHED, LOG}

    def __init__(self, test_id, test_index, source_id):
        self.test_id = test_id
        # id of event source
        self.test_index = test_index
        self.source_id = source_id
        self.event_id = 0

    def _event(self, event_type, payload=None):
        """Create a message object with certain base fields, and augmented by the payload.

        :param event_type: type of message this is
        :param payload: a dict containing extra fields for the message. Key names should not conflict with keys
            in the base event.
        """
        assert event_type in ClientEventFactory.TYPES, "Unknown event type"
        if payload is None:
            payload = {}

        event = {
            "test_id": self.test_id,
            "source_id": self.source_id,
            "test_index": self.test_index,
            "event_id": self.event_id,
            "event_type": event_type,
            "event_time": time.time(),
        }

        assert len(set(event.keys()).intersection(set(payload.keys()))) == 0, (
            "Payload and base event should not share keys. base event: %s, payload: %s" % (str(event), str(payload))
        )

        event.update(payload)
        self.event_id += 1
        return event

    def copy(self, event):
        """Copy constructor: return a copy of the original message, but with a unique message id."""
        new_event = copy.copy(event)
        new_event["message_id"] = self.event_id
        self.event_id += 1

        return new_event

    def running(self):
        return self._event(
            event_type=ClientEventFactory.RUNNING,
            payload={"pid": os.getpid(), "pgroup_id": os.getpgrp()},
        )

    def ready(self):
        return self._event(
            event_type=ClientEventFactory.READY,
            payload={"pid": os.getpid(), "pgroup_id": os.getpgrp()},
        )

    def setting_up(self):
        return self._event(event_type=ClientEventFactory.SETTING_UP)

    def finished(self, result):
        return self._event(event_type=ClientEventFactory.FINISHED, payload={"result": result})

    def log(self, message, level):
        return self._event(
            event_type=ClientEventFactory.LOG,
            payload={"message": message, "log_level": level},
        )


class EventResponseFactory(object):
    """Used by the test runner to create responses to events from client processes."""

    def _event_response(self, client_event, payload=None):
        if payload is None:
            payload = {}

        event_response = {
            "ack": True,
            "source_id": client_event["source_id"],
            "event_id": client_event["event_id"],
        }

        assert len(set(event_response.keys()).intersection(set(payload.keys()))) == 0, (
            "Payload and base event should not share keys. base event: %s, payload: %s"
            % (str(event_response), str(payload))
        )

        event_response.update(payload)
        return event_response

    def running(self, client_event):
        return self._event_response(client_event)

    def ready(self, client_event, session_context, test_context, cluster):
        payload = {
            "session_context": session_context,
            "test_metadata": test_context.test_metadata,
            "cluster": cluster,
        }

        return self._event_response(client_event, payload)

    def setting_up(self, client_event):
        return self._event_response(client_event)

    def finished(self, client_event):
        return self._event_response(client_event)

    def log(self, client_event):
        return self._event_response(client_event)


================================================
FILE: ducktape/tests/loader.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import glob
import importlib
import inspect
import itertools
import json
import os
import re
import sys
from logging import Logger
from operator import attrgetter
from typing import Any, Iterable, List, Optional, Set, Tuple, Union

import requests
import yaml

from ducktape.mark import parametrized
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.tests.session import SessionContext
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext


class LoaderException(Exception):
    pass


# A helper container class
ModuleAndFile = collections.namedtuple("ModuleAndFile", ["module", "file"])


DEFAULT_TEST_FILE_PATTERN = r"(^test_.*\.py$)|(^.*_test\.py$)"
DEFAULT_TEST_FUNCTION_PATTERN = "(^test.*)|(.*test$)"

# Included for unit tests to be able to add support for loading local file:/// URLs.
_requests_session = requests.session()


class TestLoader(object):
    """Class used to discover and load tests."""

    def __init__(
        self,
        session_context: SessionContext,
        logger: Logger,
        repeat: int = 1,
        injected_args: None = None,
        cluster: None = None,
        subset: int = 0,
        subsets: int = 1,
        historical_report: None = None,
    ) -> None:
        self.session_context = session_context
        self.cluster = cluster
        assert logger is not None
        self.logger = logger

        assert repeat >= 1
        self.repeat = repeat

        if subset >= subsets:
            raise ValueError("The subset to execute must be in the range [0, subsets-1]")
        self.subset = subset
        self.subsets = subsets

        self.historical_report = historical_report

        self.test_file_pattern = DEFAULT_TEST_FILE_PATTERN
        self.test_function_pattern = DEFAULT_TEST_FUNCTION_PATTERN

        # A non-None value here means the loader will override the injected_args
        # in any discovered test, whether or not it is parametrized
        self.injected_args = injected_args

    def load(self, symbols: List[str], excluded_test_symbols: None = None) -> List[TestContext]:
        """
        Discover tests specified by the symbols parameter (iterable of test symbols and/or test suite file paths).
        Skip any tests specified by excluded_test_symbols (iterable of test symbols).

        *Test symbol* is a pointer to the test or a group of tests.
        It is specified by the file/folder path or glob, optionally with Class.method after `::` :
        - `test-dir/`  - loads all tests under `test-dir`  but does NOT load test suites found under `test-dir`
        - `test-dir/prefix_*.py` - loads all files with a specified prefix
        - `/path/to/test/file.py`
        - `test/file.py::TestClass`
        - `test/file.py::TestClass.test_method`

        *Test suite* is a yaml file with the following format:
        ```
            # multiple test suites can be included:
            test_suite_name:
                # list included test symbols
                - path/to/test.py
            # optionally test suite can have included and excluded sections:
            # you may also specify a list of other suite's you wish to import
            # that will also be loaded when loading this file by using the
            # import tag.
            import:
                # list of yaml files whose suites will also run:
                - path/to/suite.yml
            another_test_suite:
                included:
                    # list of included test symbols:
                    - path/to/test-dir/prefix_*.py
                excluded:
                    # list of excluded test symbols:
                    - path/to/test-dir/prefix_excluded.py
        ```
        Each file found after parsing a symbol is checked to see if it contains a test:
        - Discover modules that 'look like' a test. By default, this means the filename is "test_*" or "*_test.py"
        - Discover test classes within each test module. A test class is a subclass of Test which is a leaf
          (i.e. it has no subclasses).
        - Discover test methods within each test class. A test method is a method containing 'test' in its name

        :param symbols: iterable that contains test symbols and/or test suite file paths.
        :param excluded_test_symbols: iterable that contains test symbols only.
        :return list of test context objects found during discovery. Note: if self.repeat is set to n, each test_context
            will appear in the list n times.
        """

        test_symbols = []
        test_suites = []
        # symbol can point to a test or a test suite
        for symbol in symbols:
            if symbol.endswith(".yml"):
                # if it ends with .yml, its a test suite, read included and excluded paths from the file
                test_suites.append(symbol)
            else:
                # otherwise add it to default suite's included list
                test_symbols.append(symbol)

        contexts_from_suites = self._load_test_suite_files(test_suites)
        contexts_from_symbols = self._load_test_contexts(test_symbols)
        all_included = contexts_from_suites.union(contexts_from_symbols)

        # excluded_test_symbols apply to both tests from suites and tests from symbols
        global_excluded = self._load_test_contexts(excluded_test_symbols)
        all_test_context_list: Union[List[TestContext], Set[TestContext]] = self._filter_excluded_test_contexts(
            all_included, global_excluded
        )

        # make sure no test is loaded twice
        all_test_context_list = self._filter_by_unique_test_id(all_test_context_list)

        # Sort to make sure we get a consistent order for when we create subsets
        all_test_context_list = sorted(all_test_context_list, key=attrgetter("test_id"))
        if not all_test_context_list:
            raise LoaderException("No tests to run!")
        self.logger.debug("Discovered these tests: " + str(all_test_context_list))
        # Select the subset of tests.
        if self.historical_report:
            # With timing info, try to pack the subsets reasonably evenly based on timing. To do so, get timing info
            # for each test (using avg as a fallback for missing data), sort in descending order, then start greedily
            # packing tests into bins based on the least full bin at the time.
            raw_results = _requests_session.get(self.historical_report).json()["results"]
            time_results = {r["test_id"]: r["run_time_seconds"] for r in raw_results}
            avg_result_time = sum(time_results.values()) / len(time_results)
            time_results = {tc.test_id: time_results.get(tc.test_id, avg_result_time) for tc in all_test_context_list}
            all_test_context_list = sorted(
                all_test_context_list,
                key=lambda x: time_results[x.test_id],
                reverse=True,
            )

            subsets = [[] for _ in range(self.subsets)]
            subsets_accumulated_time = [0] * self.subsets

            for tc in all_test_context_list:
                min_subset_idx = min(
                    range(len(subsets_accumulated_time)),
                    key=lambda i: subsets_accumulated_time[i],
                )
                subsets[min_subset_idx].append(tc.test_id)
                subsets_accumulated_time[min_subset_idx] += time_results[tc.test_id]

            subset_test_context_list = subsets[self.subset]
        else:
            # Without timing info, select every nth test instead of blocks of n to avoid groups of tests that are
            # parametrizations of the same test being grouped together since that can lead to a single, parameterized,
            # long-running test causing a very imbalanced workload across different subsets. Imbalance is still
            # possible, but much less likely using this heuristic.
            subset_test_context_list = list(itertools.islice(all_test_context_list, self.subset, None, self.subsets))

        self.logger.debug("Selected this subset of tests: " + str(subset_test_context_list))
        return subset_test_context_list * self.repeat

    def discover(
        self,
        directory: str,
        module_name: str,
        cls_name: str,
        method_name: str,
        injected_args: None = None,
    ) -> List[TestContext]:
        """Discover and unpack parametrized tests tied to the given module/class/method

        :return list of test_context objects
        """
        self.logger.debug(
            "Discovering tests at {} - {} - {} - {} - {}".format(
                directory, module_name, cls_name, method_name, injected_args
            )
        )
        # Check validity of path
        path = os.path.join(directory, module_name)
        if not os.path.exists(path):
            raise LoaderException("Path {} does not exist".format(path))

        # Recursively search path for test modules
        module_and_file = self._import_module(path)
        if module_and_file:
            # Find all tests in discovered modules and filter out any that don't match the discovery symbol
            test_context_list = self._expand_module(module_and_file)
            if len(cls_name) > 0:
                test_context_list = [t for t in test_context_list if t.cls_name == cls_name]
            if len(method_name) > 0:
                test_context_list = [t for t in test_context_list if t.function_name == method_name]
            if injected_args is not None:
                if isinstance(injected_args, List):

                    def condition(t):
                        return t.injected_args in injected_args

                else:

                    def condition(t):
                        return t.injected_args == injected_args

                test_context_list = filter(condition, test_context_list)

            listed = list(test_context_list)
            if not listed:
                self.logger.warn(
                    "No tests loaded for {} - {} - {} - {} - {}".format(
                        directory, module_name, cls_name, method_name, injected_args
                    )
                )
            return listed
        else:
            return []

    def _parse_discovery_symbol(self, discovery_symbol: str, base_dir: None = None) -> Tuple[str, str, str, None]:
        """Parse a single 'discovery symbol'

        :param discovery_symbol: a symbol used to target test(s).
            Looks like: <path/to/file_or_directory>[::<ClassName>[.method_name]]
        :return tuple of form (directory, module.py, cls_name, function_name)

        :raise LoaderException if it can't be parsed

        Examples:
            "path/to/directory" -> ("path/to/directory", "", "")
            "path/to/test_file.py" -> ("path/to/test_file.py", "", "")
            "path/to/test_file.py::ClassName.method" -> ("path/to/test_file.py", "ClassName", "method")
        """

        def divide_by_symbol(ds, symbol):
            if symbol not in ds:
                return ds, ""
            return ds.split(symbol, maxsplit=1)

        self.logger.debug("Trying to parse discovery symbol {}".format(discovery_symbol))
        if base_dir:
            discovery_symbol = os.path.join(base_dir, discovery_symbol)
        if discovery_symbol.find("::") >= 0:
            path, cls_name = divide_by_symbol(discovery_symbol, "::")
            # If the part after :: contains a dot, use it to split into class + method
            cls_name, method_name = divide_by_symbol(cls_name, ".")
            method_name, injected_args_str = divide_by_symbol(method_name, "@")

            if injected_args_str:
                if self.injected_args:
                    raise LoaderException("Cannot use both global and per-method test parameters")
                try:
                    injected_args = json.loads(injected_args_str)
                except Exception as e:
                    raise LoaderException("Invalid discovery symbol: cannot parse params: " + injected_args_str) from e
            else:
                injected_args = None
        else:
            # No "::" present in symbol
            path, cls_name, method_name, injected_args = discovery_symbol, "", "", None

        return path, cls_name, method_name, injected_args

    def _import_module(self, file_path: str) -> Optional[ModuleAndFile]:
        """Attempt to import a python module from the file path.
        Assume file_path is an absolute path ending in '.py'

        Return the imported module..

        :param file_path: file to import module from.
        :return ModuleAndFile object that contains the successfully imported module and
            the file from which it was imported
        """
        self.logger.debug("Trying to import module at path {}".format(file_path))
        if file_path[-3:] != ".py" or not os.path.isabs(file_path):
            raise Exception("Expected absolute path ending in '.py' but got " + file_path)

        # Try all possible module imports for given file
        # Strip off '.py' before splitting
        path_pieces = [piece for piece in file_path[:-3].split("/") if len(piece) > 0]
        while len(path_pieces) > 0:
            module_name = ".".join(path_pieces)
            # Try to import the current file as a module
            self.logger.debug("Trying to import module {}".format(module_name))
            try:
                module_and_file = ModuleAndFile(module=importlib.import_module(module_name), file=file_path)
                self.logger.debug("Successfully imported " + module_name)
                return module_and_file
            except Exception as e:
                # Because of the way we are searching for
                # valid modules in this loop, we expect some of the
                # module names we construct to fail to import.
                #
                # Therefore we check if the failure "looks normal", and log
                # expected failures only at debug level.
                #
                # Unexpected errors are aggressively logged, e.g. if the module
                # is valid but itself triggers an ImportError (e.g. typo in an
                # import line), or a SyntaxError.
                expected_error = False
                if isinstance(e, ImportError):
                    match = re.search(r"No module named '?([^\s\']+)'?", str(e))

                    if match is not None:
                        missing_module = match.groups()[0]

                        if missing_module in module_name:
                            expected_error = True
                        else:
                            # The error is still an expected error if missing_module is a suffix of module_name.
                            # This is because the error message may contain only a suffix
                            # of the original module_name if leftmost chunk of module_name is a legitimate
                            # module name, but the rightmost part doesn't exist.
                            #
                            # Check this by seeing if it is a "piecewise suffix" of module_name - i.e. if the parts
                            # delimited by dots match. This is a little bit stricter than just checking for a suffix
                            #
                            # E.g. "fancy.cool_module" is a piecewise suffix of "my.fancy.cool_module",
                            # but  "module" is not a piecewise suffix of "my.fancy.cool_module"
                            missing_module_pieces = missing_module.split(".")
                            expected_error = missing_module_pieces == path_pieces[-len(missing_module_pieces) :]

                if expected_error:
                    self.logger.debug(
                        "Failed to import %s. This is likely an artifact of the "
                        "ducktape module loading process: %s: %s",
                        module_name,
                        e.__class__.__name__,
                        e,
                    )
                else:
                    self.logger.error(
                        "Failed to import %s, which may indicate a broken test that cannot be loaded: %s: %s",
                        module_name,
                        e.__class__.__name__,
                        e,
                    )
            finally:
                path_pieces = path_pieces[1:]

        self.logger.debug("Unable to import %s" % file_path)
        return None

    def _expand_module(self, module_and_file: ModuleAndFile) -> List[TestContext]:
        """Return a list of TestContext objects, one object for every 'testable unit' in module"""

        test_context_list = []
        module = module_and_file.module
        file_name = module_and_file.file
        module_objects = module.__dict__.values()
        test_classes = [c for c in module_objects if self._is_test_class(c)]

        for cls in test_classes:
            test_context_list.extend(
                self._expand_class(
                    TestContext(
                        session_context=self.session_context,
                        cluster=self.cluster,
                        module=module.__name__,
                        cls=cls,
                        file=file_name,
                    )
                )
            )

        return test_context_list

    def _expand_class(self, t_ctx: TestContext) -> List[TestContext]:
        """Return a list of TestContext objects, one object for each method in t_ctx.cls"""
        test_methods = []
        for f_name in dir(t_ctx.cls):
            f = getattr(t_ctx.cls, f_name)
            if self._is_test_function(f):
                test_methods.append(f)

        test_context_list = []
        for f in test_methods:
            t = t_ctx.copy(function=f)
            test_context_list.extend(self._expand_function(t))
        return test_context_list

    def _expand_function(self, t_ctx: TestContext) -> List[TestContext]:
        expander = MarkedFunctionExpander(
            t_ctx.session_context,
            t_ctx.module,
            t_ctx.cls,
            t_ctx.function,
            t_ctx.file,
            t_ctx.cluster,
        )
        return expander.expand(self.injected_args)

    def _find_test_files(self, path_or_glob):
        """
        Return a list of files at the specified path (or glob) that look like test files.

        - Globs are not recursive, so ** is not supported.
        - However, if the glob matches a folder (or is not a glob but simply a folder path),
            we will load all tests in that folder and recursively search the sub folders.

        :param path_or_glob: path to a test file, folder with test files or a glob that expands to folders and files
        :return: list of absolute paths to test files
        """
        test_files = []
        self.logger.debug("Looking for test files in {}".format(path_or_glob))
        # glob is safe to be called on non-glob path - it would just return that same path wrapped in a list
        expanded_glob = glob.glob(path_or_glob)
        self.logger.debug("Expanded {} into {}".format(path_or_glob, expanded_glob))

        def maybe_add_test_file(f):
            if self._is_test_file(f):
                test_files.append(f)
            else:
                self.logger.debug("Skipping {} because it isn't a test file".format(f))

        for path in expanded_glob:
            if not os.path.exists(path):
                raise LoaderException("Path {} does not exist".format(path))
            self.logger.debug("Checking {}".format(path))
            if os.path.isfile(path):
                maybe_add_test_file(path)
            elif os.path.isdir(path):
                for pwd, dirs, files in os.walk(path):
                    if "__init__.py" not in files:
                        # Not a package - ignore this directory
                        continue
                    for f in files:
                        file_path = os.path.abspath(os.path.join(pwd, f))
                        maybe_add_test_file(file_path)
            else:
                raise LoaderException("Got a path that we don't understand: " + path)

        return test_files

    def _is_test_file(self, file_name):
        """By default, a test file looks like test_*.py or *_test.py"""
        return re.match(self.test_file_pattern, os.path.basename(file_name)) is not None

    def _is_test_class(self, obj: Any) -> bool:
        """An object is a test class if it's a leafy subclass of Test."""
        return inspect.isclass(obj) and issubclass(obj, Test) and len(obj.__subclasses__()) == 0

    def _is_test_function(self, function: Any) -> bool:
        """A test function looks like a test and is callable (or expandable)."""
        if function is None:
            return False

        if not parametrized(function) and not callable(function):
            return False

        return re.match(self.test_function_pattern, function.__name__) is not None

    def _load_test_suite_files(self, test_suite_files: List[Any]) -> Set[Any]:
        suites = list()

        suites.extend(self._read_test_suite_from_file(test_suite_files))

        all_contexts = set()
        for suite in suites:
            all_contexts.update(self._load_test_suite(**suite))
        return all_contexts

    def _load_file(self, suite_file_path):
        if not os.path.exists(suite_file_path):
            raise LoaderException(f"Path {suite_file_path} does not exist")
        if not os.path.isfile(suite_file_path):
            raise LoaderException(f"{suite_file_path} is not a file, so it cannot be a test suite")

        with open(suite_file_path) as fp:
            try:
                file_content = yaml.load(fp, Loader=yaml.FullLoader)
            except Exception as e:
                raise LoaderException("Failed to load test suite from file: " + suite_file_path, e)

        if not file_content:
            raise LoaderException("Test suite file is empty: " + suite_file_path)
        if not isinstance(file_content, dict):
            raise LoaderException("Malformed test suite file: " + suite_file_path)

        for suite_name, suite_content in file_content.items():
            if not suite_content:
                raise LoaderException("Empty test suite " + suite_name + " in " + suite_file_path)
        return file_content

    def _load_suites(self, file_path, file_content):
        suites = []
        for suite_name, suite_content in file_content.items():
            if not suite_content:
                raise LoaderException(f"Empty test suite {suite_name} in {file_path}")

            # if test suite is just a list of paths, those are included paths
            # otherwise, expect separate sections for included and excluded
            if isinstance(suite_content, list):
                included_paths = suite_content
                excluded_paths = None
            elif isinstance(suite_content, dict):
                included_paths = suite_content.get("included")
                excluded_paths = suite_content.get("excluded")
            else:
                raise LoaderException(f"Malformed test suite {suite_name} in {file_path}")
            suites.append(
                {
                    "name": suite_name,
                    "included": included_paths,
                    "excluded": excluded_paths,
                    "base_dir": os.path.dirname(file_path),
                }
            )
        return suites

    def _read_test_suite_from_file(self, root_suite_file_paths: List[Any]) -> List[Any]:
        root_suite_file_paths = [os.path.abspath(file_path) for file_path in root_suite_file_paths]
        files = {file: self._load_file(file) for file in root_suite_file_paths}
        stack = root_suite_file_paths

        # load all files
        while len(stack) != 0:
            curr = stack.pop()
            loaded = files[curr]
            if "import" in loaded:
                if isinstance(loaded["import"], str):
                    loaded["import"] = [loaded["import"]]
                directory = os.path.dirname(curr)
                # apply path of current file to the files inside
                abs_file_iter = (os.path.abspath(os.path.join(directory, file)) for file in loaded.get("import", []))
                imported = [file for file in abs_file_iter if file not in files]
                for file in imported:
                    files[file] = self._load_file(file)
                stack.extend(imported)
                del files[curr]["import"]

        # load all suites from all loaded files
        suites = []
        for file_name, file_context in files.items():
            suites.extend(self._load_suites(file_name, file_context))

        return suites

    def _load_test_suite(self, **kwargs):
        name = kwargs["name"]
        included = kwargs["included"]
        excluded = kwargs.get("excluded")
        base_dir = kwargs.get("base_dir")
        excluded_contexts = self._load_test_contexts(excluded, base_dir=base_dir)
        included_contexts = self._load_test_contexts(included, base_dir=base_dir)

        self.logger.debug("Including tests: " + str(included_contexts))
        self.logger.debug("Excluding tests: " + str(excluded_contexts))

        # filter out any excluded test from the included tests set
        all_test_context_list = self._filter_excluded_test_contexts(included_contexts, excluded_contexts)
        if not all_test_context_list:
            raise LoaderException("No tests found in  " + name)

        return all_test_context_list

    def _load_test_contexts(
        self, test_discovery_symbols: Optional[List[str]], base_dir: None = None
    ) -> Set[TestContext]:
        """
        Load all test_context objects found in test_discovery_symbols.
        Each test discovery symbol is a dir or file path, optionally with with a ::Class or ::Class.method specified.

        :param test_discovery_symbols: list of test symbols to look into
        :return: List of test_context objects discovered by checking test_discovery_symbols (may be empty if none were
            discovered)
        """
        if not test_discovery_symbols:
            return set()
        if not isinstance(test_discovery_symbols, list):
            raise LoaderException("Expected test_discovery_symbols to be a list.")
        all_test_context_list = set()
        for symbol in test_discovery_symbols:
            (
                path_or_glob,
                cls_name,
                method,
                injected_args,
            ) = self._parse_discovery_symbol(symbol, base_dir)
            self.logger.debug(
                "Parsed symbol into {} - {} - {} - {}".format(path_or_glob, cls_name, method, injected_args)
            )
            path_or_glob = os.path.abspath(path_or_glob)

            # TODO: consider adding a check to ensure glob or dir is not used together with cls_name and method
            test_files = []
            if os.path.isfile(path_or_glob):
                # if it is a single file, just add it directly - https://github.com/confluentinc/ducktape/issues/284
                test_files = [path_or_glob]
            else:
                # otherwise, when dealing with a dir or a glob, apply pattern matching rules
                test_files = self._find_test_files(path_or_glob)

            self._add_top_level_dirs_to_sys_path(test_files)

            for test_file in test_files:
                directory = os.path.dirname(test_file)
                module_name = os.path.basename(test_file)
                test_context_list_for_file = self.discover(
                    directory,
                    module_name,
                    cls_name,
                    method,
                    injected_args=injected_args,
                )
                all_test_context_list.update(test_context_list_for_file)
                if len(test_context_list_for_file) == 0:
                    self.logger.warn("Didn't find any tests in %s " % test_file)

        return all_test_context_list

    def _filter_by_unique_test_id(self, contexts: Iterable[TestContext]) -> List[TestContext]:
        contexts_dict = dict()
        for context in contexts:
            if context.test_id not in contexts_dict:
                contexts_dict[context.test_id] = context
        return list(contexts_dict.values())

    def _filter_excluded_test_contexts(
        self, included_contexts: Iterable[TestContext], excluded_contexts: Set[Any]
    ) -> Set[TestContext]:
        excluded_test_ids = {ctx.test_id for ctx in excluded_contexts}
        return {ctx for ctx in included_contexts if ctx.test_id not in excluded_test_ids}

    def _add_top_level_dirs_to_sys_path(self, test_files: List[str]) -> None:
        seen_dirs = set()
        for path in test_files:
            dir = os.path.dirname(path)
            while os.path.exists(os.path.join(dir, "__init__.py")):
                dir = os.path.dirname(dir)
            if dir not in seen_dirs:
                sys.path.append(dir)
                seen_dirs.add(dir)


================================================
FILE: ducktape/tests/loggermaker.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging


class LoggerMaker(object):
    """This class helps ensure programmatically configured loggers are configured only once."""

    def __init__(self, logger_name: str) -> None:
        self.logger_name = logger_name

    @property
    def logger(self) -> logging.Logger:
        """Read-only logger attribute."""
        if not hasattr(self, "_logger"):
            self._logger = logging.getLogger(self.logger_name)

        if not self.configured:
            self.configure_logger()

        return self._logger

    @property
    def configured(self) -> bool:
        """Return True iff the logger has been configured.

        Since logging objects are global in the sense that logging.getLogger(self.logger_name) yields the same
        object, we assume it is already configured if it already has at least 1 handler.
        """
        return len(logging.getLogger(self.logger_name).handlers) > 0

    def configure_logger(self):
        raise NotImplementedError("configure_logger property must be implemented by a subclass")


def close_logger(logger):
    """Filehandles etc are not closed automatically, so close them here"""
    if logger is not None:
        handlers = logger.handlers[:]
        for handler in handlers:
            handler.close()
            logger.removeHandler(handler)


================================================
FILE: ducktape/tests/reporter.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import json
import os
import shutil
import sys
import xml.etree.ElementTree as ET
from pathlib import Path

import yaml

from ducktape.json_serializable import DucktapeJSONEncoder
from ducktape.tests.status import FAIL, FLAKY, IGNORE, PASS
from ducktape.utils.terminal_size import get_terminal_size
from ducktape.utils.util import ducktape_version

DEFAULT_SEPARATOR_WIDTH = 100


def format_time(t):
    """Return human-readable interval of time.
    Assumes t is in units of seconds.
    """
    minutes = int(t / 60)
    seconds = t % 60

    r = ""
    if minutes > 0:
        r += "%d minute%s " % (minutes, "" if minutes == 1 else "s")
    r += "%.3f seconds" % seconds
    return r


class SingleResultReporter(object):
    """Helper class for creating a view of results from a single test."""

    def __init__(self, result):
        self.result = result
        self.width = get_terminal_size()[0]

    def result_string(self):
        """Stringify single result"""
        result_lines = [
            "test_id:    %s" % self.result.test_id,
            "status:     %s" % str(self.result.test_status).upper(),
            "run time:   %s" % format_time(self.result.run_time_seconds),
        ]

        if self.result.test_status == FAIL:
            # Add summary if the test failed
            result_lines.append("\n")
            result_lines.append("    " + self.result.summary)

        if self.result.data is not None:
            result_lines.append(json.dumps(self.result.data))

        return "\n".join(result_lines)

    def report_string(self):
        """Get the whole report string."""
        return "\n".join(["=" * self.width, self.result_string()])


class SingleResultFileReporter(SingleResultReporter):
    def report(self):
        self.width = DEFAULT_SEPARATOR_WIDTH
        report_file = os.path.join(self.result.results_dir, "report.txt")
        with open(report_file, "w") as fp:
            fp.write(self.report_string())

        # write collected data
        if self.result.data is not None:
            data_file = os.path.join(self.result.results_dir, "data.json")
            with open(data_file, "w") as fp:
                fp.write(json.dumps(self.result.data))


class SummaryReporter(object):
    def __init__(self, results):
        self.results = results
        self.width = get_terminal_size()[0]

    def report(self):
        raise NotImplementedError("method report must be implemented by subclasses of SummaryReporter")


class SimpleSummaryReporter(SummaryReporter):
    def header_string(self):
        """Header lines of the report"""
        header_lines = [
            "=" * self.width,
            "SESSION REPORT (ALL TESTS)",
            "ducktape version: %s" % ducktape_version(),
            "session_id:       %s" % self.results.session_context.session_id,
            "run time:         %s" % format_time(self.results.run_time_seconds),
            "tests run:        %d" % len(self.results),
            "passed:           %d" % self.results.num_passed,
            "flaky:            %d" % self.results.num_flaky,
            "failed:           %d" % self.results.num_failed,
            "ignored:          %d" % self.results.num_ignored,
            "=" * self.width,
        ]

        return "\n".join(header_lines)

    def report_string(self):
        """Get the whole report string."""
        report_lines = [self.header_string()]

        report_lines.extend(
            [SingleResultReporter(result).result_string() + "\n" + "-" * self.width for result in self.results]
        )

        return "\n".join(report_lines)


class SimpleFileSummaryReporter(SimpleSummaryReporter):
    def report(self):
        self.width = DEFAULT_SEPARATOR_WIDTH
        report_file = os.path.join(self.results.session_context.results_dir, "report.txt")
        with open(report_file, "w") as fp:
            fp.write(self.report_string())


class SimpleStdoutSummaryReporter(SimpleSummaryReporter):
    def report(self):
        print(self.report_string())


class JSONReporter(object):
    def __init__(self, results):
        self.results = results

    def report(self):
        report_file = os.path.abspath(os.path.join(self.results.session_context.results_dir, "report.json"))
        with open(report_file, "w") as f:
            f.write(
                json.dumps(
                    self.results,
                    cls=DucktapeJSONEncoder,
                    sort_keys=True,
                    indent=2,
                    separators=(",", ": "),
                )
            )


class JUnitReporter(object):
    def __init__(self, results):
        self.results = results

    def report(self):
        report_file = os.path.abspath(os.path.join(self.results.session_context.results_dir, "report.xml"))
        testsuites = {}

        # First bucket by module_name and argregate counts
        for result in self.results:
            module_name = result.module_name
            testsuites.setdefault(module_name, {})
            # Set default values
            testsuite = testsuites[module_name]
            testsuite.setdefault("tests", 0)
            testsuite.setdefault("skipped", 0)
            testsuite.setdefault("failures", 0)
            testsuite.setdefault("errors", 0)
            testsuite.setdefault("testcases", []).append(result)

            # Always increment total number of tests
            testsuite["tests"] += 1
            if result.test_status == FAIL:
                testsuite["failures"] += 1
            elif result.test_status == IGNORE:
                testsuite["skipped"] += 1

        total = self.results.num_failed + self.results.num_ignored + self.results.num_passed + self.results.num_flaky
        # Now start building XML document
        root = ET.Element(
            "testsuites",
            attrib=dict(
                name="ducktape",
                time=str(self.results.run_time_seconds),
                tests=str(total),
                disabled="0",
                errors="0",
                failures=str(self.results.num_failed),
            ),
        )
        for module_name, testsuite in testsuites.items():
            xml_testsuite = ET.SubElement(
                root,
                "testsuite",
                attrib=dict(
                    name=module_name,
                    tests=str(testsuite["tests"]),
                    disabled="0",
                    errors="0",
                    failures=str(testsuite["failures"]),
                    skipped=str(testsuite["skipped"]),
                ),
            )
            for test in testsuite["testcases"]:
                # Since we're already aware of module_name and cls_name, strip that prefix off
                full_name = "{module_name}.{cls_name}.".format(module_name=module_name, cls_name=test.cls_name)
                if test.test_id.startswith(full_name):
                    name = test.test_id[len(full_name) :]
                else:
                    name = test.test_id
                xml_testcase = ET.SubElement(
                    xml_testsuite,
                    "testcase",
                    attrib=dict(
                        name=name,
                        classname=test.cls_name,
                        time=str(test.run_time_seconds),
                        status=str(test.test_status),
                        assertions="",
                    ),
                )
                if test.test_status == FAIL:
                    xml_failure = ET.SubElement(
                        xml_testcase,
                        "failure",
                        attrib=dict(message=test.summary.splitlines()[0]),
                    )
                    xml_failure.text = test.summary
                elif test.test_status == IGNORE:
                    ET.SubElement(xml_testcase, "skipped")

        with open(report_file, "w") as f:
            content = ET.tostring(root)
            if isinstance(content, bytes):
                content = content.decode("utf-8")
            f.write(content)


class HTMLSummaryReporter(SummaryReporter):
    def __init__(self, results, expected_test_count):
        super().__init__(results)
        self.expected_test_count = expected_test_count

    def format_test_name(self, result):
        lines = [
            "Module:      " + result.module_name,
            "Class:       " + result.cls_name,
            "Method:      " + result.function_name,
            f"Nodes (used/allocated): {result.nodes_used}/{result.nodes_allocated}",
        ]

        if result.injected_args is not None:
            lines.append("Arguments:")
            lines.append(
                json.dumps(
                    result.injected_args,
                    sort_keys=True,
                    indent=2,
                    separators=(",", ": "),
                )
            )

        return "\n".join(lines)

    def format_result(self, result):
        test_result = str(result.test_status).lower()

        result_json = {
            "test_name": self.format_test_name(result),
            "test_result": test_result,
            "description": result.description,
            "run_time": format_time(result.run_time_seconds),
            "data": ""
            if result.data is None
            else json.dumps(result.data, sort_keys=True, indent=2, separators=(",", ": ")),
            "summary": result.summary,
            "test_log": self.test_results_dir(result),
        }
        return result_json

    def test_results_dir(self, result):
        """Return *relative path* to test results directory.

        Path is relative to the base results_dir. Relative path behaves better if the results directory is copied,
        moved etc.
        """
        base_dir = os.path.abspath(result.session_context.results_dir)
        base_dir = os.path.join(base_dir, "")  # Ensure trailing directory indicator

        test_results_dir = os.path.abspath(result.results_dir)
        return test_results_dir[len(base_dir) :]  # truncate the "absolute" portion

    def format_report(self):
        if sys.version_info >= (3, 9):
            import importlib.resources as importlib_resources

            template = importlib_resources.files("ducktape").joinpath("templates/report/report.html").read_text("utf-8")
        else:
            import pkg_resources

            template = pkg_resources.resource_string(__name__, "../templates/report/report.html").decode("utf-8")

        num_tests_run = len(self.results)
        num_passes = 0
        failed_result_string = []
        passed_result_string = []
        ignored_result_string = []
        flaky_result_string = []

        for result in self.results:
            json_string = json.dumps(self.format_result(result))
            if result.test_status == PASS:
                num_passes += 1
                passed_result_string.append(json_string)
                passed_result_string.append(",")
            elif result.test_status == FAIL:
                failed_result_string.append(json_string)
                failed_result_string.append(",")
            elif result.test_status == IGNORE:
                ignored_result_string.append(json_string)
                ignored_result_string.append(",")
            elif result.test_status == FLAKY:
                flaky_result_string.append(json_string)
                flaky_result_string.append(",")
            else:
                raise Exception("Unknown test status in report: {}".format(result.test_status.to_json()))

        args = {
            "ducktape_version": ducktape_version(),
            "expected_test_count": self.expected_test_count,
            "num_tests_run": num_tests_run,
            "num_passes": self.results.num_passed,
            "num_flaky": self.results.num_flaky,
            "num_failures": self.results.num_failed,
            "num_ignored": self.results.num_ignored,
            "run_time": format_time(self.results.run_time_seconds),
            "session": self.results.session_context.session_id,
            "passed_tests": "".join(passed_result_string),
            "flaky_tests": "".join(flaky_result_string),
            "failed_tests": "".join(failed_result_string),
            "ignored_tests": "".join(ignored_result_string),
            "test_status_names": ",".join(f"'{status}'" for status in (PASS, FAIL, IGNORE, FLAKY)),
        }

        html = template % args
        report_html = os.path.join(self.results.session_context.results_dir, "report.html")
        with open(report_html, "w") as fp:
            fp.write(html)
            fp.close()

        report_css = os.path.join(self.results.session_context.results_dir, "report.css")

        if sys.version_info >= (3, 9):
            import importlib.resources as importlib_resources

            with importlib_resources.as_file(
                importlib_resources.files("ducktape") / "templates/report/report.css"
            ) as report_css_origin:
                shutil.copy2(report_css_origin, report_css)
        else:
            import pkg_resources

            report_css_origin = pkg_resources.resource_filename(__name__, "../templates/report/report.css")
            shutil.copy2(report_css_origin, report_css)

    def report(self):
        self.format_report()


class FailedTestSymbolReporter(SummaryReporter):
    def __init__(self, results):
        super().__init__(results)
        self.working_dir = Path().absolute()
        self.separator = "=" * self.width

    def to_symbol(self, result):
        p = Path(result.file_name).relative_to(self.working_dir)
        line = f"{p}::{result.cls_name}.{result.function_name}"
        if result.injected_args:
            injected_args_str = json.dumps(result.injected_args, separators=(",", ":"))
            line += f"@{injected_args_str}"
        return line

    def dump_test_suite(self, lines):
        print(self.separator)
        print("FAILED TEST SUITE")
        suite = {self.results.session_context.session_id: lines}
        file_path = Path(self.results.session_context.results_dir) / "rerun-failed.yml"
        with file_path.open("w") as fp:
            print(f"Test suite to rerun failed tests: {file_path}")
            yaml.dump(suite, stream=fp, indent=4)

    def print_test_symbols_string(self, lines):
        print(self.separator)
        print("FAILED TEST SYMBOLS")
        print("Pass the test symbols below to your ducktape run")
        # quote the symbol because json parameters will be processed by shell otherwise, making it not copy-pasteable
        print(" ".join([f"'{line}'" for line in lines]))

    def report(self):
        symbols = [self.to_symbol(result) for result in self.results if result.test_status == FAIL]
        if not symbols:
            return

        self.dump_test_suite(symbols)
        self.print_test_symbols_string(symbols)


================================================
FILE: ducktape/tests/result.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import time
from typing import List

from ducktape.cluster.cluster import Cluster
from ducktape.cluster.vagrant import VagrantCluster
from ducktape.json_serializable import DucktapeJSONEncoder
from ducktape.tests.reporter import SingleResultFileReporter
from ducktape.tests.session import SessionContext
from ducktape.tests.status import FAIL, FLAKY, IGNORE, PASS
from ducktape.tests.test_context import TestContext
from ducktape.utils.local_filesystem_utils import mkdir_p
from ducktape.utils.util import ducktape_version


class TestResult(object):
    """Wrapper class for a single result returned by a single test."""

    def __init__(
        self,
        test_context,
        test_index,
        session_context,
        test_status=PASS,
        summary="",
        data=None,
        start_time=-1,
        stop_time=-1,
    ):
        """
        @param test_context  standard test context object
        @param test_status   did the test pass or fail, etc?
        @param summary       summary information
        @param data          data returned by the test, e.g. throughput
        """
        self.nodes_allocated = len(test_context.cluster)
        self.nodes_used = test_context.cluster.max_used_nodes

        # Collect node_type from cluster metadata if available
        self.node_type = test_context.cluster_use_metadata.get("node_type")

        if hasattr(test_context, "services"):
            self.services = test_context.services.to_json()
        else:
            self.services = {}

        self.test_id = test_context.test_id
        self.module_name = test_context.module_name
        self.cls_name = test_context.cls_name
        self.function_name = test_context.function_name
        self.injected_args = test_context.injected_args
        self.description = test_context.description
        self.results_dir = TestContext.results_dir(test_context, test_index)

        self.test_index = test_index

        self.session_context = session_context
        self.test_status = test_status
        self.summary = summary
        self.data = data
        self.file_name = test_context.file

        self.base_results_dir = session_context.results_dir
        if not self.results_dir.endswith(os.path.sep):
            self.results_dir += os.path.sep
        if not self.base_results_dir.endswith(os.path.sep):
            self.base_results_dir += os.path.sep
        assert self.results_dir.startswith(self.base_results_dir)
        self.relative_results_dir = self.results_dir[len(self.base_results_dir) :]

        # For tracking run time
        self.start_time = start_time
        self.stop_time = stop_time

    def __repr__(self):
        return "<%s - test_status:%s, data:%s>" % (
            self.__class__.__name__,
            self.test_status,
            str(self.data),
        )

    @property
    def run_time_seconds(self):
        if self.start_time < 0:
            return -1
        if self.stop_time < 0:
            return time.time() - self.start_time

        return self.stop_time - self.start_time

    def report(self):
        if not os.path.exists(self.results_dir):
            mkdir_p(self.results_dir)

        self.dump_json()
        test_reporter = SingleResultFileReporter(self)
        test_reporter.report()

    def dump_json(self):
        """Dump this object as json to the given location. By default, dump into self.results_dir/report.json"""
        with open(os.path.join(self.results_dir, "report.json"), "w") as fd:
            json.dump(self, fd, cls=DucktapeJSONEncoder, sort_keys=True, indent=2)

    def to_json(self):
        result = {
            "test_id": self.test_id,
            "module_name": self.module_name,
            "cls_name": self.cls_name,
            "function_name": self.function_name,
            "injected_args": self.injected_args,
            "description": self.description,
            "results_dir": self.results_dir,
            "relative_results_dir": self.relative_results_dir,
            "base_results_dir": self.base_results_dir,
            "test_status": self.test_status,
            "summary": self.summary,
            "data": self.data,
            "start_time": self.start_time,
            "stop_time": self.stop_time,
            "run_time_seconds": self.run_time_seconds,
            "nodes_allocated": self.nodes_allocated,
            "nodes_used": self.nodes_used,
            "services": self.services,
        }

        # Only include node_type if it was specified in the @cluster decorator
        if self.node_type is not None:
            result["node_type"] = self.node_type

        return result


class TestResults(object):
    """Class used to aggregate individual TestResult objects from many tests."""

    def __init__(self, session_context: SessionContext, cluster: VagrantCluster, client_status: dict) -> None:
        """
        :type session_context: ducktape.tests.session.SessionContext
        """
        self._results: List[TestResult] = []
        self.session_context: SessionContext = session_context
        self.cluster: Cluster = cluster

        # For tracking total run time
        self.start_time: int = -1
        self.stop_time: int = -1
        self.client_status = client_status

    def append(self, obj: TestResult):
        return self._results.append(obj)

    def __len__(self):
        return len(self._results)

    def __iter__(self):
        return iter(self._results)

    @property
    def num_passed(self):
        return len([r for r in self._results if r.test_status == PASS])

    @property
    def num_failed(self):
        return len([r for r in self._results if r.test_status == FAIL])

    @property
    def num_ignored(self):
        return len([r for r in self._results if r.test_status == IGNORE])

    @property
    def num_flaky(self):
        return len([r for r in self._results if r.test_status == FLAKY])

    @property
    def run_time_seconds(self):
        if self.start_time < 0:
            return -1
        if self.stop_time < 0:
            self.stop_time = time.time()

        return self.stop_time - self.start_time

    def get_aggregate_success(self):
        """Check cumulative success of all tests run so far
        :rtype: bool
        """
        for result in self._results:
            if result.test_status == FAIL:
                return False
        return True

    def _stats(self, num_list):
        if len(num_list) == 0:
            return {"mean": None, "min": None, "max": None}

        return {
            "mean": sum(num_list) / float(len(num_list)),
            "min": min(num_list),
            "max": max(num_list),
        }

    def to_json(self):
        if self.run_time_seconds == 0 or len(self.cluster) == 0:
            # If things go horribly wrong, the test run may be effectively instantaneous
            # Let's handle this case gracefully, and avoid divide-by-zero
            cluster_utilization = 0
            parallelism = 0
        else:
            cluster_utilization = (
                (1.0 / len(self.cluster))
                * (1.0 / self.run_time_seconds)
                * sum([r.nodes_used * r.run_time_seconds for r in self])
            )
            parallelism = sum([r.run_time_seconds for r in self._results]) / self.run_time_seconds
        result = {
            "ducktape_version": ducktape_version(),
            "session_context": self.session_context,
            "run_time_seconds": self.run_time_seconds,
            "start_time": self.start_time,
            "stop_time": self.stop_time,
            "run_time_statistics": self._stats([r.run_time_seconds for r in self]),
            "cluster_nodes_used": self._stats([r.nodes_used for r in self]),
            "cluster_nodes_allocated": self._stats([r.nodes_allocated for r in self]),
            "cluster_utilization": cluster_utilization,
            "cluster_num_nodes": len(self.cluster),
            "num_passed": self.num_passed,
            "num_failed": self.num_failed,
            "num_ignored": self.num_ignored,
            "parallelism": parallelism,
            "client_status": {str(key): value for key, value in self.client_status.items()},
            "results": [r for r in self._results],
        }
        if self.num_flaky:
            result["num_flaky"] = self.num_flaky
        return result


================================================
FILE: ducktape/tests/runner.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from collections import namedtuple, defaultdict
import copy
import logging
import multiprocessing
import os
import signal
import time
import traceback
from typing import Dict, List

import zmq

from ducktape.cluster.finite_subcluster import FiniteSubcluster
from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.cluster.vagrant import VagrantCluster
from ducktape.cluster.node_container import InsufficientResourcesError
from ducktape.tests.scheduler import TestScheduler
from ducktape.tests.result import FAIL, TestResult
from ducktape.tests.reporter import (
    SimpleFileSummaryReporter,
    HTMLSummaryReporter,
    JSONReporter,
    JUnitReporter,
)
from ducktape.utils import persistence
from ducktape.errors import TimeoutError
from ducktape.tests.event import ClientEventFactory, EventResponseFactory
from ducktape.tests.result import TestResults
from ducktape.tests.runner_client import run_client
from ducktape.tests.serde import SerDe
from ducktape.tests.session import SessionContext
from ducktape.tests.test_context import TestContext
from ducktape.utils.terminal_size import get_terminal_size

DEFAULT_MP_JOIN_TIMEOUT = 30
DEFAULT_TIMEOUT_EXCEPTION_JOIN_TIMEOUT = 120


class Receiver(object):
    def __init__(self, min_port: int, max_port: int) -> None:
        assert min_port <= max_port, "Expected min_port <= max_port, but instead: min_port: %s, max_port %s" % (
            min_port,
            max_port,
        )
        self.port = None
        self.min_port = min_port
        self.max_port = max_port

        self.serde = SerDe()

        self.zmq_context = zmq.Context()
        self.socket = self.zmq_context.socket(zmq.REP)

    def start(self):
        """Bind to a random port in the range [self.min_port, self.max_port], inclusive"""
        # note: bind_to_random_port may retry the same port multiple times
        self.port = self.socket.bind_to_random_port(
            addr="tcp://*",
            min_port=self.min_port,
            max_port=self.max_port + 1,
            max_tries=2 * (self.max_port + 1 - self.min_port),
        )

    def recv(self, timeout=1800000):
        if timeout is None:
            # use default value of 1800000 or 30 minutes
            timeout = 1800000
        self.socket.RCVTIMEO = timeout
        try:
            message = self.socket.recv()
        except zmq.Again:
            raise TimeoutError("runner client unresponsive")
        return self.serde.deserialize(message)

    def send(self, event):
        self.socket.send(self.serde.serialize(event))

    def close(self):
        self.socket.setsockopt(zmq.LINGER, 0)
        self.socket.close()


TestKey = namedtuple("TestKey", ["test_id", "test_index"])


class TestRunner(object):
    # When set to True, the test runner will finish running/cleaning the current test, but it will not run any more
    stop_testing = False

    def __init__(
        self,
        cluster: VagrantCluster,
        session_context: SessionContext,
        session_logger: logging.Logger,
        tests: List[TestContext],
        deflake_num: int,
        min_port: int = ConsoleDefaults.TEST_DRIVER_MIN_PORT,
        max_port: int = ConsoleDefaults.TEST_DRIVER_MAX_PORT,
        finish_join_timeout: int = DEFAULT_MP_JOIN_TIMEOUT,
        timeout_exception_join_timeout: int = DEFAULT_TIMEOUT_EXCEPTION_JOIN_TIMEOUT,
    ) -> None:
        # Set handler for SIGTERM (aka kill -15)
        # Note: it doesn't work to set a handler for SIGINT (Ctrl-C) in this parent process because the
        # handler is inherited by all forked child processes, and it prevents the default python behavior
        # of translating SIGINT into a KeyboardInterrupt exception
        signal.signal(signal.SIGTERM, self._propagate_sigterm)

        # session_logger, message logger,
        self.session_logger = session_logger
        self.cluster = cluster
        self.event_response = EventResponseFactory()
        self.hostname = "localhost"
        self.receiver = Receiver(min_port, max_port)

        self.deflake_num = deflake_num

        self.session_context = session_context
        self.max_parallel = session_context.max_parallel
        self.client_report = defaultdict(dict)
        self.results = TestResults(self.session_context, self.cluster, client_status=self.client_report)

        self.exit_first = self.session_context.exit_first

        self.main_process_pid = os.getpid()
        self.scheduler = TestScheduler(tests, self.cluster)

        self.test_counter = 1
        self.total_tests = len(self.scheduler)
        # This immutable dict tracks test_id -> test_context
        self._test_context = persistence.make_dict(**{t.test_id: t for t in tests})
        self._test_cluster: Dict[TestKey, FiniteSubcluster] = {}  # Track subcluster assigned to a particular TestKey
        self._client_procs: Dict[TestKey, multiprocessing.Process] = {}  # track client processes running tests
        self.active_tests: Dict[TestKey, bool] = {}
        self.finished_tests: Dict[TestKey, dict] = {}
        self.test_schedule_log: List[TestKey] = []
        self.finish_join_timeout: int = finish_join_timeout
        self.timeout_exception_join_timeout: int = timeout_exception_join_timeout

    def _terminate_process(self, process: multiprocessing.Process):
        # use os.kill rather than multiprocessing.terminate for more control
        # This is called after SIGTERM was sent and process didn't exit, so escalate to SIGKILL
        assert process.pid != os.getpid(), "Signal handler should not reach this point in a client subprocess."
        assert process.pid is not None, "Process has no pid, cannot terminate."
        if process.is_alive():
            self._log(logging.WARNING, f"Process {process.name} did not respond to SIGTERM, escalating to SIGKILL")
            os.kill(process.pid, signal.SIGKILL)

    def _join_test_process(self, process_key, timeout: int = DEFAULT_MP_JOIN_TIMEOUT):
        # waits for process to complete, if it doesn't terminate it
        process: multiprocessing.Process = self._client_procs[process_key]
        start = time.time()
        while time.time() - start <= timeout:
            if not process.is_alive():
                self.client_report[process_key]["status"] = "FINISHED"
                break
            time.sleep(0.1)
        else:
            # Note: This can lead to some tmp files being uncleaned, otherwise nothing else should be executed by the
            #       client after this point.
            self._log(
                logging.ERROR, f"after waiting {timeout}s, process {process.name} failed to complete.  Terminating..."
            )
            self._terminate_process(process)
            self.client_report[process_key]["status"] = "TERMINATED"
        process.join()
        self.client_report[process_key]["exitcode"] = process.exitcode
        self.client_report[process_key]["runner_end_time"] = time.time()
        assert not process.is_alive()
        del self._client_procs[process_key]

    def _propagate_sigterm(self, signum, frame):
        """Handler SIGTERM and SIGINT by propagating SIGTERM to all client processes.

        Note that multiprocessing processes are in the same process group as the main process, so Ctrl-C will
        result in SIGINT being propagated to all client processes automatically. This may result in multiple SIGTERM
        signals getting sent to client processes in quick succession.

        However, it is possible that the main process (and not the process group) receives a SIGINT or SIGTERM
        directly. Propagating SIGTERM to client processes is necessary in this case.
        """
        if os.getpid() != self.main_process_pid:
            # since we're using the multiprocessing module to create client processes,
            # this signal handler is also attached client processes, so we only want to propagate TERM signals
            # if this process *is* the main runner server process
            return

        self.stop_testing = True
        for p in self._client_procs.values():
            self._terminate_process(p)

    def who_am_i(self):
        """Human-readable name helpful for logging."""
        return self.__class__.__name__

    @property
    def _ready_to_trigger_more_tests(self):
        """Should we pull another test from the scheduler?"""
        return (
            not self.stop_testing and len(self.active_tests) < self.max_parallel and self.scheduler.peek() is not None
        )

    @property
    def _expect_client_requests(self):
        return len(self.active_tests) > 0

    def _report_unschedulable(self, unschedulable, err_msg=None):
        if not unschedulable:
            return

        self._log(
            logging.ERROR,
            f"There are {len(unschedulable)} tests which cannot be run due to insufficient cluster resources",
        )
        for tc in unschedulable:
            if err_msg:
                msg = err_msg
            else:
                msg = (
                    f"Test {tc.test_id} requires more resources than are available in the whole cluster. "
                    f"{self.cluster.all().nodes.attempt_remove_spec(tc.expected_cluster_spec)}"
                )

            self._log(logging.ERROR, msg)

            result = TestResult(
                tc,
                self.test_counter,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=time.time(),
                stop_time=time.time(),
            )
            self.results.append(result)
            result.report()

            self.test_counter += 1

    def _check_unschedulable(self):
        self._report_unschedulable(self.scheduler.filter_unschedulable_tests())

    def _report_remaining_as_failed(self, reason):
        """Mark all remaining tests in the scheduler as failed with the given reason."""
        remaining_tests = self.scheduler.drain_remaining_tests()
        if not remaining_tests:
            return

        self._log(
            logging.ERROR,
            f"Marking {len(remaining_tests)} remaining tests as failed: {reason}",
        )
        for tc in remaining_tests:
            msg = f"Test not run: {reason}"
            self._log(logging.ERROR, f"{tc.test_id}: {msg}")

            result = TestResult(
                tc,
                self.test_counter,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=time.time(),
                stop_time=time.time(),
            )
            self.results.append(result)
            result.report()

            self.test_counter += 1

    def _report_active_as_failed(self, reason):
        """Mark all currently active/running tests as failed with the given reason."""
        active_test_keys = list(self.active_tests.keys())
        if not active_test_keys:
            return

        self._log(
            logging.ERROR,
            f"Marking {len(active_test_keys)} active tests as failed: {reason}",
        )
        stop_time = time.time()
        for test_key in active_test_keys:
            if hasattr(self, "_test_cluster") and test_key in self._test_cluster:
                subcluster = self._test_cluster[test_key]
                # Return nodes to the cluster and remove tracking entry
                self.cluster.free(subcluster.nodes)
                del self._test_cluster[test_key]

            tc = self._test_context[test_key.test_id]
            msg = f"Test timed out: {reason}"
            self._log(logging.ERROR, f"{tc.test_id}: {msg}")

            start_time = self.client_report[test_key].get("runner_start_time", stop_time)
            result = TestResult(
                tc,
                test_key.test_index,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=start_time,
                stop_time=stop_time,
            )
            self.results.append(result)
            result.report()

        # Clear active tests since we've reported them all as failed
        self.active_tests.clear()

    def run_all_tests(self):
        self.receiver.start()
        self.results.start_time = time.time()

        # Report tests which cannot be run
        self._check_unschedulable()

        # Run the tests!
        self._log(
            logging.INFO,
            "starting test run with session id %s..." % self.session_context.session_id,
        )
        self._log(logging.INFO, "running %d tests..." % len(self.scheduler))
        while self._ready_to_trigger_more_tests or self._expect_client_requests:
            try:
                while self._ready_to_trigger_more_tests:
                    next_test_context = self.scheduler.peek()
                    try:
                        self._preallocate_subcluster(next_test_context)
                    except InsufficientResourcesError:
                        # We were not able to allocate the subcluster for this test,
                        # this means not enough nodes passed health check.
                        # Don't mark this test as failed just yet, some other test might finish running and
                        # free up healthy nodes.
                        # However, if some nodes failed, cluster size changed too, so we need to check if
                        # there are any tests that can no longer be scheduled.
                        self._log(
                            logging.INFO,
                            f"Couldn't schedule test context {next_test_context} but we'll keep trying",
                            exc_info=True,
                        )
                        self._check_unschedulable()
                    else:
                        # only remove the test from the scheduler once we've successfully allocated a subcluster for it
                        self.scheduler.remove(next_test_context)
                        self._run_single_test(next_test_context)

                if self._expect_client_requests:
                    try:
                        event = self.receiver.recv(timeout=self.session_context.test_runner_timeout)
                        self._handle(event)
                    except TimeoutError as e:
                        # Handle timeout gracefully - clean up, mark tests as failed, and return results
                        err_str = "Timeout error while receiving message: %s: %s" % (
                            str(type(e)),
                            str(e),
                        )
                        err_str += "\n" + traceback.format_exc(limit=16)
                        self._log(logging.ERROR, err_str)

                        # Send SIGTERM to all client processes immediately to allow graceful cleanup
                        # (copy logs, run teardown, etc.)
                        for process_key in list(self._client_procs.keys()):
                            proc = self._client_procs[process_key]
                            if proc.is_alive():
                                self._log(logging.INFO, f"Sending SIGTERM to process {proc.name} for graceful shutdown")
                                os.kill(proc.pid, signal.SIGTERM)

                        # Wait for processes to shutdown gracefully (in parallel), escalate to SIGKILL if needed
                        for process_key in list(self._client_procs.keys()):
                            self._join_test_process(process_key, self.timeout_exception_join_timeout)
                        self._client_procs = {}
                        # Mark active tests as failed with the exception message
                        self._report_active_as_failed(str(e))

                        # Mark all remaining tests as failed with the exception message
                        self._report_remaining_as_failed(str(e))

                        self.receiver.close()
                        return self.results
                    except Exception as e:
                        # For all other exceptions, clean up and re-raise
                        err_str = "Exception receiving message: %s: %s" % (
                            str(type(e)),
                            str(e),
                        )
                        err_str += "\n" + traceback.format_exc(limit=16)
                        self._log(logging.ERROR, err_str)

                        for proc in list(self._client_procs):
                            self._join_test_process(proc, self.finish_join_timeout)
                        self._client_procs = {}
                        raise
            except KeyboardInterrupt:
                # If SIGINT is received, stop triggering new tests, and let the currently running tests finish
                self._log(
                    logging.INFO,
                    "Received KeyboardInterrupt. Now waiting for currently running tests to finish...",
                )
                self.stop_testing = True

        # All clients should be cleaned up in their finish block
        if self._client_procs:
            self._log(logging.WARNING, f"Some clients failed to clean up, waiting 10min to join: {self._client_procs}")
        for proc in list(self._client_procs):
            self._join_test_process(proc, self.finish_join_timeout)

        self.receiver.close()

        return self.results

    def _run_single_test(self, test_context):
        """Start a test runner client in a subprocess"""
        current_test_counter = self.test_counter
        self.test_counter += 1
        self._log(
            logging.INFO,
            "Triggering test %d of %d..." % (current_test_counter, self.total_tests),
        )

        # Test is considered "active" as soon as we start it up in a subprocess
        test_key = TestKey(test_context.test_id, current_test_counter)
        self.active_tests[test_key] = True
        self.test_schedule_log.append(test_key)

        proc = multiprocessing.Process(
            target=run_client,
            args=[
                self.hostname,
                self.receiver.port,
                test_context.test_id,
                current_test_counter,
                TestContext.logger_name(test_context, current_test_counter),
                TestContext.results_dir(test_context, current_test_counter),
                self.session_context.debug,
                self.session_context.fail_bad_cluster_utilization,
                self.deflake_num,
            ],
        )

        self._client_procs[test_key] = proc
        proc.start()
        self.client_report[test_key]["status"] = "RUNNING"
        self.client_report[test_key]["pid"] = proc.pid
        self.client_report[test_key]["name"] = proc.name
        self.client_report[test_key]["runner_start_time"] = time.time()

    def _preallocate_subcluster(self, test_context):
        """Preallocate the subcluster which will be used to run the test.

        Side effect: store association between the test_id and the preallocated subcluster.

        :param test_context
        :return None
        """
        allocated = self.cluster.alloc(test_context.expected_cluster_spec)
        if len(self.cluster.available()) == 0 and self.max_parallel > 1 and not self._test_cluster:
            self._log(
                logging.WARNING,
                "Test %s is using entire cluster. It's possible this test has no associated cluster metadata."
                % test_context.test_id,
            )

        self._test_cluster[TestKey(test_context.test_id, self.test_counter)] = FiniteSubcluster(allocated)

    def _handle(self, event):
        self._log(logging.DEBUG, str(event))

        if event["event_type"] == ClientEventFactory.READY:
            self._handle_ready(event)
        elif event["event_type"] in [
            ClientEventFactory.RUNNING,
            ClientEventFactory.SETTING_UP,
            ClientEventFactory.TEARING_DOWN,
        ]:
            self._handle_lifecycle(event)
        elif event["event_type"] == ClientEventFactory.FINISHED:
            self._handle_finished(event)
        elif event["event_type"] == ClientEventFactory.LOG:
            self._handle_log(event)
        else:
            raise RuntimeError("Received event with unknown event type: " + str(event))

    def _handle_ready(self, event):
        test_key = TestKey(event["test_id"], event["test_index"])
        test_context = self._test_context[event["test_id"]]
        subcluster = self._test_cluster[test_key]

        self.receiver.send(self.event_response.ready(event, self.session_context, test_context, subcluster))

    def _handle_log(self, event):
        self.receiver.send(self.event_response.log(event))
        self._log(event["log_level"], event["message"])

    def _handle_finished(self, event):
        test_key = TestKey(event["test_id"], event["test_index"])
        self.receiver.send(self.event_response.finished(event))

        # Idempotency: check if this is a ZMQ retry (test already finished)
        if test_key in self.finished_tests:
            self._log(logging.DEBUG, f"Received duplicate FINISHED for {test_key} (ZMQ retry), ignoring")
            return

        # Normal case: test is currently active
        if test_key not in self.active_tests:
            # This should never happen - indicates a logic bug
            raise RuntimeError(
                f"Received FINISHED for test {test_key} which is not in active_tests. "
                f"This indicates a bug in the test runner state management."
            )

        result = event["result"]
        if result.test_status == FAIL and self.exit_first:
            self.stop_testing = True

        # Transition this test from running to finished
        del self.active_tests[test_key]
        self.finished_tests[test_key] = event
        self.results.append(result)

        # Free nodes used by the test
        subcluster = self._test_cluster[test_key]
        self.cluster.free(subcluster.nodes)
        del self._test_cluster[test_key]

        # Join on the finished test process
        self._join_test_process(test_key, timeout=self.finish_join_timeout)

        # Report partial result summaries - it is helpful to have partial test reports available if the
        # ducktape process is killed with a SIGKILL partway through
        test_results = copy.copy(self.results)  # shallow copy
        reporters = [
            SimpleFileSummaryReporter(test_results),
            HTMLSummaryReporter(test_results, self.total_tests),
            JSONReporter(test_results),
            JUnitReporter(test_results),
        ]
        for r in reporters:
            r.report()

        if self._should_print_separator:
            terminal_width, y = get_terminal_size()
            self._log(logging.INFO, "~" * int(2 * terminal_width / 3))

    @property
    def _should_print_separator(self):
        """The separator is the twiddle that goes in between tests on stdout.

        This only makes sense to print if tests are run sequentially (aka max_parallel = 1) since
        output from tests is interleaved otherwise.

        Also, we don't want to print the separator after the last test output has been received, so
        we check that there's more test output expected.
        """
        return self.session_context.max_parallel == 1 and (
            self._expect_client_requests or self._ready_to_trigger_more_tests
        )

    def _handle_lifecycle(self, event):
        self.receiver.send(self.event_response._event_response(event))

    def _log(self, log_level, msg, *args, **kwargs):
        """Log to the service log of the current test."""
        self.session_logger.log(log_level, msg, *args, **kwargs)


================================================
FILE: ducktape/tests/runner_client.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import signal
import threading
import time
import traceback
from collections import defaultdict
from typing import List, Mapping, Optional, Tuple

import zmq
import psutil

from ducktape.services.service import MultiRunServiceIdFactory, service_id_factory
from ducktape.services.service_registry import ServiceRegistry
from ducktape.tests.event import ClientEventFactory
from ducktape.tests.loader import TestLoader
from ducktape.tests.result import FAIL, IGNORE, PASS, TestResult
from ducktape.tests.serde import SerDe
from ducktape.tests.status import FLAKY, TestStatus
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext, test_logger
from ducktape.utils.local_filesystem_utils import mkdir_p


def run_client(*args, **kwargs):
    client = RunnerClient(*args, **kwargs)
    client.ready()
    client.run()


class Sender(object):
    REQUEST_TIMEOUT_MS = 3000
    NUM_RETRIES = 5
    BACKOFF_MULTIPLIER = 2.0  # Exponential backoff multiplier

    serde: SerDe
    message_supplier: ClientEventFactory
    server_endpoint: str

    zmq_context: zmq.Context
    socket: Optional[zmq.Socket]
    poller: zmq.Poller

    def __init__(
        self,
        server_host: str,
        server_port: str,
        message_supplier: ClientEventFactory,
        logger: logging.Logger,
    ):
        self.serde = SerDe()
        self.server_endpoint = "tcp://%s:%s" % (str(server_host), str(server_port))
        self.zmq_context = zmq.Context()
        self.socket = None
        self.poller = zmq.Poller()

        self.message_supplier = message_supplier
        self.logger = logger

        self._init_socket()

    def _init_socket(self):
        self.socket = self.zmq_context.socket(zmq.REQ)
        self.socket.connect(self.server_endpoint)
        self.poller.register(self.socket, zmq.POLLIN)

    def send(self, event, blocking=True):
        retries_left = Sender.NUM_RETRIES
        current_timeout_ms = Sender.REQUEST_TIMEOUT_MS

        while retries_left > 0:
            serialized_event = self.serde.serialize(event)
            self.socket.send(serialized_event)
            retries_left -= 1
            waiting_for_reply = True

            while waiting_for_reply:
                sockets = dict(self.poller.poll(current_timeout_ms))

                if sockets.get(self.socket) == zmq.POLLIN:
                    reply = self.socket.recv()
                    if reply:
                        return self.serde.deserialize(reply)
                    else:
                        # send another request...
                        break
                else:
                    self.close()
                    self._init_socket()
                    waiting_for_reply = False
                    # Apply exponential backoff for next retry
                    current_timeout_ms = int(current_timeout_ms * Sender.BACKOFF_MULTIPLIER)
                # Ensure each message we attempt to send has a unique id
                # This copy constructor gives us a duplicate with a new message id
                event = self.message_supplier.copy(event)

        raise RuntimeError("Unable to receive response from driver")

    def close(self):
        self.socket.setsockopt(zmq.LINGER, 0)
        self.socket.close()
        self.poller.unregister(self.socket)


class RunnerClient(object):
    """Run a single test"""

    serde: SerDe
    logger: logging.Logger
    runner_port: int
    message: ClientEventFactory
    sender: Sender

    test_id: str
    test_index: int
    id: str

    test: Optional[Test]
    test_context: Optional[TestContext]
    all_services: Optional[ServiceRegistry]

    # configs
    fail_bad_cluster_utilization: bool
    deflake_num: int

    def __init__(
        self,
        server_hostname: str,
        server_port: int,
        test_id: str,
        test_index: int,
        logger_name: str,
        log_dir: str,
        debug: bool,
        fail_bad_cluster_utilization: bool,
        deflake_num: int,
    ):
        signal.signal(signal.SIGTERM, self._sigterm_handler)  # register a SIGTERM handler

        self.serde = SerDe()
        self.logger = test_logger(logger_name, log_dir, debug)
        self.runner_port = server_port

        self.fail_bad_cluster_utilization = fail_bad_cluster_utilization
        self.test_id = test_id
        self.test_index = test_index
        self.id = "test-runner-%d-%d" % (os.getpid(), id(self))
        self.message = ClientEventFactory(self.test_id, self.test_index, self.id)
        self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger)

        self.deflake_num = deflake_num

        # Wait to instantiate the test object until running the test
        self.test = None
        self.test_context = None
        self.all_services = None
        self.runner_shutting_down = False

    @property
    def deflake_enabled(self) -> bool:
        return self.deflake_num > 1

    def ready(self):
        ready_reply = self.sender.send(self.message.ready())
        self.session_context = ready_reply["session_context"]
        self.test_metadata = ready_reply["test_metadata"]
        self.cluster = ready_reply["cluster"]

    def send(self, event):
        # Don't attempt to send messages if the runner is shutting down
        # The driver is no longer listening and will timeout anyway
        if self.runner_shutting_down:
            return None
        return self.sender.send(event)

    def _kill_all_child_processes(self, send_signal=signal.SIGTERM):
        current_process = psutil.Process()
        # if this client has any children - kill them (for instances background service)
        children = current_process.children(recursive=True)
        for child in children:
            self.logger.warning(f"process {repr(child)} did not terminate on its own, killing with {send_signal}")
            child.send_signal(send_signal)

    def _sigterm_handler(self, signum, frame):
        """Translate SIGTERM to SIGINT on this process

        python will treat SIGINT as a Keyboard exception. Exception handling does the rest.
        """
        self.runner_shutting_down = True
        self.logger.warning("Received SIGTERM, sending SIGINT to self and all child processes")
        self._kill_all_child_processes(signal.SIGINT)
        os.kill(os.getpid(), signal.SIGINT)  # This will send SIGINT to the current process

    def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args):
        loader = TestLoader(
            self.session_context,
            self.logger,
            injected_args=injected_args,
            cluster=self.cluster,
        )
        # TODO: deal with this in a more graceful fashion.
        #       In an unlikely even that discover either raises the exception or fails to find exactly one test
        #       we should probably continue trying other tests rather than killing this process
        loaded_context_list = loader.discover(directory, file_name, cls_name, method_name)

        assert len(loaded_context_list) == 1
        test_context = loaded_context_list[0]
        test_context.cluster = self.cluster
        return test_context

    def run(self):
        self.log(logging.INFO, "Loading test %s" % str(self.test_metadata))
        self.test_context = self._collect_test_context(**self.test_metadata)
        self.test_context.test_index = self.test_index

        self.send(self.message.running())
        if self.test_context.ignore:
            # Skip running this test, but keep track of the fact that we ignored it
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status=IGNORE,
                start_time=time.time(),
                stop_time=time.time(),
            )
            result.report()
            # Tell the server we are finished
            self.send(self.message.finished(result=result))
            return

        start_time = -1
        stop_time = -1
        test_status = FAIL
        data = None
        self.all_services = ServiceRegistry(enable_jvm_logs=self.session_context.enable_jvm_logs)

        summaries = []
        num_runs = 0

        try:
            while test_status == FAIL and num_runs < self.deflake_num:
                num_runs += 1
                self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num))
                start_time = time.time()
                test_status, run_summary, data = self._do_run(num_runs)
                if run_summary:
                    summaries.append(run_summary)

                # dump threads after the test is complete;
                # if any thread is not terminated correctly by the test we'll see it here
                self.dump_threads(f"Threads after {self.test_id} finished")

                # if run passed, and not on the first run, the test is flaky
                if test_status == PASS and num_runs > 1:
                    test_status = FLAKY

                msg = str(test_status.to_json())
                if run_summary:
                    msg += ": {}".format("\n".join(run_summary))
                self.log(logging.INFO, msg)

        finally:
            stop_time = time.time()
            summary = self.process_run_summaries(summaries, test_status)
            test_status, summary = self._check_cluster_utilization(test_status, summary)
            # convert summary from list to string
            summary = "\n".join(summary)
            if num_runs > 1:
                # for reporting purposes report all services
                self.test_context.services = self.all_services
            # for flaky tests, we report the start and end time of the successful run, and not the whole run period
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status,
                summary,
                data,
                start_time,
                stop_time,
            )

            self.log(logging.INFO, "Data: %s" % str(result.data))

            result.report()
            # Tell the server we are finished
            self._do_safely(
                lambda: self.send(self.message.finished(result=result)),
                f"Problem sending FINISHED message for {self.test_metadata}:\n",
            )
            self._kill_all_child_processes()
            # Release test_context resources only after creating the result and finishing logging activity
            # The Sender object uses the same logger, so we postpone closing until after the finished message is sent
            self.test_context.close()
            self.all_services = None
            self.test_context = None
            self.test = None

    def process_run_summaries(self, run_summaries: List[List[str]], test_status: TestStatus) -> List[str]:
        """
        Converts individual run summaries (there may be multiple if deflake is enabled)
        into a single run summary
        """
        # no summary case, return test passed
        if not run_summaries:
            return ["Test Passed"]
        # single run, can just return the summary
        if not self.deflake_enabled:
            return run_summaries[0]

        failure_summaries: Mapping[Tuple[str, ...], List[int]] = defaultdict(list)
        # populate run summaries grouping run numbers by stack trace
        for run_num, summary in enumerate(run_summaries):
            # convert to tuple to be serializable (+1 for human readability 1 based indexing)
            failure_summaries[tuple(summary)].append(run_num + 1)

        final_summary = []

        # handle run summaries for each deflake run:
        sub_summaries = []
        for individual_summary, runs in failure_summaries.items():
            sub_summary = []
            runs_str = ", ".join(str(r) for r in runs)
            run_msg = f"run{'s' if len(runs) > 1 else ''} {runs_str} summary:"
            sub_summary.append(run_msg)
            sub_summary.extend(individual_summary)
            sub_summaries.append(sub_summary)

        if test_status == FLAKY:
            sub_summaries.append([f"run {len(run_summaries)}: PASSED"])

        # combine summaries, with a '~~~~~' divider
        for sub_summary in sub_summaries[:-1]:
            final_summary.extend(sub_summary)
            break_line = "~" * max(len(line) for line in final_summary) if final_summary else ""
            final_summary.append(break_line)

        # the pass case could have no summaries, so need to validate that a subsummary exists
        if sub_summaries:
            final_summary.extend(sub_summaries[-1])

        return final_summary

    def _do_run(self, num_runs):
        test_status = FAIL
        summary = []
        data = None
        sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_enabled else service_id_factory
        try:
            # Results from this test, as well as logs will be dumped here
            mkdir_p(TestContext.results_dir(self.test_context, self.test_index))
            # Instantiate test
            self.test = self.test_context.cls(self.test_context)

            # Run the test unit
            self.setup_test()
            data = self.run_test()
            test_status = PASS

        except BaseException as e:
            # mark the test as failed before doing anything else
            test_status = FAIL
            err_trace = self._exc_msg(e)
            summary.extend(err_trace.split("\n"))

        finally:
            for service in self.test_context.services:
                service.service_id_factory = sid_factory
                self.all_services.append(service)

            self.teardown_test(
                teardown_services=not self.session_context.no_teardown,
                test_status=test_status,
            )

            if hasattr(self.test_context, "services"):
                service_errors = self.test_context.services.errors()
                if service_errors:
                    summary.extend(["", "", service_errors])

            # free nodes
            if self.test:
                self.log(logging.DEBUG, "Freeing nodes...")
                self._do_safely(self.test.free_nodes, "Error freeing nodes:")
            return test_status, summary, data

    def _check_cluster_utilization(self, result, summary):
        """Checks if the number of nodes used by a test is less than the number of
        nodes requested by the test. If this is the case and we wish to fail
        on bad cluster utilization, the result value is failed. Will also print
        a warning if the test passes and the node utilization doesn't match.
        """
        max_used = self.cluster.max_used()
        total = len(self.cluster.all())
        if max_used < total:
            message = "Test requested %d nodes, used only %d" % (total, max_used)
            if self.fail_bad_cluster_utilization:
                # only check node utilization on test pass
                if result == PASS or result == FLAKY:
                    self.log(logging.INFO, "FAIL: " + message)

                result = FAIL
                summary.append(message)
            else:
                self.log(logging.WARN, message)
        return result, summary

    def setup_test(self):
        """start services etc"""
        self.log(logging.INFO, "Setting up...")
        self.test.setup()

    def run_test(self):
        """Run the test!

        We expect test_context.function to be a function or unbound method which takes an
        instantiated test object as its argument.
        """
        self.log(logging.INFO, "Running...")
        return self.test_context.function(self.test)

    def _exc_msg(self, e):
        return repr(e) + "\n" + traceback.format_exc(limit=16)

    def _do_safely(self, action, err_msg):
        try:
            action()
        except BaseException as e:
            self.log(logging.WARN, err_msg + " " + self._exc_msg(e))

    def teardown_test(self, teardown_services=True, test_status=None):
        """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes.

        Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner
        should stop if a keyboard interrupt is caught.
        """
        self.log(logging.INFO, "Tearing down...")
        if not self.test:
            self.log(logging.WARN, "%s failed to instantiate" % self.test_id)
            self.test_context.close()
            return

        services = self.test_context.services

        if teardown_services:
            self._do_safely(self.test.teardown, "Error running teardown method:")
            # stop services
            self._do_safely(services.stop_all, "Error stopping services:")

        # always collect service logs whether or not we tear down
        # logs are typically removed during "clean" phase, so collect logs before cleaning
        self.log(logging.DEBUG, "Copying logs from services...")
        self._do_safely(
            lambda: self.test.copy_service_logs(test_status),
            "Error copying service logs:",
        )

        # clean up stray processes and persistent state
        if teardown_services:
            self.log(logging.DEBUG, "Cleaning up services...")
            self._do_safely(services.clean_all, "Error cleaning services:")

    def log(self, log_level, msg, *args, **kwargs):
        """Log to the service log and the test log of the current test."""

        if self.test_context is None:
            msg = "%s: %s" % (self.__class__.__name__, str(msg))
            self.logger.log(log_level, msg, *args, **kwargs)
        else:
            msg = "%s: %s: %s" % (
                self.__class__.__name__,
                self.test_context.test_name,
                str(msg),
            )
            self.logger.log(log_level, msg, *args, **kwargs)

        self.send(self.message.log(msg, level=log_level))

    def dump_threads(self, msg):
        dump = "\n".join([t.name for t in threading.enumerate()])
        self.log(logging.DEBUG, f"{msg}: {dump}")


================================================
FILE: ducktape/tests/scheduler.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List

from ducktape.cluster.vagrant import VagrantCluster
from ducktape.tests.test_context import TestContext


class TestScheduler(object):
    """This class tracks tests which are scheduled to run, and provides an ordering based on the current cluster state.

    The ordering is "on-demand"; calling next returns the largest cluster user which fits in the currently
    available cluster nodes.
    """

    def __init__(self, test_contexts: List[TestContext], cluster: VagrantCluster) -> None:
        self.cluster = cluster

        # Track tests which would never be offered up by the scheduling algorithm due to insufficient
        # cluster resources
        self._test_context_list = test_contexts.copy()

        self._sort_test_context_list()

    def __len__(self) -> int:
        """Number of tests currently in the scheduler"""
        return len(self._test_context_list)

    def __iter__(self):
        return self

    def filter_unschedulable_tests(self):
        """
        Filter out tests that cannot be scheduled with the current cluster, remove them from
        this scheduler and return them.
        """
        all = self.cluster.all()
        unschedulable = []
        for test_context in self._test_context_list:
            if not all.nodes.can_remove_spec(test_context.expected_cluster_spec):
                unschedulable.append(test_context)
        for u in unschedulable:
            self._test_context_list.remove(u)
        return unschedulable

    def _sort_test_context_list(self) -> None:
        """Replace self.test_context_list with a sorted shallow copy

        Sort from largest cluster users to smallest
        """
        # sort from the largest cluster users to smallest
        self._test_context_list = sorted(self._test_context_list, key=lambda tc: tc.expected_num_nodes, reverse=True)

    def peek(self):
        """Locate and return the next object to be scheduled, without removing it internally.

        :return test_context for the next test to be scheduled.
            If scheduler is empty, or no test can currently be scheduled, return None.
        """
        for tc in self._test_context_list:
            if self.cluster.available().nodes.can_remove_spec(tc.expected_cluster_spec):
                return tc

        return None

    def remove(self, tc):
        """Remove test context object from this scheduler.
        Intended usage is to peek() first, then perform whatever validity checks,
        and if they pass, remove() it from the scheduler.
        """
        if tc:
            self._test_context_list.remove(tc)

    def drain_remaining_tests(self):
        """
        Get all remaining tests in the scheduler and clear the scheduler.

        :return List of test contexts that were still in the scheduler
        """
        remaining = self._test_context_list.copy()
        self._test_context_list.clear()
        return remaining


================================================
FILE: ducktape/tests/serde.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle


class SerDe(object):
    def serialize(self, obj):
        if hasattr(obj, "serialize"):
            obj.serialize()
        else:
            return pickle.dumps(obj)

    def deserialize(self, bytes_obj, obj_cls=None):
        if obj_cls and hasattr(obj_cls, "deserialize"):
            return obj_cls.deserialize(bytes_obj)
        else:
            return pickle.loads(bytes_obj)


================================================
FILE: ducktape/tests/session.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
import time

from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.tests.loggermaker import LoggerMaker


class SessionContext(object):
    """Wrapper class for 'global' variables. A call to ducktape generates a single shared SessionContext object
    which helps route logging and reporting, etc.
    """

    def __init__(self, **kwargs) -> None:
        # session_id, results_dir, cluster, globals):
        self.session_id = kwargs["session_id"]
        self.results_dir = os.path.abspath(kwargs["results_dir"])

        self.debug = kwargs.get("debug", False)
        self.compress = kwargs.get("compress", False)
        self.exit_first = kwargs.get("exit_first", False)
        self.no_teardown = kwargs.get("no_teardown", False)
        self.max_parallel = kwargs.get("max_parallel", 1)
        self.default_expected_num_nodes = kwargs.get("default_num_nodes", None)
        self.fail_bad_cluster_utilization = kwargs.get("fail_bad_cluster_utilization")
        self.fail_greedy_tests = kwargs.get("fail_greedy_tests", False)
        self.test_runner_timeout = kwargs.get("test_runner_timeout")
        self.enable_jvm_logs = kwargs.get("enable_jvm_logs", False)
        self._globals = kwargs.get("globals")

    @property
    def globals(self):
        """None, or an immutable dictionary containing user-defined global variables."""
        return self._globals

    def to_json(self):
        return self.__dict__


class SessionLoggerMaker(LoggerMaker):
    def __init__(self, session_context: SessionContext) -> None:
        super(SessionLoggerMaker, self).__init__(session_context.session_id + ".session_logger")
        self.log_dir = session_context.results_dir
        self.debug = session_context.debug

    def configure_logger(self) -> None:
        """Set up the logger to log to stdout and files. This creates a few files as a side-effect."""
        if self.configured:
            return

        self._logger.setLevel(logging.DEBUG)

        fh_info = logging.FileHandler(os.path.join(self.log_dir, "session_log.info"))
        fh_debug = logging.FileHandler(os.path.join(self.log_dir, "session_log.debug"))
        fh_info.setLevel(logging.INFO)
        fh_debug.setLevel(logging.DEBUG)

        # create console handler with a higher log level
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG if self.debug else logging.INFO)

        # create formatter and add it to the handlers
        formatter = logging.Formatter(ConsoleDefaults.SESSION_LOG_FORMATTER)
        fh_info.setFormatter(formatter)
        fh_debug.setFormatter(formatter)
        ch.setFormatter(formatter)

        # add the handlers to the logger
        self._logger.addHandler(fh_info)
        self._logger.addHandler(fh_debug)
        self._logger.addHandler(ch)


def generate_session_id(session_id_file: str) -> str:
    """Generate a new session id based on the previous session id found in session_id_file
    :type session_id_file: str  Last-used session_id is in this file
    :rtype str                  New session_id
    """

    def get_id(day, num):
        return day + "--%03d" % num

    def split_id(an_id):
        day = an_id[:10]
        num = int(an_id[12:])
        return day, num

    def today():
        return time.strftime("%Y-%m-%d")

    def next_id(prev_id):
        if prev_id is None:
            prev_day = today()
            prev_num = 0
        else:
            prev_day, prev_num = split_id(prev_id)

        if prev_day == today():
            next_day = prev_day
            next_num = prev_num + 1
        else:
            next_day = today()
            next_num = 1

        return get_id(next_day, next_num)

    if os.path.isfile(session_id_file):
        with open(session_id_file, "r") as fp:
            session_id = next_id(fp.read())
    else:
        session_id = next_id(None)

    with open(session_id_file, "w") as fp:
        fp.write(session_id)

    return session_id


def generate_results_dir(results_root: str, session_id: str) -> str:
    """Results from a single run of ducktape are assigned a session_id and put together in this directory.

    :type session_id: str
    :rtype: str
    """
    return os.path.join(os.path.abspath(results_root), session_id)


================================================
FILE: ducktape/tests/status.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class TestStatus(object):
    def __init__(self, status: str) -> None:
        self._status = str(status).lower()

    def __eq__(self, other):
        return str(self).lower() == str(other).lower()

    def __str__(self):
        return self._status

    def to_json(self):
        return str(self).upper()


PASS = TestStatus("pass")
FLAKY = TestStatus("flaky")
FAIL = TestStatus("fail")
IGNORE = TestStatus("ignore")


================================================
FILE: ducktape/tests/test.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
import tempfile
from contextlib import contextmanager

from ducktape.template import TemplateRenderer
from ducktape.tests.status import FAIL
from ducktape.utils.local_filesystem_utils import mkdir_p

from .test_context import TestContext


class Test(TemplateRenderer):
    """Base class for tests."""

    def __init__(self, test_context, *args, **kwargs):
        """
        :type test_context: ducktape.tests.test.TestContext
        """
        super(Test, self).__init__(*args, **kwargs)
        self.test_context = test_context

    @property
    def cluster(self):
        return self.test_context.cluster

    @property
    def logger(self):
        return self.test_context.logger

    def min_cluster_spec(self):
        """
        THIS METHOD IS DEPRECATED AND WILL BE REMOVED IN THE SUBSEQUENT RELEASES.
        Nothing in the ducktape framework calls it, it is only provided so that subclasses don't break.
        If you're overriding this method in your subclass, please remove it.
        """
        raise NotImplementedError

    def min_cluster_size(self):
        """
        THIS METHOD IS DEPRECATED AND WILL BE REMOVED IN THE SUBSEQUENT RELEASES.
        Nothing in the ducktape framework calls it, it is only provided so that subclasses don't break.
        If you're overriding this method in your subclass, please remove it.
        """
        raise NotImplementedError

    def setup(self):
        """Override this for custom setup logic."""

        # for backward compatibility
        self.setUp()

    def teardown(self):
        """Override this for custom teardown logic."""

        # for backward compatibility
        self.tearDown()

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def free_nodes(self):
        try:
            self.test_context.services.free_all()
        except BaseException as e:
            if isinstance(e, KeyboardInterrupt):
                raise

    def compress_service_logs(self, node, service, node_logs):
        """Compress logs on a node corresponding to the given service.

        :param node: The node on which to compress the given logs
        :param service: The service to which the node belongs
        :param node_logs: Paths to logs (or log directories) which will be compressed
        :return: a list of paths to compressed logs.

        """
        compressed_logs = []
        for nlog in node_logs:
            try:
                node.account.ssh(_compress_cmd(nlog))
                if nlog.endswith(os.path.sep):
                    nlog = nlog[: -len(os.path.sep)]
                nlog += ".tgz"
                compressed_logs.append(nlog)

            except Exception as e:
                self.test_context.logger.warn("Error compressing log %s: service %s: %s" % (nlog, service, str(e)))

        return compressed_logs

    def copy_service_logs(self, test_status):
        """
        Copy logs from service nodes to the results directory.

        If the test passed, only the default set will be collected. If the the test failed, all logs will be collected.
        """
        for service in self.test_context.services:
            if not hasattr(service, "logs") or len(service.logs) == 0:
                self.test_context.logger.debug(
                    "Won't collect service logs from %s - no logs to collect." % service.service_id
                )
                continue

            log_dirs = service.logs
            for node in service.nodes:
                # Gather locations of logs to collect
                node_logs = []
                for log_name in log_dirs.keys():
                    if test_status == FAIL or self.should_collect_log(log_name, service):
                        node_logs.append(log_dirs[log_name]["path"])

                self.test_context.logger.debug(
                    "Preparing to copy logs from %s: %s" % (node.account.hostname, node_logs)
                )

                if self.test_context.session_context.compress:
                    self.test_context.logger.debug("Compressing logs...")
                    node_logs = self.compress_service_logs(node, service, node_logs)

                if len(node_logs) > 0:
                    # Create directory into which service logs will be copied
                    dest = os.path.join(
                        TestContext.results_dir(self.test_context, self.test_context.test_index),
                        service.service_id,
                        node.account.hostname,
                    )
                    if not os.path.isdir(dest):
                        mkdir_p(dest)

                    # Try to copy the service logs
                    self.test_context.logger.debug("Copying logs...")
                    try:
                        for log in node_logs:
                            node.account.copy_from(log, dest)
                    except Exception as e:
                        self.test_context.logger.warn(
                            "Error copying log %(log_name)s from %(source)s to %(dest)s. \
                            service %(service)s: %(message)s"
                            % {
                                "log_name": log_name,
                                "source": log_dirs[log_name],
                                "dest": dest,
                                "service": service,
                                "message": e,
                            }
                        )

    def mark_for_collect(self, service, log_name=None):
        if log_name is None:
            # Mark every log for collection
            for log_name in service.logs:
                self.test_context.log_collect[(log_name, service)] = True
        else:
            self.test_context.log_collect[(log_name, service)] = True

    def mark_no_collect(self, service, log_name=None):
        self.test_context.log_collect[(log_name, service)] = False

    def should_collect_log(self, log_name, service):
        key = (log_name, service)
        default = service.logs[log_name]["collect_default"]
        val = self.test_context.log_collect.get(key, default)
        return val


def _compress_cmd(log_path):
    """Return bash command which compresses the given path to a tarball."""
    compres_cmd = 'cd "$(dirname %s)" && ' % log_path
    compres_cmd += 'f="$(basename %s)" && ' % log_path
    compres_cmd += 'if [ -e "$f" ]; then tar czf "$f.tgz" "$f"; fi && '
    compres_cmd += "rm -rf %s" % log_path

    return compres_cmd


@contextmanager
def in_dir(path):
    """Changes working directory to given path. On exit, restore to original working directory."""
    cwd = os.getcwd()

    try:
        os.chdir(path)
        yield

    finally:
        os.chdir(cwd)


@contextmanager
def in_temp_dir():
    """Creates a temporary directory as the working directory. On exit, it is removed."""
    with _new_temp_dir() as tmpdir:
        with in_dir(tmpdir):
            yield tmpdir


@contextmanager
def _new_temp_dir():
    """Create a temporary directory that is removed automatically"""
    tmpdir = tempfile.mkdtemp()

    try:
        yield tmpdir

    finally:
        shutil.rmtree(tmpdir)


================================================
FILE: ducktape/tests/test_context.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
import re
import shutil
import sys
import tempfile
from typing import TYPE_CHECKING, Dict, Optional, Tuple

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.mark.consts import CLUSTER_NODE_TYPE_KEYWORD, CLUSTER_SIZE_KEYWORD, CLUSTER_SPEC_KEYWORD
from ducktape.services.service_registry import ServiceRegistry
from ducktape.tests.loggermaker import LoggerMaker, close_logger
from ducktape.tests.session import SessionContext
from ducktape.utils.local_filesystem_utils import mkdir_p

if TYPE_CHECKING:
    from ducktape.services.service import Service


def _escape_pathname(s):
    """Remove fishy characters, replace most with dots"""
    # Remove all whitespace completely
    s = re.sub(r"\s+", "", s)

    # Replace bad characters with dots
    blacklist = r"[^\.\-=_\w\d]+"
    s = re.sub(blacklist, ".", s)

    # Multiple dots -> single dot (and no leading or trailing dot)
    s = re.sub(r"[\.]+", ".", s)
    return re.sub(r"^\.|\.$", "", s)


class TestLoggerMaker(LoggerMaker):
    def __init__(self, logger_name, log_dir, debug):
        super(TestLoggerMaker, self).__init__(logger_name)
        self.log_dir = log_dir
        self.debug = debug

    def configure_logger(self):
        """Set up the logger to log to stdout and files.
        This creates a directory and a few files as a side-effect.
        """
        if self.configured:
            return

        self._logger.setLevel(logging.DEBUG)
        mkdir_p(self.log_dir)

        # Create info and debug level handlers to pipe to log files
        info_fh = logging.FileHandler(os.path.join(self.log_dir, "test_log.info"))
        debug_fh = logging.FileHandler(os.path.join(self.log_dir, "test_log.debug"))

        info_fh.setLevel(logging.INFO)
        debug_fh.setLevel(logging.DEBUG)

        formatter = logging.Formatter(ConsoleDefaults.TEST_LOG_FORMATTER)
        info_fh.setFormatter(formatter)
        debug_fh.setFormatter(formatter)

        self._logger.addHandler(info_fh)
        self._logger.addHandler(debug_fh)

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(formatter)
        if self.debug:
            # If debug flag is set, pipe debug logs to stdout
            ch.setLevel(logging.DEBUG)
        else:
            # default - pipe warning level logging to stdout
            ch.setLevel(logging.WARNING)
        self._logger.addHandler(ch)


def test_logger(logger_name, log_dir, debug):
    """Helper method for getting a test logger object

    Note that if this method is called multiple times with the same ``logger_name``, it returns the same logger object.
    Note also, that for a fixed ``logger_name``, configuration occurs only the first time this function is called.
    """
    return TestLoggerMaker(logger_name, log_dir, debug).logger


class TestContext(object):
    """Wrapper class for state variables needed to properly run a single 'test unit'."""

    def __init__(self, **kwargs) -> None:
        """
        :param session_context:
        :param cluster: the cluster object which will be used by this test
        :param module: name of the module containing the test class/method
        :param cls: class object containing the test method
        :param function: the test method
        :param file: file containing this module
        :param injected_args: a dict containing keyword args which will be passed to the test method
        :param cluster_use_metadata: dict containing information about how this test will use cluster resources
        """

        self.session_context: SessionContext = kwargs["session_context"]
        self.cluster = kwargs.get("cluster")
        self.module = kwargs.get("module")
        self.test_suite_name = kwargs.get("test_suite_name")

        if kwargs.get("file") is not None:
            self.file = os.path.abspath(kwargs["file"])
        else:
            self.file = None
        self.cls = kwargs.get("cls")
        self.function = kwargs.get("function")
        self.injected_args = kwargs.get("injected_args")
        self.ignore = kwargs.get("ignore", False)

        # cluster_use_metadata is a dict containing information about how this test will use cluster resources
        self.cluster_use_metadata = copy.copy(kwargs.get("cluster_use_metadata", {}))

        enable_jvm_logs = self.session_context.enable_jvm_logs if self.session_context else False
        self.services = ServiceRegistry(enable_jvm_logs=enable_jvm_logs)
        self.test_index = None

        # dict for toggling service log collection on/off
        self.log_collect: Dict[Tuple[str, Service], bool] = {}

        self._logger = None
        self._local_scratch_dir = None

    def __repr__(self) -> str:
        return (
            f"<module={self.module}, cls={self.cls_name}, function={self.function_name}, "
            f"injected_args={self.injected_args}, file={self.file}, ignore={self.ignore}, "
            f"cluster_spec={self.expected_cluster_spec}>"
        )

    def copy(self, **kwargs) -> "TestContext":
        """Construct a new TestContext object from another TestContext object
        Note that this is not a true copy, since a fresh ServiceRegistry instance will be created.
        """
        ctx_copy = TestContext(**self.__dict__)
        ctx_copy.__dict__.update(**kwargs)

        return ctx_copy

    @property
    def local_scratch_dir(self):
        """This local scratch directory is created/destroyed on the test driver before/after each test is run."""
        if not self._local_scratch_dir:
            self._local_scratch_dir = tempfile.mkdtemp()
        return self._local_scratch_dir

    @property
    def test_metadata(self):
        return {
            "directory": os.path.dirname(self.file),
            "file_name": os.path.basename(self.file),
            "cls_name": self.cls_name,
            "method_name": self.function_name,
            "injected_args": self.injected_args,
        }

    @staticmethod
    def logger_name(test_context, test_index):
        if test_index is None:
            return test_context.test_id
        else:
            return "%s-%s" % (test_context.test_id, str(test_index))

    @staticmethod
    def results_dir(test_context, test_index):
        d = test_context.session_context.results_dir

        if test_context.cls is not None:
            d = os.path.join(d, test_context.cls.__name__)
        if test_context.function is not None:
            d = os.path.join(d, test_context.function.__name__)
        if test_context.injected_args is not None:
            d = os.path.join(d, test_context.injected_args_name)
        if test_index is not None:
            d = os.path.join(d, str(test_index))

        return d

    @property
    def expected_num_nodes(self) -> int:
        """
        How many nodes of any type we expect this test to consume when run.
        Note that this will be 0 for both unschedulable tests and the tests that legitimately need 0 nodes.

        :return:            an integer number of nodes.
        """
        return self.expected_cluster_spec.size() if self.expected_cluster_spec else 0

    @property
    def expected_cluster_spec(self) -> Optional[ClusterSpec]:
        """
        The cluster spec we expect this test to consume when run.

        :return:            A ClusterSpec object or None if the test cannot be run
                            (e.g. session context settings disallow tests with no cluster metadata attached).
        """
        cluster_spec = self.cluster_use_metadata.get(CLUSTER_SPEC_KEYWORD)
        cluster_size = self.cluster_use_metadata.get(CLUSTER_SIZE_KEYWORD)
        node_type = self.cluster_use_metadata.get(CLUSTER_NODE_TYPE_KEYWORD)
        if cluster_spec is not None:
            return cluster_spec
        elif cluster_size is not None:
            return ClusterSpec.simple_linux(cluster_size, node_type)
        elif not self.cluster:
            return ClusterSpec.empty()
        elif self.session_context.fail_greedy_tests:
            return None
        else:
            return self.cluster.all()

    @property
    def globals(self):
        return self.session_context.globals

    @property
    def module_name(self) -> str:
        return "" if self.module is None else self.module

    @property
    def cls_name(self) -> str:
        return "" if self.cls is None else self.cls.__name__

    @property
    def function_name(self) -> str:
        return "" if self.function is None else self.function.__name__

    @property
    def description(self):
        """Description of the test, needed in particular for reporting.
        If the function has a docstring, return that, otherwise return the class docstring or "".
        """
        if self.function.__doc__:
            return self.function.__doc__
        elif self.cls.__doc__ is not None:
            return self.cls.__doc__
        else:
            return ""

    @property
    def injected_args_name(self) -> str:
        if self.injected_args is None:
            return ""
        else:
            params = ".".join(["%s=%s" % (k, self.injected_args[k]) for k in self.injected_args])
            return _escape_pathname(params)

    @property
    def test_id(self) -> str:
        return self.test_name

    @property
    def test_name(self) -> str:
        """
        The fully-qualified name of the test. This is similar to test_id, but does not include the session ID. It
        includes the module, class, and method name.
        """
        name_components = [
            self.module_name,
            self.cls_name,
            self.function_name,
            self.injected_args_name,
        ]

        return ".".join(filter(lambda x: x is not None and len(x) > 0, name_components))

    @property
    def logger(self):
        if self._logger is None:
            self._logger = test_logger(
                TestContext.logger_name(self, self.test_index),
                TestContext.results_dir(self, self.test_index),
                self.session_context.debug,
            )
        return self._logger

    def close(self):
        """Release resources, etc."""
        if hasattr(self, "services"):
            for service in self.services:
                service.close()
            # Remove reference to services. This is important to prevent potential memory leaks if users write services
            # which themselves have references to large memory-intensive objects
            del self.services

        # Remove local scratch directory
        if self._local_scratch_dir and os.path.exists(self._local_scratch_dir):
            shutil.rmtree(self._local_scratch_dir)

        # Release file handles held by logger
        if self._logger:
            close_logger(self._logger)


================================================
FILE: ducktape/utils/__init__.py
================================================


================================================
FILE: ducktape/utils/http_utils.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from urllib.request import Request, build_opener


class HttpMixin(object):
    def http_request(self, url, method, data="", headers=None, timeout=None):
        if url[0:7].lower() != "http://":
            url = "http://%s" % url

        if hasattr(self, "logger") and self.logger is not None:
            self.logger.debug("Sending http request. Url: %s, Data: %s, Headers: %s" % (url, str(data), str(headers)))

        req = Request(url, data, headers)
        req.get_method = lambda: method
        # The timeout parameter in urllib2.urlopen has strange behavior, and
        # seems to raise errors when set to a number. Using an opener works however.
        opener = build_opener()
        if timeout is None:
            response = opener.open(req)
        else:
            response = opener.open(req, timeout=timeout)

        return response


================================================
FILE: ducktape/utils/local_filesystem_utils.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import errno
import os


def mkdir_p(path: str) -> None:
    """mkdir -p functionality.
    :type path: str
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


================================================
FILE: ducktape/utils/persistence.py
================================================
# Copyright (c) 2009 Jason M Baker
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
Module contains persistent data structures from pysistence project, patched to work with
python 3.x.
"""


def not_implemented_method(*args, **kwargs):
    raise NotImplementedError("Cannot set values in a PDict")


class PDict(dict):
    """
    Persistent dict.
    """

    __setitem__ = not_implemented_method
    __delitem__ = not_implemented_method
    update = not_implemented_method
    clear = not_implemented_method
    pop = not_implemented_method
    popitem = not_implemented_method

    def _as_transient(self):
        return dict(self)

    def copy(self):
        return PDict(self)

    def without(self, *keys):
        new_dict = self._as_transient()
        for key in keys:
            del new_dict[key]
        return PDict(new_dict)

    def using(self, **kwargs):
        new_dict = self._as_transient()
        new_dict.update(kwargs)
        return PDict(new_dict)

    def __reduce__(self):
        # Pickling support in python 2.x and 3.x
        return make_dict, (self._as_transient(),)


make_dict = PDict


================================================
FILE: ducktape/utils/terminal_size.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import platform
import shlex
import struct
import subprocess

"""
Adapted from https://gist.github.com/jtriley/1108174
"""


def get_terminal_size():
    """getTerminalSize()
    - get width and height of console
    - works on linux,os x,windows,cygwin(windows)
    originally retrieved from:
    http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python
    """
    current_os = platform.system()
    tuple_xy = None
    if current_os == "Windows":
        tuple_xy = _get_terminal_size_windows()
        if tuple_xy is None:
            tuple_xy = _get_terminal_size_tput()
            # needed for window's python in cygwin's xterm!
    if current_os in ["Linux", "Darwin"] or current_os.startswith("CYGWIN"):
        tuple_xy = _get_terminal_size_linux()
    if tuple_xy is None:
        tuple_xy = (80, 25)  # default value
    return tuple_xy


def _get_terminal_size_windows():
    try:
        from ctypes import create_string_buffer, windll

        # stdin handle is -10
        # stdout handle is -11
        # stderr handle is -12
        h = windll.kernel32.GetStdHandle(-12)
        csbi = create_string_buffer(22)
        res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
        if res:
            (
                bufx,
                bufy,
                curx,
                cury,
                wattr,
                left,
                top,
                right,
                bottom,
                maxx,
                maxy,
            ) = struct.unpack("hhhhHhhhhhh", csbi.raw)
            sizex = right - left + 1
            sizey = bottom - top + 1
            return sizex, sizey
    except Exception:
        pass


def _get_terminal_size_tput():
    # get terminal width
    # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window
    try:
        cols = int(subprocess.check_call(shlex.split("tput cols")))
        rows = int(subprocess.check_call(shlex.split("tput lines")))
        return (cols, rows)
    except Exception:
        pass


def _get_terminal_size_linux():
    def ioctl_GWINSZ(fd):
        try:
            import fcntl
            import termios

            cr = struct.unpack("hh", fcntl.ioctl(fd, termios.TIOCGWINSZ, "1234"))
            return cr
        except Exception:
            pass

    cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
    if not cr:
        try:
            fd = os.open(os.ctermid(), os.O_RDONLY)
            cr = ioctl_GWINSZ(fd)
            os.close(fd)
        except Exception:
            pass
    if not cr:
        try:
            cr = (os.environ["LINES"], os.environ["COLUMNS"])
        except Exception:
            return None
    return int(cr[1]), int(cr[0])


================================================
FILE: ducktape/utils/util.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import time
from typing import Callable

from ducktape import __version__ as __ducktape_version__
from ducktape.errors import TimeoutError


def wait_until(condition, timeout_sec, backoff_sec=0.1, err_msg="", retry_on_exc=False):
    """Block until condition evaluates as true or timeout expires, whichever comes first.

    :param condition: callable that returns True if the condition is met, False otherwise
    :param timeout_sec: number of seconds to check the condition for before failing
    :param backoff_sec: number of seconds to back off between each failure to meet the condition before checking again
    :param err_msg: a string or callable returning a string that will be included as the exception message if the
                    condition is not met
    :param retry_on_exc: if True, will retry if condition raises an exception. If condition raised exception on last
                         iteration, that exception will be raised as a cause of TimeoutError.
                         If False and condition raises an exception, that exception will be forwarded to the caller
                         immediately.
                         Defaults to False (original ducktape behavior).
                         # TODO: [1.0.0] flip this to True
    :return: silently if condition becomes true within the timeout window, otherwise raise Exception with the given
    error message.
    """
    start = time.time()
    stop = start + timeout_sec
    last_exception = None
    while time.time() < stop:
        try:
            if condition():
                return
            else:
                # reset last_exception if last iteration didn't raise any exception, but simply returned False
                last_exception = None
        except BaseException as e:
            # save last raised exception for logging it later
            last_exception = e
            if not retry_on_exc:
                raise e
        finally:
            time.sleep(backoff_sec)

    # it is safe to call Exception from None - will be just treated as a normal exception
    raise TimeoutError(err_msg() if callable(err_msg) else err_msg) from last_exception


def package_is_installed(package_name):
    """Return true iff package can be successfully imported."""
    try:
        importlib.import_module(package_name)
        return True
    except Exception:
        return False


def ducktape_version():
    """Return string representation of current ducktape version."""
    return __ducktape_version__


def load_function(func_module_path) -> Callable:
    """Loads and returns a function from a module path seperated by '.'s"""
    module, function_name = func_module_path.rsplit(".", 1)
    try:
        func = getattr(importlib.import_module(module), function_name)
        if not callable(func):
            raise Exception("Function {} from module {} is not callable".format(function_name, module))
        return func
    except AttributeError:
        raise Exception(
            "Function could not be loaded from the module path {}, verify that it is '.' seperated".format(
                func_module_path
            )
        )


================================================
FILE: requirements-test.txt
================================================
pytest~=6.2.0
# 4.0 drops py27 support
mock==4.0.2
memory_profiler==0.57
statistics==1.0.3.5
requests-testadapter==0.3.0
pytest-cov~=3.0
pytest-xdist~=2.5
ruff==0.4.10


================================================
FILE: requirements.txt
================================================
jinja2~=3.1.6
boto3==1.33.13
# jinja2 pulls in MarkupSafe with a > constraint, but we need to constrain it for compatibility
MarkupSafe~=2.1.5
pyparsing==3.1.4
zipp==3.20.2
pywinrm==0.4.3
requests==2.32.4
paramiko~=3.4.0
pyzmq==26.4.0
pycryptodome==3.23.0
more-itertools==5.0.0
PyYAML==6.0.2
psutil==5.7.2


================================================
FILE: ruff.toml
================================================

extend-exclude = [
    "docs",
    ".virtualenvs"
]
line-length = 120

[lint]
select = [
    "E4",
    "E7",
    "E9",
    "F"
]
ignore = [
    "E111",
    "W292",
    "E226"
]


================================================
FILE: service.yml
================================================
name: ducktape
lang: python
lang_version: '3.13'
git:
  enable: true
sonarqube:
  enable: true
github:
  enable: true
  repo_name: confluentinc/ducktape
renovatebot:
  enable: true
  automerge:
    enable: false
semaphore:
  enable: true
  pipeline_enable: false
  branches:
  - master
  - 0.13.x
  - 0.12.x
  - 0.11.x
  - 0.10.x
  - 0.9.x
  - 0.8.x
  - 0.7.x


================================================
FILE: setup.cfg
================================================
# pytest configuration (can also be defined in in tox.ini or pytest.ini file)
#
# To ease possible confusion, prefix ducktape unit tests with 'check' instead of 'test', since
# many ducktape files, classes, and methods have 'test' somewhere in the name
[tool:pytest]
python_files=check_*.py
python_classes=Check
python_functions=check_*

# don't search inside any resources directory for unit tests
norecursedirs = resources


================================================
FILE: setup.py
================================================
from setuptools import find_packages, setup
from setuptools.command.test import test as TestCommand
import re
import sys

version = ""
with open("ducktape/__init__.py", "r") as fd:
    version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1)

if not version:
    raise RuntimeError("Cannot find version information")


class PyTest(TestCommand):
    user_options = [("pytest-args=", "a", "Arguments to pass to py.test")]

    def initialize_options(self):
        TestCommand.initialize_options(self)
        self.pytest_args = []

    def finalize_options(self):
        TestCommand.finalize_options(self)
        self.test_args = []
        self.test_suite = True

    def run_tests(self):
        # import here, cause outside the eggs aren't loaded
        import pytest

        errno = pytest.main(self.pytest_args)
        self.run_command("flake8")
        sys.exit(errno)


test_req = open("requirements-test.txt").read()


setup(
    name="ducktape",
    version=version,
    description="Distributed system test tools",
    author="Confluent",
    platforms=["any"],
    entry_points={
        "console_scripts": ["ducktape=ducktape.command_line.main:main"],
    },
    license="apache2.0",
    url="http://github.com/confluentinc/ducktape",
    packages=find_packages(),
    package_data={"ducktape": ["templates/report/*"]},
    python_requires=">= 3.6",
    install_requires=open("requirements.txt").read(),
    extras_require={"test": test_req},
    setup_requires=["ruff==0.4.10"],
    cmdclass={"test": PyTest},
)


================================================
FILE: systests/__init__.py
================================================


================================================
FILE: systests/cluster/__init__.py
================================================


================================================
FILE: systests/cluster/test_debug.py
================================================
"""
This module contains tests that are useful for developer debugging
and can contain sleep statements or test that intentionally fail or break things.
They're separate from test_remote_account.py for that reason.
"""

import time

from ducktape.mark import matrix, parametrize, ignore
from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from systests.cluster.test_remote_account import GenericService


class FailingTest(Test):
    """
    The purpose of this test is to validate reporters. Some of them are intended to fail.
    """

    def setup(self):
        self.service = GenericService(self.test_context, 1)

    @cluster(num_nodes=1)
    @matrix(
        string_param=["success-first", "fail-second", "fail-third"],
        int_param=[10, 20, -30],
    )
    def matrix_test(self, string_param, int_param):
        assert not string_param.startswith("fail") and int_param > 0

    @cluster(num_nodes=1)
    @parametrize(string_param="success-first", int_param=10)
    @parametrize(string_param="fail-second", int_param=-10)
    def parametrized_test(self, string_param, int_param):
        assert not string_param.startswith("fail") and int_param > 0

    @cluster(num_nodes=1)
    def failing_test(self):
        assert False

    @cluster(num_nodes=1)
    def successful_test(self):
        assert True


class DebugThisTest(Test):
    @cluster(num_nodes=1)
    def one_node_test_sleep_90s(self):
        self.service = GenericService(self.test_context, 1)
        self.logger.warning("one_node_test - Sleeping for 90s")
        time.sleep(90)
        assert True

    @cluster(num_nodes=1)
    def one_node_test_sleep_30s(self):
        self.service = GenericService(self.test_context, 1)
        self.logger.warning("another_one_node_test - Sleeping for 30s")
        time.sleep(30)
        assert True

    @cluster(num_nodes=1)
    def another_one_node_test_sleep_30s(self):
        self.service = GenericService(self.test_context, 1)
        self.logger.warning("yet_another_one_node_test - Sleeping for 30s")
        time.sleep(30)
        assert True

    @cluster(num_nodes=2)
    def two_node_test(self):
        self.service = GenericService(self.test_context, 2)
        assert True

    @cluster(num_nodes=2)
    def another_two_node_test(self):
        self.service = GenericService(self.test_context, 2)
        assert True

    @ignore
    @cluster(num_nodes=2)
    def a_two_node_ignored_test(self):
        assert False

    @cluster(num_nodes=2)
    def yet_another_two_node_test(self):
        self.service = GenericService(self.test_context, 2)
        assert True

    @cluster(num_nodes=3)
    def three_node_test(self):
        self.service = GenericService(self.test_context, 3)
        assert True

    @cluster(num_nodes=3)
    def three_node_test_sleeping_30s(self):
        self.service = GenericService(self.test_context, 3)
        self.logger.warning("Sleeping for 30s")
        time.sleep(30)
        assert True

    @cluster(num_nodes=3)
    def another_three_node_test(self):
        self.service = GenericService(self.test_context, 3)
        assert True

    @cluster(num_nodes=2)
    def bad_alloc_test(self):
        # @cluster annotation specifies 2 nodes, but we ask for 3, this will fail
        self.service = GenericService(self.test_context, 3)
        time.sleep(10)
        assert True


================================================
FILE: systests/cluster/test_no_cluster.py
================================================
from ducktape.mark.resource import cluster
from ducktape.tests.test import Test


class NoClusterTest(Test):
    """This test helps validate the behavior for no-cluster tests (ie 0 nodes)"""

    @cluster(num_nodes=0)
    def test_zero_nodes(self):
        self.logger.warn("Testing")
        assert True


================================================
FILE: systests/cluster/test_remote_account.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.consts import WINDOWS, LINUX
from ducktape.cluster.node_spec import NodeSpec
from ducktape.services.service import Service
from ducktape.tests.test import Test
from ducktape.errors import TimeoutError
from ducktape.mark.resource import cluster

import os
import pytest
import random
import shutil
from six import iteritems
import tempfile
from threading import Thread
import time
import logging

from ducktape.utils.util import wait_until


def generate_tempdir_name():
    """Use this ad-hoc function instead of the tempfile module since we're creating and removing
    this directory with ssh commands.
    """
    return "/tmp/" + "t" + str(int(time.time()))


class RemoteAccountTestService(Service):
    """Simple service that allocates one node for performing tests of RemoteAccount functionality"""

    def __init__(self, context):
        super(RemoteAccountTestService, self).__init__(context, num_nodes=1)
        self.temp_dir = generate_tempdir_name()
        self.logs = {
            "my_log": {"path": self.log_file, "collect_default": True},
            "non_existent_log": {
                "path": os.path.join(self.temp_dir, "absent.log"),
                "collect_default": True,
            },
        }

    @property
    def log_file(self):
        return os.path.join(self.temp_dir, "test.log")

    def start_node(self, node):
        node.account.ssh("mkdir -p " + self.temp_dir)
        node.account.ssh("touch " + self.log_file)

    def stop_node(self, node):
        pass

    def clean_node(self, node):
        node.account.ssh("rm -rf " + self.temp_dir)

    def write_to_log(self, msg):
        self.nodes[0].account.ssh("echo -e -n " + repr(msg) + " >> " + self.log_file)


class GenericService(Service):
    """Service which doesn't do anything - just a group of nodes, each of which has a scratch directory."""

    def __init__(self, context, num_nodes):
        super(GenericService, self).__init__(context, num_nodes)
        self.worker_scratch_dir = "scratch"
        for node in self.nodes:
            node.account.mkdirs(self.worker_scratch_dir)

    def stop_node(self, node):
        # noop
        pass

    def clean_node(self, node):
        node.account.remove(self.worker_scratch_dir, allow_fail=True)


class UnderUtilizedTest(Test):
    def setup(self):
        self.service = GenericService(self.test_context, 1)

    @cluster(num_nodes=3)
    def under_utilized_test(self):
        # setup() creates a service instance, which calls alloc() for one node
        assert self.test_context.cluster.max_used() == 1
        assert len(self.test_context.cluster.used()) == 1

        self.another_service = GenericService(self.test_context, 1)
        assert len(self.test_context.cluster.used()) == 2
        assert self.test_context.cluster.max_used() == 2

        self.service.stop()
        self.service.free()
        assert len(self.test_context.cluster.used()) == 1
        assert self.test_context.cluster.max_used() == 2


class FileSystemTest(Test):
    """
    Note that in an attempt to isolate the file system methods, validation should be done with ssh/shell commands.
    """

    def setup(self):
        self.service = GenericService(self.test_context, 1)
        self.node = self.service.nodes[0]
        self.scratch_dir = self.service.worker_scratch_dir

    @cluster(num_nodes=1)
    def create_file_test(self):
        expected_contents = "hello world"
        fname = "myfile.txt"
        fpath = "%s/%s" % (self.scratch_dir, fname)

        self.node.account.create_file(fpath, expected_contents)

        # validate existence and contents
        self.node.account.ssh("test -f %s" % fpath)
        contents = "\n".join([line for line in self.node.account.ssh_capture("cat %s" % fpath)])
        assert contents == expected_contents

        # TODO also check absolute path

    @cluster(num_nodes=1)
    def mkdir_test(self):
        dirname = "%s/mydir" % self.scratch_dir
        self.node.account.mkdir(dirname)

        # TODO - important!! check mode
        self.node.account.ssh("test -d %s" % dirname, allow_fail=False)

        # mkdir should not succeed if the base directories do not already exist
        dirname = "%s/a/b/c/d" % self.scratch_dir
        with pytest.raises(IOError):
            self.node.account.mkdir(dirname)

        # TODO also check absolute path

    @cluster(num_nodes=1)
    def mkdirs_nested_test(self):
        dirname = "%s/a/b/c/d" % self.scratch_dir

        # TODO important!! check mode
        self.node.account.mkdirs(dirname)
        self.node.account.ssh("test -d %s" % dirname, allow_fail=False)

        # TODO also check absolute path

    @cluster(num_nodes=1)
    def open_test(self):
        """Try opening, writing, reading a file."""
        fname = "%s/myfile.txt" % self.scratch_dir
        expected_contents = b"hello world\nhooray!"
        with self.node.account.open(fname, "w") as f:
            f.write(expected_contents)

        with self.node.account.open(fname, "r") as f:
            contents = f.read()
        assert contents == expected_contents

        # Now try opening in append mode
        append = b"hithere"
        expected_contents = expected_contents + append
        with self.node.account.open(fname, "a") as f:
            f.write(append)

        with self.node.account.open(fname, "r") as f:
            contents = f.read()

        assert contents == expected_contents

    @cluster(num_nodes=1)
    def exists_file_test(self):
        """
        Create various kinds of files and symlinks, verifying that exists works as expected.

        Note that because
        """

        # create file, test existence with relative and absolute path
        self.node.account.ssh("touch %s/hi" % self.scratch_dir)
        assert self.node.account.exists("%s/hi" % self.scratch_dir)
        # TODO abspath

        # create symlink, test existence with relative and absolute path
        self.node.account.ssh("ln -s %s/hi %s/hi-link" % (self.scratch_dir, self.scratch_dir))
        assert self.node.account.exists("%s/hi-link" % self.scratch_dir)
        # TODO abspath

    def exists_dir_test(self):
        # check bad path doesn't exist
        assert not self.node.account.exists("a/b/c/d")

        # create dir, test existence with relative and absolute path
        dpath = "%s/mydir" % self.scratch_dir
        self.node.account.ssh("mkdir %s" % dpath)
        assert self.node.account.exists(dpath)
        # TODO abspath

        # create symlink, test existence with relative and absolute path
        self.node.account.ssh("ln -s %s %s/mydir-link" % (dpath, self.scratch_dir))
        assert self.node.account.exists("%s/mydir-link" % self.scratch_dir)
        # # TODO abspath

    def remove_test(self):
        """Test functionality of remove method"""
        # remove a non-empty directory
        dpath = "%s/mydir" % self.scratch_dir
        self.node.account.ssh("mkdir %s" % dpath)
        self.node.account.ssh("touch %s/hi.txt" % dpath)
        self.node.account.ssh("test -d %s" % dpath)
        self.node.account.remove(dpath)
        self.node.account.ssh("test ! -d %s" % dpath)

        # remove a file
        fpath = "%s/hello.txt" % self.scratch_dir
        self.node.account.ssh("echo 'hello world' > %s" % fpath)
        self.node.account.remove(fpath)

        # remove non-existent path
        with pytest.raises(RuntimeError):
            self.node.account.remove("a/b/c/d")

        # remove non-existent path with allow_fail = True should be ok
        self.node.account.remove("a/b/c/d", allow_fail=True)


# Representation of a somewhat arbitrary directory structure for testing copy functionality
# A key which has a string as its value represents a file
# A key which has a dict as its value represents a subdirectory
DIR_STRUCTURE = {
    "d00": {
        "another_file": b"1\n2\n3\n4\ncats and dogs",
        "d10": {"fasdf": b"lasdf;asfd\nahoppoqnbasnb"},
        "d11": {"f65": b"afasdfsafdsadf"},
    },
    "a_file": b"hello world!",
}


def make_dir_structure(base_dir, dir_structure, node=None):
    """Make a file tree starting at base_dir with structure specified by dir_structure.

    if node is None, make the structure locally, else make it on the given node
    """
    for k, v in iteritems(dir_structure):
        if isinstance(v, dict):
            # it's a subdirectory
            subdir_name = k
            subdir_path = os.path.join(base_dir, subdir_name)
            subdir_structure = v

            if node:
                node.account.mkdir(subdir_path)
            else:
                os.mkdir(subdir_path)

            make_dir_structure(subdir_path, subdir_structure, node)
        else:
            # it's a file
            file_name = k
            file_path = os.path.join(base_dir, file_name)
            file_contents = v

            if node:
                with node.account.open(file_path, "wb") as f:
                    f.write(file_contents)
            else:
                with open(file_path, "wb") as f:
                    f.write(file_contents)


def verify_dir_structure(base_dir, dir_structure, node=None):
    """Verify locally or on the given node whether the file subtree at base_dir matches dir_structure."""
    for k, v in iteritems(dir_structure):
        if isinstance(v, dict):
            # it's a subdirectory
            subdir_name = k
            subdir_path = os.path.join(base_dir, subdir_name)
            subdir_structure = v

            if node:
                assert node.account.isdir(subdir_path)
            else:
                assert os.path.isdir(subdir_path)

            verify_dir_structure(subdir_path, subdir_structure, node)
        else:
            # it's a file
            file_name = k
            file_path = os.path.join(base_dir, file_name)
            expected_file_contents = v

            if node:
                with node.account.open(file_path, "r") as f:
                    contents = f.read()
            else:
                with open(file_path, "rb") as f:
                    contents = f.read()
            assert expected_file_contents == contents, contents


class CopyToAndFroTest(Test):
    """These tests check copy_to, and copy_from functionality."""

    def setup(self):
        self.service = GenericService(self.test_context, 1)
        self.node = self.service.nodes[0]
        self.remote_scratch_dir = self.service.worker_scratch_dir

        self.local_temp_dir = tempfile.mkdtemp()

        self.logger.info("local_temp_dir: %s" % self.local_temp_dir)
        self.logger.info("node: %s" % str(self.node.account))

    @cluster(num_nodes=1)
    def test_copy_to_dir_with_rename(self):
        # make dir structure locally
        make_dir_structure(self.local_temp_dir, DIR_STRUCTURE)
        dest = os.path.join(self.remote_scratch_dir, "renamed")
        self.node.account.copy_to(self.local_temp_dir, dest)

        # now validate the directory structure on the remote machine
        verify_dir_structure(dest, DIR_STRUCTURE, node=self.node)

    @cluster(num_nodes=1)
    def test_copy_to_dir_as_subtree(self):
        # copy directory "into" a directory; this should preserve the original directoryname
        make_dir_structure(self.local_temp_dir, DIR_STRUCTURE)
        self.node.account.copy_to(self.local_temp_dir, self.remote_scratch_dir)
        local_temp_dir_name = self.local_temp_dir
        if local_temp_dir_name.endswith(os.path.sep):
            local_temp_dir_name = local_temp_dir_name[: -len(os.path.sep)]

        verify_dir_structure(os.path.join(self.remote_scratch_dir, local_temp_dir_name), DIR_STRUCTURE)

    @cluster(num_nodes=1)
    def test_copy_from_dir_with_rename(self):
        # make dir structure remotely
        make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.node)
        dest = os.path.join(self.local_temp_dir, "renamed")
        self.node.account.copy_from(self.remote_scratch_dir, dest)

        # now validate the directory structure locally
        verify_dir_structure(dest, DIR_STRUCTURE)

    @cluster(num_nodes=1)
    def test_copy_from_dir_as_subtree(self):
        # copy directory "into" a directory; this should preserve the original directoryname
        make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.node)
        self.node.account.copy_from(self.remote_scratch_dir, self.local_temp_dir)

        verify_dir_structure(os.path.join(self.local_temp_dir, "scratch"), DIR_STRUCTURE)

    def teardown(self):
        # allow_fail in case scratch dir was not successfully created
        if os.path.exists(self.local_temp_dir):
            shutil.rmtree(self.local_temp_dir)


class CopyDirectTest(Test):
    def setup(self):
        self.service = GenericService(self.test_context, 2)
        self.src_node, self.dest_node = self.service.nodes
        self.remote_scratch_dir = self.service.worker_scratch_dir

        self.logger.info("src_node: %s" % str(self.src_node.account))
        self.logger.info("dest_node: %s" % str(self.dest_node.account))

    @cluster(num_nodes=2)
    def test_copy_file(self):
        """Verify that a file can be correctly copied directly between nodes.

        This should work with or without the recursive flag.
        """
        file_path = os.path.join(self.remote_scratch_dir, "myfile.txt")
        expected_contents = b"123"
        self.src_node.account.create_file(file_path, expected_contents)

        self.src_node.account.copy_between(file_path, file_path, self.dest_node)

        assert self.dest_node.account.isfile(file_path)
        with self.dest_node.account.open(file_path, "r") as f:
            contents = f.read()
            assert expected_contents == contents

    @cluster(num_nodes=2)
    def test_copy_directory(self):
        """Verify that a directory can be correctly copied directly between nodes."""

        make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.src_node)
        self.src_node.account.copy_between(self.remote_scratch_dir, self.remote_scratch_dir, self.dest_node)
        verify_dir_structure(
            os.path.join(self.remote_scratch_dir, "scratch"),
            DIR_STRUCTURE,
            node=self.dest_node,
        )


class TestClusterSpec(Test):
    @cluster(cluster_spec=ClusterSpec.simple_linux(2))
    def test_create_two_node_service(self):
        self.service = GenericService(self.test_context, 2)
        for node in self.service.nodes:
            node.account.ssh("echo hi")

    @cluster(
        cluster_spec=ClusterSpec.from_nodes(
            [
                NodeSpec(operating_system=WINDOWS),
                NodeSpec(operating_system=LINUX),
                NodeSpec(),  # this one is also linux
            ]
        )
    )
    def three_nodes_test(self):
        self.service = GenericService(self.test_context, 3)
        for node in self.service.nodes:
            node.account.ssh("echo hi")


class RemoteAccountTest(Test):
    def __init__(self, test_context):
        super(RemoteAccountTest, self).__init__(test_context)
        self.account_service = RemoteAccountTestService(test_context)

    def setup(self):
        self.account_service.start()

    @cluster(num_nodes=1)
    def test_flaky(self):
        choices = [
            Exception("FLAKE 1"),
            Exception("FLAKE 1"),
            Exception("FLAKE 2"),
            Exception("FLAKE 2"),
            Exception("FLAKE 3"),
            None,
        ]
        exp = random.choice(choices)
        if exp:
            raise exp

    @cluster(num_nodes=1)
    def test_ssh_capture_combine_stderr(self):
        """Test that ssh_capture correctly captures stderr and stdout from remote process."""
        node = self.account_service.nodes[0]

        # swap stdout and stderr in the echo process
        cmd = "for i in $(seq 1 5); do echo $i 3>&1 1>&2 2>&3; done"

        ssh_output = node.account.ssh_capture(cmd, combine_stderr=True)
        bad_ssh_output = node.account.ssh_capture(cmd, combine_stderr=False)  # Same command, but don't capture stderr

        lines = [int(line.strip()) for line in ssh_output]
        assert lines == [i for i in range(1, 6)]
        bad_lines = [int(line.strip()) for line in bad_ssh_output]
        assert bad_lines == []

    @cluster(num_nodes=1)
    def test_ssh_output_combine_stderr(self):
        """Test that ssh_output correctly captures stderr and stdout from remote process."""
        node = self.account_service.nodes[0]

        # swap stdout and stderr in the echo process
        cmd = "for i in $(seq 1 5); do echo $i 3>&1 1>&2 2>&3; done"

        ssh_output = node.account.ssh_output(cmd, combine_stderr=True)
        bad_ssh_output = node.account.ssh_output(cmd, combine_stderr=False)  # Same command, but don't capture stderr

        assert ssh_output == b"\n".join([str(i).encode("utf-8") for i in range(1, 6)]) + b"\n", ssh_output
        assert bad_ssh_output == b"", bad_ssh_output

    @cluster(num_nodes=1)
    def test_ssh_capture(self):
        """Test that ssh_capture correctly captures output from ssh subprocess."""
        node = self.account_service.nodes[0]
        cmd = "for i in $(seq 1 5); do echo $i; done"
        ssh_output = node.account.ssh_capture(cmd, combine_stderr=False)

        lines = [int(line.strip()) for line in ssh_output]
        assert lines == [i for i in range(1, 6)]

    @cluster(num_nodes=1)
    def test_ssh_output(self):
        """Test that ssh_output correctly captures output from ssh subprocess."""
        node = self.account_service.nodes[0]
        cmd = "for i in $(seq 1 5); do echo $i; done"
        ssh_output = node.account.ssh_output(cmd, combine_stderr=False)

        assert ssh_output == b"\n".join([str(i).encode("utf-8") for i in range(1, 6)]) + b"\n", ssh_output

    @cluster(num_nodes=1)
    def test_monitor_log(self):
        """Tests log monitoring by writing to a log in the background thread"""

        node = self.account_service.nodes[0]

        # Make sure we start the log with some data, including the value we're going to grep for
        self.account_service.write_to_log("foo\nbar\nbaz")

        # Background thread that simulates a process writing to the log
        self.wrote_log_line = False

        def background_logging_thread():
            # This needs to be large enough that we can verify we've actually
            # waited some time for the data to be written, but not too long that
            # the test takes a long time
            time.sleep(3)
            self.wrote_log_line = True
            self.account_service.write_to_log("foo\nbar\nbaz")

        with node.account.monitor_log(self.account_service.log_file) as monitor:
            logging_thread = Thread(target=background_logging_thread)
            logging_thread.start()
            monitor.wait_until("foo", timeout_sec=10, err_msg="Never saw expected log")
            assert self.wrote_log_line

        logging_thread.join(5.0)
        if logging_thread.is_alive():
            raise Exception("Timed out waiting for background thread.")

    @cluster(num_nodes=1)
    def test_monitor_log_exception(self):
        """Tests log monitoring correctly throws an exception when the regex was not found"""

        node = self.account_service.nodes[0]

        # Make sure we start the log with some data, including the value we're going to grep for
        self.account_service.write_to_log("foo\nbar\nbaz")

        timeout = 3
        try:
            with node.account.monitor_log(self.account_service.log_file) as monitor:
                start = time.time()
                monitor.wait_until("foo", timeout_sec=timeout, err_msg="Never saw expected log")
                assert False, "Log monitoring should have timed out and thrown an exception"
        except TimeoutError:
            # expected
            end = time.time()
            assert end - start > timeout, "Should have waited full timeout period while monitoring the log"

    @cluster(num_nodes=1)
    def test_kill_process(self):
        """Tests that kill_process correctly works"""
        grep_str = '"nc -l -p 5000"'

        def get_pids():
            pid_cmd = f"ps ax | grep -i {grep_str} | grep -v grep | awk '{{print $1}}'"

            return list(node.account.ssh_capture(pid_cmd, callback=int))

        node = self.account_service.nodes[0]

        # Run TCP service using netcat
        node.account.ssh_capture("nohup nc -l -p 5000 > /dev/null 2>&1 &")

        wait_until(
            lambda: len(get_pids()) > 0,
            timeout_sec=10,
            err_msg="Failed to start process within %d sec" % 10,
        )

        # Kill service.
        node.account.kill_process(grep_str)

        wait_until(
            lambda: len(get_pids()) == 0,
            timeout_sec=10,
            err_msg="Failed to kill process within %d sec" % 10,
        )


class TestIterWrapper(Test):
    def setup(self):
        self.line_num = 6
        self.eps = 0.01

        self.service = GenericService(self.test_context, num_nodes=1)
        self.node = self.service.nodes[0]

        self.temp_file = "ducktape-test-" + str(random.randint(0, 100000))
        contents = ""
        for i in range(self.line_num):
            contents += "%d\n" % i

        self.node.account.create_file(self.temp_file, contents)

    def test_iter_wrapper(self):
        """Test has_next functionality on the returned iterable item."""
        output = self.node.account.ssh_capture("cat " + self.temp_file)
        for i in range(self.line_num):
            assert output.has_next()  # with timeout in case of hang
            assert output.next().strip() == str(i)
        start = time.time()
        assert output.has_next() is False
        stop = time.time()
        assert stop - start < self.eps, "has_next() should return immediately"

    def test_iter_wrapper_timeout(self):
        """Test has_next with timeout"""
        output = self.node.account.ssh_capture("tail -F " + self.temp_file)
        # allow command to be executed before we check output with timeout_sec = 0
        time.sleep(0.5)
        for i in range(self.line_num):
            assert output.has_next(timeout_sec=0)
            assert output.next().strip() == str(i)

        timeout = 0.25
        start = time.time()
        # This check will last for the duration of the timeout because the the remote tail -F process
        # remains running, and the output stream is not closed.
        assert output.has_next(timeout_sec=timeout) is False
        stop = time.time()
        assert (stop - start >= timeout) and (stop - start) < timeout + self.eps, (
            "has_next() should return right after %s second" % str(timeout)
        )

    def teardown(self):
        # tail -F call above will leave stray processes, so clean up
        cmd = "for p in $(ps ax | grep -v grep | grep \"%s\" | awk '{print $1}'); do kill $p; done" % self.temp_file
        self.node.account.ssh(cmd, allow_fail=True)

        self.node.account.ssh("rm -f " + self.temp_file, allow_fail=True)


class RemoteAccountCompressedTest(Test):
    def __init__(self, test_context):
        super(RemoteAccountCompressedTest, self).__init__(test_context)
        self.account_service = RemoteAccountTestService(test_context)
        self.test_context.session_context.compress = True
        self.tar_msg = False
        self.tar_error = False

    def setup(self):
        self.account_service.start()

    @cluster(num_nodes=1)
    def test_log_compression_with_non_existent_files(self):
        """Test that log compression with tar works even when a specific log file has not been generated
        (e.g. heap dump)
        """
        self.test_context.logger.addFilter(CompressionErrorFilter(self))
        self.copy_service_logs(None)

        if not self.tar_msg:
            raise Exception("Never saw attempt to compress log")
        if self.tar_error:
            raise Exception("Failure when compressing logs")


class CompressionErrorFilter(logging.Filter):
    def __init__(self, test):
        super(CompressionErrorFilter, self).__init__()
        self.test = test

    def filter(self, record):
        if "tar czf" in record.msg:
            self.test.tar_msg = True
            if "Error" in record.msg:
                self.test.tar_error = True
        return True


================================================
FILE: systests/cluster/test_runner_operations.py
================================================
# Copyright 2022 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.service import Service
from ducktape.tests.test import Test
from ducktape.mark.resource import cluster
import time


class SimpleEchoService(Service):
    """Simple service that allocates one node for performing tests of RemoteAccount functionality"""

    logs = {
        "my_log": {"path": "/tmp/log", "collect_default": True},
    }

    def __init__(self, context):
        super(SimpleEchoService, self).__init__(context, num_nodes=1)
        self.count = 0

    def echo(self):
        self.nodes[0].account.ssh("echo {} >> /tmp/log".format(self.count))
        self.count += 1


class SimpleRunnerTest(Test):
    def setup(self):
        self.service = SimpleEchoService(self.test_context)

    @cluster(num_nodes=1)
    def timeout_test(self):
        """
        a simple longer running test to test special run flags agaisnt.
        """
        self.service.start()

        while self.service.count < 500:
            self.service.echo()
            time.sleep(0.1)

    @cluster(num_nodes=1)
    def quick1_test(self):
        """
        a simple quick test to test basic execution.
        """
        self.service.start()

        while self.service.count < 20:
            self.service.echo()
            time.sleep(0.2)

    @cluster(num_nodes=1)
    def quick2_test(self):
        """
        a simple quick test to test basic execution.
        """
        self.service.start()

        while self.service.count < 20:
            self.service.echo()
            time.sleep(0.2)


================================================
FILE: tests/__init__.py
================================================
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext

__all__ = ["Test", "TestContext"]


================================================
FILE: tests/cluster/__init__.py
================================================


================================================
FILE: tests/cluster/check_cluster.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

from ducktape.cluster.cluster_node import ClusterNode
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.node_spec import NodeSpec
from ducktape.cluster.consts import LINUX, WINDOWS
from tests.ducktape_mock import FakeCluster

FakeRemoteAccount = collections.namedtuple("FakeRemoteAccount", ["operating_system"])


class CheckCluster(object):
    def setup_method(self, _):
        self.cluster = FakeCluster(0)
        self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=LINUX)))
        self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=LINUX)))
        self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS)))
        self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS)))
        self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS)))

    def spec(self, linux_nodes, windows_nodes):
        nodes = []
        for i in range(linux_nodes):
            nodes.append(NodeSpec(LINUX))
        for i in range(windows_nodes):
            nodes.append(NodeSpec(WINDOWS))
        return ClusterSpec(nodes)

    def check_enough_capacity(self):
        assert self.cluster.available().nodes.can_remove_spec(self.spec(2, 2))
        assert self.cluster.available().nodes.can_remove_spec(self.spec(2, 3))

    def check_not_enough_capacity(self):
        assert not self.cluster.available().nodes.can_remove_spec(self.spec(5, 2))
        assert not self.cluster.available().nodes.can_remove_spec(self.spec(5, 5))
        assert not self.cluster.available().nodes.can_remove_spec(self.spec(3, 3))


================================================
FILE: tests/cluster/check_cluster_spec.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.node_spec import NodeSpec


class CheckClusterSpec(object):
    def check_cluster_spec_sizes(self):
        simple_linux_2 = ClusterSpec.simple_linux(2)
        assert 2 == len(simple_linux_2)
        assert 0 == len(ClusterSpec.empty())

    def check_to_string(self):
        empty = ClusterSpec.empty()
        assert "[]" == str(empty)
        simple_linux_5 = ClusterSpec.simple_linux(5)
        assert '[{"num_nodes": 5, "os": "linux"}]' == str(simple_linux_5)

    def check_simple_linux_with_node_type(self):
        """Test simple_linux with node_type parameter."""
        spec = ClusterSpec.simple_linux(3, node_type="large")
        assert len(spec) == 3
        for node_spec in spec:
            assert node_spec.operating_system == "linux"
            assert node_spec.node_type == "large"

    def check_simple_linux_without_node_type(self):
        """Test simple_linux without node_type (backward compatibility)."""
        spec = ClusterSpec.simple_linux(2)
        assert len(spec) == 2
        for node_spec in spec:
            assert node_spec.operating_system == "linux"
            assert node_spec.node_type is None

    def check_grouped_by_os_and_type_empty(self):
        """Test grouped_by_os_and_type on empty ClusterSpec via NodeContainer."""
        spec = ClusterSpec.empty()
        grouped = spec.nodes.grouped_by_os_and_type()
        assert grouped == {}

    def check_grouped_by_os_and_type_single_type(self):
        """Test grouped_by_os_and_type with single node type via NodeContainer."""
        spec = ClusterSpec.simple_linux(3, node_type="small")
        grouped = spec.nodes.grouped_by_os_and_type()
        assert grouped == {("linux", "small"): 3}

    def check_grouped_by_os_and_type_mixed(self):
        """Test grouped_by_os_and_type with mixed node types via NodeContainer."""
        spec = ClusterSpec(
            nodes=[
                NodeSpec("linux", node_type="small"),
                NodeSpec("linux", node_type="small"),
                NodeSpec("linux", node_type="large"),
                NodeSpec("linux", node_type=None),
                NodeSpec("windows", node_type="medium"),
            ]
        )
        grouped = spec.nodes.grouped_by_os_and_type()
        assert grouped == {
            ("linux", "small"): 2,
            ("linux", "large"): 1,
            ("linux", None): 1,
            ("windows", "medium"): 1,
        }


================================================
FILE: tests/cluster/check_finite_subcluster.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.consts import LINUX
from ducktape.cluster.finite_subcluster import FiniteSubcluster
from ducktape.cluster.node_container import (
    InsufficientResourcesError,
    NodeNotPresentError,
)
from ducktape.services.service import Service
import pickle
import pytest


class MockFiniteSubclusterNode:
    @property
    def operating_system(self):
        return LINUX


class CheckFiniteSubcluster(object):
    single_node_cluster_json = {"nodes": [{"hostname": "localhost"}]}

    def check_cluster_size(self):
        cluster = FiniteSubcluster([])
        assert len(cluster) == 0

        n = 10
        cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)])
        assert len(cluster) == n

    def check_pickleable(self):
        cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(10)])
        pickle.dumps(cluster)

    def check_allocate_free(self):
        n = 10
        cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)])
        assert len(cluster) == n
        assert cluster.num_available_nodes() == n

        nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=1))
        assert len(nodes) == 1
        assert len(cluster) == n
        assert cluster.num_available_nodes() == n - 1

        nodes2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2))
        assert len(nodes2) == 2
        assert len(cluster) == n
        assert cluster.num_available_nodes() == n - 3

        cluster.free(nodes)
        assert cluster.num_available_nodes() == n - 2

        cluster.free(nodes2)
        assert cluster.num_available_nodes() == n

    def check_alloc_too_many(self):
        n = 10
        cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)])
        with pytest.raises(InsufficientResourcesError):
            cluster.alloc(Service.setup_cluster_spec(num_nodes=(n + 1)))

    def check_free_too_many(self):
        n = 10
        cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)])
        nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=n))
        with pytest.raises(NodeNotPresentError):
            nodes.append(MockFiniteSubclusterNode())
            cluster.free(nodes)


================================================
FILE: tests/cluster/check_json.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.cluster.json import JsonCluster
from ducktape.cluster.node_container import InsufficientResourcesError
from ducktape.services.service import Service
import pickle
import pytest

from tests.runner.fake_remote_account import create_fake_remote_account


def create_json_cluster(*args, **kwargs):
    return JsonCluster(*args, make_remote_account_func=create_fake_remote_account, **kwargs)


class CheckJsonCluster(object):
    single_node_cluster_json = {"nodes": [{"ssh_config": {"host": "localhost"}}]}

    def check_invalid_json(self):
        # Missing list of nodes
        with pytest.raises(ValueError):
            create_json_cluster({})

        # Missing hostname, which is required
        with pytest.raises(ValueError):
            create_json_cluster({"nodes": [{}]})

    @staticmethod
    def cluster_hostnames(nodes):
        return set([node.account.hostname for node in nodes])

    def check_cluster_size(self):
        cluster = create_json_cluster({"nodes": []})
        assert len(cluster) == 0

        n = 10
        cluster = create_json_cluster({"nodes": [{"ssh_config": {"hostname": "localhost%d" % x}} for x in range(n)]})

        assert len(cluster) == n

    def check_pickleable(self):
        cluster = create_json_cluster(
            {
                "nodes": [
                    {"ssh_config": {"host": "localhost1"}},
                    {"ssh_config": {"host": "localhost2"}},
                    {"ssh_config": {"host": "localhost3"}},
                ]
            }
        )

        pickle.dumps(cluster)

    def check_allocate_free(self):
        cluster = create_json_cluster(
            {
                "nodes": [
                    {"ssh_config": {"host": "localhost1"}},
                    {"ssh_config": {"host": "localhost2"}},
                    {"ssh_config": {"host": "localhost3"}},
                ]
            }
        )

        assert len(cluster) == 3
        assert cluster.num_available_nodes() == 3

        nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=1))
        nodes_hostnames = self.cluster_hostnames(nodes)
        assert len(cluster) == 3
        assert cluster.num_available_nodes() == 2

        nodes2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2))
        nodes2_hostnames = self.cluster_hostnames(nodes2)
        assert len(cluster) == 3
        assert cluster.num_available_nodes() == 0

        assert nodes_hostnames.isdisjoint(nodes2_hostnames)

        cluster.free(nodes)
        assert cluster.num_available_nodes() == 1

        cluster.free(nodes2)
        assert cluster.num_available_nodes() == 3

    def check_parsing(self):
        """Checks that RemoteAccounts are generated correctly from input JSON"""

        node = create_json_cluster({"nodes": [{"ssh_config": {"host": "hostname"}}]}).alloc(
            Service.setup_cluster_spec(num_nodes=1)
        )[0]

        assert node.account.hostname == "hostname"
        assert node.account.user is None

        ssh_config = {
            "host": "hostname",
            "user": "user",
            "hostname": "localhost",
            "port": 22,
        }
        node = create_json_cluster(
            {"nodes": [{"hostname": "hostname", "user": "user", "ssh_config": ssh_config}]}
        ).alloc(Service.setup_cluster_spec(num_nodes=1))[0]

        assert node.account.hostname == "hostname"
        assert node.account.user == "user"

        # check ssh configs
        assert node.account.ssh_config.host == "hostname"
        assert node.account.ssh_config.user == "user"
        assert node.account.ssh_config.hostname == "localhost"
        assert node.account.ssh_config.port == 22

    def check_exhausts_supply(self):
        cluster = create_json_cluster(self.single_node_cluster_json)
        with pytest.raises(InsufficientResourcesError):
            cluster.alloc(Service.setup_cluster_spec(num_nodes=2))

    def check_node_names(self):
        cluster = create_json_cluster(
            {
                "nodes": [
                    {"ssh_config": {"host": "localhost1"}},
                    {"ssh_config": {"host": "localhost2"}},
                    {"ssh_config": {"host": "localhost3"}},
                ]
            }
        )
        hosts = set(["localhost1", "localhost2", "localhost3"])
        nodes = cluster.alloc(cluster.available())
        assert hosts == set(node.name for node in nodes)


================================================
FILE: tests/cluster/check_localhost.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.localhost import LocalhostCluster
from ducktape.services.service import Service

import pickle


class CheckLocalhostCluster(object):
    def setup_method(self, _):
        self.cluster = LocalhostCluster()

    def check_size(self):
        len(self.cluster) >= 2**31 - 1

    def check_pickleable(self):
        cluster = LocalhostCluster()
        pickle.dumps(cluster)

    def check_request_free(self):
        available = self.cluster.num_available_nodes()
        initial_size = len(self.cluster)

        # Should be able to allocate arbitrarily many nodes
        nodes = self.cluster.alloc(Service.setup_cluster_spec(num_nodes=100))
        assert len(nodes) == 100
        for i, node in enumerate(nodes):
            assert node.account.hostname == "localhost%d" % i
            assert node.account.ssh_hostname == "localhost"
            assert node.account.ssh_config.hostname == "localhost"
            assert node.account.ssh_config.port == 22
            assert node.account.user is None

        assert self.cluster.num_available_nodes() == (available - 100)
        assert len(self.cluster) == initial_size  # This shouldn't change

        self.cluster.free(nodes)

        assert self.cluster.num_available_nodes() == available


================================================
FILE: tests/cluster/check_node_container.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.cluster_node import ClusterNode
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.consts import WINDOWS, LINUX
from ducktape.cluster.node_spec import NodeSpec
from ducktape.cluster.node_container import (
    NodeContainer,
    NodeNotPresentError,
    InsufficientResourcesError,
    InsufficientHealthyNodesError,
)
import pytest

from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig
from tests.ducktape_mock import MockAccount
from tests.runner.fake_remote_account import FakeRemoteAccount, FakeWindowsRemoteAccount


def fake_account(host, is_available=True, node_type=None):
    return FakeRemoteAccount(
        ssh_config=RemoteAccountSSHConfig(host=host), is_available=is_available, node_type=node_type
    )


def fake_win_account(host, is_available=True, node_type=None):
    return FakeWindowsRemoteAccount(
        ssh_config=RemoteAccountSSHConfig(host=host), is_available=is_available, node_type=node_type
    )


def count_nodes_by_os(container, target_os):
    """Helper to count nodes by OS from node_groups."""
    count = 0
    for (os, _), nodes in container.node_groups.items():
        if os == target_os:
            count += len(nodes)
    return count


class CheckNodeContainer(object):
    def check_sizes(self):
        empty = NodeContainer()
        assert 0 == empty.size()
        assert 0 == len(empty)
        nodes = [ClusterNode(MockAccount())]
        container = NodeContainer(nodes)
        assert 1 == container.size()
        assert 1 == len(container)

    def check_add_and_remove(self):
        nodes = [
            ClusterNode(MockAccount()),
            ClusterNode(MockAccount()),
            ClusterNode(MockAccount()),
            ClusterNode(MockAccount()),
            ClusterNode(MockAccount()),
        ]
        container = NodeContainer([])
        assert 0 == len(container)
        container.add_node(nodes[0])
        container.add_node(nodes[1])
        container.add_node(nodes[2])
        container2 = container.clone()
        i = 0
        for node in container:
            assert nodes[i] == node
            i += 1
        assert 3 == len(container)
        container.remove_node(nodes[0])
        with pytest.raises(NodeNotPresentError):
            container.remove_node(nodes[0])
        assert 2 == len(container)
        assert 3 == len(container2)

    def check_remove_single_node_spec(self):
        """Check remove_spec() method - verify a simple happy path of removing a single node"""
        accounts = [
            fake_account("host1"),
            fake_account("host2"),
            fake_win_account("w1"),
            fake_win_account("w2"),
        ]
        container = NodeContainer(accounts)
        one_linux_node_spec = ClusterSpec(nodes=[NodeSpec(LINUX)])
        one_windows_node_spec = ClusterSpec(nodes=[NodeSpec(WINDOWS)])

        def _remove_single_node(one_node_spec, os):
            assert container.can_remove_spec(one_node_spec)
            good_nodes, bad_nodes = container.remove_spec(one_node_spec)
            assert good_nodes and len(good_nodes) == 1
            assert good_nodes[0].os == os
            assert not bad_nodes

        _remove_single_node(one_windows_node_spec, WINDOWS)
        assert count_nodes_by_os(container, LINUX) == 2
        assert count_nodes_by_os(container, WINDOWS) == 1

        _remove_single_node(one_windows_node_spec, WINDOWS)
        assert count_nodes_by_os(container, LINUX) == 2
        assert count_nodes_by_os(container, WINDOWS) == 0
        assert not container.can_remove_spec(one_windows_node_spec)
        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(one_windows_node_spec)

        _remove_single_node(one_linux_node_spec, LINUX)
        assert count_nodes_by_os(container, LINUX) == 1
        assert count_nodes_by_os(container, WINDOWS) == 0

        _remove_single_node(one_linux_node_spec, LINUX)
        assert count_nodes_by_os(container, LINUX) == 0
        assert count_nodes_by_os(container, WINDOWS) == 0
        assert not container.can_remove_spec(one_linux_node_spec)
        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(one_linux_node_spec)

    @pytest.mark.parametrize(
        "cluster_spec",
        [
            pytest.param(
                ClusterSpec(nodes=[NodeSpec(LINUX), NodeSpec(WINDOWS), NodeSpec(WINDOWS)]),
                id="not enough windows nodes",
            ),
            pytest.param(
                ClusterSpec(nodes=[NodeSpec(LINUX), NodeSpec(LINUX), NodeSpec(WINDOWS)]),
                id="not enough linux nodes",
            ),
            pytest.param(
                ClusterSpec(
                    nodes=[
                        NodeSpec(LINUX),
                        NodeSpec(LINUX),
                        NodeSpec(WINDOWS),
                        NodeSpec(WINDOWS),
                    ]
                ),
                id="not enough nodes",
            ),
        ],
    )
    def check_not_enough_nodes_to_remove(self, cluster_spec):
        """
        Check what happens if there aren't enough resources in this container to match a given spec.
        Various parametrizations check the behavior for when there are enough nodes for one OS but not another,
        or for both.
        """
        accounts = [fake_account("host1"), fake_win_account("w1")]
        container = NodeContainer(accounts)
        original_container = container.clone()

        assert not container.can_remove_spec(cluster_spec)
        assert len(container.attempt_remove_spec(cluster_spec)) > 0

        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(cluster_spec)

        # check that container was not modified
        assert container.node_groups == original_container.node_groups

    @pytest.mark.parametrize(
        "accounts",
        [
            pytest.param(
                [
                    fake_account("host1"),
                    fake_account("host2"),
                    fake_win_account("w1"),
                    fake_win_account("w2", is_available=False),
                ],
                id="windows not available",
            ),
            pytest.param(
                [
                    fake_account("host1"),
                    fake_account("host2", is_available=False),
                    fake_win_account("w1"),
                    fake_win_account("w2"),
                ],
                id="linux not available",
            ),
            pytest.param(
                [
                    fake_account("host1"),
                    fake_account("host2", is_available=False),
                    fake_win_account("w1"),
                    fake_win_account("w2", is_available=False),
                ],
                id="neither is available",
            ),
        ],
    )
    def check_not_enough_healthy_nodes(self, accounts):
        """
        When there's not enough healthy nodes in any of the OS-s, we obviously don't want to remove anything.
        Even when there's enough healthy nodes for one of the OS-s, but not enough in another one,
        we don't want to remove any nodes at all.
        Various sets of params check if there aren't enough healthy nodes for one OS but not the other, or both.
        """
        container = NodeContainer(accounts)
        original_container = container.clone()
        expected_bad_nodes = [acc for acc in accounts if not acc.is_available]
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX),
                NodeSpec(LINUX),
                NodeSpec(WINDOWS),
                NodeSpec(WINDOWS),
            ]
        )
        assert container.can_remove_spec(spec)
        with pytest.raises(InsufficientHealthyNodesError) as exc_info:
            container.remove_spec(spec)
        assert exc_info.value.bad_nodes == expected_bad_nodes
        # check that no nodes were actually allocated, but unhealthy ones were removed from the cluster
        original_container.remove_nodes(expected_bad_nodes)
        assert container.node_groups == original_container.node_groups

    @pytest.mark.parametrize(
        "accounts",
        [
            pytest.param(
                [
                    fake_account("host1"),
                    fake_account("host2"),
                    fake_account("host3"),
                    fake_win_account("w1", is_available=False),
                    fake_win_account("w2"),
                    fake_win_account("w2"),
                ],
                id="windows not available",
            ),
            pytest.param(
                [
                    fake_account("host1", is_available=False),
                    fake_account("host2"),
                    fake_account("host3"),
                    fake_win_account("w1"),
                    fake_win_account("w2"),
                    fake_win_account("w2"),
                ],
                id="linux not available",
            ),
            pytest.param(
                [
                    fake_account("host1", is_available=False),
                    fake_account("host2"),
                    fake_account("host3"),
                    fake_win_account("w1", is_available=False),
                    fake_win_account("w2"),
                    fake_win_account("w2"),
                ],
                id="neither is available",
            ),
            pytest.param(
                [
                    fake_account("host1"),
                    fake_account("host2"),
                    fake_account("host3"),
                    fake_win_account("w1"),
                    fake_win_account("w2"),
                    fake_win_account("w2"),
                ],
                id="all are available",
            ),
        ],
    )
    def check_enough_healthy_but_some_bad_nodes_too(self, accounts):
        """
        Check that we can successfully allocate all necessary nodes - even if some nodes don't pass health checks,
        we still have enough nodes to match the provided cluster spec.

        This test assumes that if we do encounter unhealthy node,
        we encounter it before we can finish allocating healthy ones, otherwise it would just mean testing the happy
        path (which we do in one of the params).
        """
        container = NodeContainer(accounts)
        original_container = container.clone()
        expected_bad_nodes = [acc for acc in accounts if not acc.is_available]
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX),
                NodeSpec(LINUX),
                NodeSpec(WINDOWS),
                NodeSpec(WINDOWS),
            ]
        )

        assert container.can_remove_spec(spec)
        good_nodes, bad_nodes = container.remove_spec(spec)

        # alloc should succeed
        # check that we did catch a bad node if any
        assert bad_nodes == expected_bad_nodes
        # check that container has exactly the right number of nodes left -
        # we removed len(spec) healthy nodes, plus len(expected_bad_nodes) of unhealthy nodes.
        assert len(container) == len(original_container) - len(spec) - len(expected_bad_nodes)

        # check that we got 2 windows nodes and two linux nodes in response,
        # don't care which ones in particular
        assert len(good_nodes) == 4
        actual_linux = [node for node in good_nodes if node.os == LINUX]
        assert len(actual_linux) == 2
        actual_win = [node for node in good_nodes if node.os == WINDOWS]
        assert len(actual_win) == 2

    def check_empty_cluster_spec(self):
        accounts = [fake_account("host1"), fake_account("host2"), fake_account("host3")]
        container = NodeContainer(accounts)
        spec = ClusterSpec.empty()
        assert not container.attempt_remove_spec(spec)
        assert container.can_remove_spec(spec)
        good, bad = container.remove_spec(spec)
        assert not good
        assert not bad

    def check_none_cluster_spec(self):
        accounts = [fake_account("host1"), fake_account("host2"), fake_account("host3")]
        container = NodeContainer(accounts)
        spec = None
        assert container.attempt_remove_spec(spec)
        assert not container.can_remove_spec(spec)
        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(spec)

    # ==================== node_type tests ====================

    def check_node_groups_by_type(self):
        """Check that nodes are grouped by (os, node_type) in node_groups."""
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
            fake_account("host4"),  # no node_type (None)
            fake_win_account("w1", node_type="small"),
        ]
        container = NodeContainer(accounts)

        # Check node_groups has correct keys
        assert (LINUX, "small") in container.node_groups
        assert (LINUX, "large") in container.node_groups
        assert (LINUX, None) in container.node_groups
        assert (WINDOWS, "small") in container.node_groups

        # Check counts
        assert len(container.node_groups[(LINUX, "small")]) == 2
        assert len(container.node_groups[(LINUX, "large")]) == 1
        assert len(container.node_groups[(LINUX, None)]) == 1
        assert len(container.node_groups[(WINDOWS, "small")]) == 1

    def check_remove_spec_with_node_type(self):
        """Check remove_spec with node_type specified in the cluster spec."""
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 1 small node
        small_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type="small")])
        assert container.can_remove_spec(small_spec)
        good_nodes, bad_nodes = container.remove_spec(small_spec)
        assert len(good_nodes) == 1
        assert good_nodes[0].node_type == "small"
        assert not bad_nodes

        # Should have 1 small and 1 large left
        assert len(container.node_groups.get((LINUX, "small"), [])) == 1
        assert len(container.node_groups.get((LINUX, "large"), [])) == 1

    def check_remove_spec_with_node_type_not_available(self):
        """Check remove_spec fails when requested node_type is not available."""
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
        ]
        container = NodeContainer(accounts)

        # Request a large node (not available)
        large_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type="large")])
        assert not container.can_remove_spec(large_spec)
        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(large_spec)

    def check_remove_spec_node_type_none_matches_any(self):
        """Check that node_type=None in spec matches any available node."""
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request a node with no specific type - should match any
        any_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type=None)])
        assert container.can_remove_spec(any_spec)
        good_nodes, _ = container.remove_spec(any_spec)
        assert len(good_nodes) == 1
        # Should have allocated one of the available nodes
        assert good_nodes[0].node_type in ("small", "large")

    def check_remove_spec_mixed_node_types(self):
        """Check remove_spec with mixed node_type requirements."""
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
            fake_account("host4", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 1 small and 1 large
        mixed_spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="large"),
            ]
        )
        assert container.can_remove_spec(mixed_spec)
        good_nodes, _ = container.remove_spec(mixed_spec)
        assert len(good_nodes) == 2

        small_nodes = [n for n in good_nodes if n.node_type == "small"]
        large_nodes = [n for n in good_nodes if n.node_type == "large"]
        assert len(small_nodes) == 1
        assert len(large_nodes) == 1

    # ==================== Double-counting fix tests ====================

    def check_mixed_typed_and_untyped_double_counting_rejected(self):
        """
        Test that mixed typed and untyped requirements correctly fail when
        total capacity is insufficient (the double-counting bug fix).

        Scenario:
            - Available: 3 Linux nodes (2 small, 1 large)
            - Request: 2 linux/small + 2 linux/any
            - Expected: FAIL (need 4 nodes, only have 3)

        Before the fix, this would incorrectly pass because:
            - linux/small check: 2 available >= 2 needed ✓
            - linux/any check: 3 available >= 2 needed ✓
        But the any-type was double-counting the small nodes!
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 2 small + 2 any = 4 total, but only 3 available
        impossible_spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert not container.can_remove_spec(impossible_spec)
        with pytest.raises(InsufficientResourcesError):
            container.remove_spec(impossible_spec)

    def check_mixed_typed_and_untyped_valid_passes(self):
        """
        Test that mixed typed and untyped requirements correctly pass when
        there is sufficient capacity.

        Scenario:
            - Available: 4 Linux nodes (2 small, 2 large)
            - Request: 2 linux/small + 1 linux/any
            - Expected: PASS (need 3 nodes, have 4)
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
            fake_account("host4", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 2 small + 1 any = 3 total, have 4 available
        valid_spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert container.can_remove_spec(valid_spec)
        good_nodes, bad_nodes = container.remove_spec(valid_spec)
        assert len(good_nodes) == 3
        assert not bad_nodes

    def check_specific_type_shortage_detected(self):
        """
        Test that shortage of a specific type is detected even when total
        OS capacity is sufficient.

        Scenario:
            - Available: 3 Linux nodes (1 small, 2 large)
            - Request: 2 linux/small + 1 linux/any
            - Expected: FAIL (need 2 small, only have 1)
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="large"),
            fake_account("host3", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 2 small + 1 any - total capacity OK but not enough small
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert not container.can_remove_spec(spec)
        error_msg = container.attempt_remove_spec(spec)
        assert "linux/small" in error_msg

    def check_holistic_os_capacity_check(self):
        """
        Test that total OS capacity is checked before detailed type checks.

        Scenario:
            - Available: 2 Linux nodes (1 small, 1 large)
            - Request: 1 linux/small + 1 linux/large + 1 linux/any
            - Expected: FAIL (need 3 total, only have 2)
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="large"),
        ]
        container = NodeContainer(accounts)

        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="large"),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert not container.can_remove_spec(spec)
        error_msg = container.attempt_remove_spec(spec)
        # Should report total Linux shortage
        assert "linux" in error_msg.lower()

    def check_multi_os_mixed_requirements(self):
        """
        Test holistic validation works correctly with multiple OS types.

        Scenario:
            - Available: 3 Linux (2 small, 1 large), 2 Windows (1 small, 1 large)
            - Request: 2 linux/small + 1 linux/any + 1 windows/any
            - Expected: FAIL for Linux (need 3, have 3, but 2 small reserved leaves only 1 for any)
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="small"),
            fake_account("host3", node_type="large"),
            fake_win_account("w1", node_type="small"),
            fake_win_account("w2", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # For Linux: 2 small + 2 any = 4 total, but only 3 available
        # For Windows: 1 any = fine
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(WINDOWS, node_type=None),
            ]
        )
        assert not container.can_remove_spec(spec)

    def check_only_any_type_requirements(self):
        """
        Test that requests with only any-type requirements work correctly.
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="large"),
            fake_account("host3"),  # no type
        ]
        container = NodeContainer(accounts)

        # Request 3 any-type - should work
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert container.can_remove_spec(spec)
        good_nodes, _ = container.remove_spec(spec)
        assert len(good_nodes) == 3

        # Request 4 any-type - should fail
        container2 = NodeContainer(accounts)
        spec2 = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert not container2.can_remove_spec(spec2)

    def check_allocation_order_specific_before_any(self):
        """
        Test that specific node_type requirements are allocated before any-type.

        This prevents any-type requests from "stealing" nodes needed by specific types.

        Scenario:
            - Available: 2 Linux nodes (1 small, 1 large)
            - Request: 1 linux/small + 1 linux/any
            - Expected: SUCCESS - small gets the small node, any gets the large node

        Without proper ordering, if linux/any is processed first, it might take
        the small node, leaving linux/small with no matching nodes.
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="large"),
        ]
        container = NodeContainer(accounts)

        # Request 1 small + 1 any - should succeed with proper ordering
        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert container.can_remove_spec(spec)
        good_nodes, bad_nodes = container.remove_spec(spec)
        assert len(good_nodes) == 2
        assert not bad_nodes

        # Verify we got one of each type
        types = [n.node_type for n in good_nodes]
        assert "small" in types
        assert "large" in types

    def check_allocation_order_with_multiple_specific_types(self):
        """
        Test allocation order with multiple specific types and any-type.

        Scenario:
            - Available: 3 Linux nodes (1 small, 1 medium, 1 large)
            - Request: 1 linux/small + 1 linux/large + 1 linux/any
            - Expected: SUCCESS - specific types get their nodes, any gets medium
        """
        accounts = [
            fake_account("host1", node_type="small"),
            fake_account("host2", node_type="medium"),
            fake_account("host3", node_type="large"),
        ]
        container = NodeContainer(accounts)

        spec = ClusterSpec(
            nodes=[
                NodeSpec(LINUX, node_type="small"),
                NodeSpec(LINUX, node_type="large"),
                NodeSpec(LINUX, node_type=None),
            ]
        )
        assert container.can_remove_spec(spec)
        good_nodes, _ = container.remove_spec(spec)
        assert len(good_nodes) == 3

        # Verify we got all three types
        types = [n.node_type for n in good_nodes]
        assert "small" in types
        assert "medium" in types
        assert "large" in types


================================================
FILE: tests/cluster/check_remoteaccount.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.errors import TimeoutError
from tests.ducktape_mock import MockAccount
from tests.test_utils import find_available_port
from ducktape.cluster.remoteaccount import RemoteAccount
from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig
import pytest

import logging
from threading import Thread
from http.server import SimpleHTTPRequestHandler
import socketserver
import threading
import time


class DummyException(Exception):
    pass


def raise_error_checker(error, remote_account):
    raise DummyException("dummy raise: {}\nfrom: {}".format(error, remote_account))


def raise_no_error_checker(error, remote_account):
    pass


class SimpleServer(object):
    """Helper class which starts a simple server listening on localhost at the specified port"""

    def __init__(self):
        self.port = find_available_port()
        self.handler = SimpleHTTPRequestHandler
        self.httpd = socketserver.TCPServer(("", self.port), self.handler)
        self.close_signal = threading.Event()
        self.server_started = False

    def start(self, delay_sec=0.0):
        """Run the server after specified delay"""

        def run():
            self.close_signal.wait(delay_sec)

            if not self.close_signal.is_set():
                self.server_started = True
                self.httpd.serve_forever()

        self.background_thread = Thread(target=run)
        self.background_thread.start()

    def stop(self):
        self.close_signal.set()

        if self.server_started:
            self.httpd.shutdown()
        self.background_thread.join(timeout=0.5)
        if self.background_thread.is_alive():
            raise Exception("SimpleServer failed to stop quickly")


class CheckRemoteAccount(object):
    def setup(self):
        self.server = SimpleServer()
        self.account = MockAccount()

    def check_wait_for_http(self):
        """Check waiting without timeout"""
        self.server.start(delay_sec=0.0)
        self.account.wait_for_http_service(port=self.server.port, headers={}, timeout=10, path="/")

    def check_wait_for_http_timeout(self):
        """Check waiting with timeout"""

        timeout = 1
        start = time.time()
        self.server.start(delay_sec=5)

        try:
            self.account.wait_for_http_service(port=self.server.port, headers={}, timeout=timeout, path="/")
            raise Exception("Should have timed out waiting for server to start")
        except TimeoutError:
            # expected behavior. Now check that we're reasonably close to the expected timeout
            # This is a fairly loose check since there are various internal timeouts that can affect the overall
            # timing
            actual_timeout = time.time() - start
            assert abs(actual_timeout - timeout) / timeout < 1

    @pytest.mark.parametrize(
        "checkers",
        [
            [raise_error_checker],
            [raise_no_error_checker, raise_error_checker],
            [raise_error_checker, raise_no_error_checker],
        ],
    )
    def check_ssh_checker(self, checkers):
        self.server.start()
        ssh_config = RemoteAccountSSHConfig.from_string(
            """
        Host dummy_host.com
            Hostname dummy_host.name.com
            Port 22
            User dummy
            ConnectTimeout 1
        """
        )
        self.account = RemoteAccount(ssh_config, ssh_exception_checks=checkers)
        with pytest.raises(DummyException):
            self.account.ssh("echo test")

    def teardown(self):
        self.server.stop()


class CheckRemoteAccountEquality(object):
    def check_remote_account_equality(self):
        """Different instances of remote account initialized with the same parameters should be equal."""

        ssh_config = RemoteAccountSSHConfig(host="thehost", hostname="localhost", port=22)

        kwargs = {
            "ssh_config": ssh_config,
            "externally_routable_ip": "345",
            "logger": logging.getLogger(__name__),
        }
        r1 = RemoteAccount(**kwargs)
        r2 = RemoteAccount(**kwargs)

        assert r1 == r2


================================================
FILE: tests/cluster/check_vagrant.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.vagrant import VagrantCluster
from ducktape.services.service import Service
import json
import pickle
import os
import random
import pytest
from ducktape.cluster.remoteaccount import RemoteAccountError

from tests.runner.fake_remote_account import create_fake_remote_account

TWO_HOSTS = """Host worker1
  HostName 127.0.0.1
  User vagrant
  Port 2222
  UserKnownHostsFile /dev/null
  StrictHostKeyChecking no
  PasswordAuthentication no
  IdentityFile /Users/foo/ducktape.git/.vagrant/machines/worker1/virtualbox/private_key
  IdentitiesOnly yes
  LogLevel FATAL

Host worker2
  HostName 127.0.0.2
  User vagrant
  Port 2200
  UserKnownHostsFile /dev/null
  StrictHostKeyChecking no
  PasswordAuthentication no
  IdentityFile /Users/foo/ducktape.git/.vagrant/machines/worker2/virtualbox/private_key
  IdentitiesOnly yes
  LogLevel FATAL

"""


def make_vagrant_cluster(*args, **kwargs):
    return VagrantCluster(make_remote_account_func=create_fake_remote_account, *args, **kwargs)


class CheckVagrantCluster(object):
    def setup_method(self, _):
        # We roll our own tempfile name instead of using python tempfile module because
        # in some cases, we want self.cluster_file to be the name of a file which does not yet exist
        self.cluster_file = "cluster_file_temporary-%d.json" % random.randint(1, 2**63 - 1)
        if os.path.exists(self.cluster_file):
            os.remove(self.cluster_file)

    def teardown_method(self, _):
        if os.path.exists(self.cluster_file):
            os.remove(self.cluster_file)

    def _set_monkeypatch_attr(self, monkeypatch):
        monkeypatch.setattr(
            "ducktape.cluster.vagrant.VagrantCluster._vagrant_ssh_config",
            lambda vc: (TWO_HOSTS, None),
        )
        monkeypatch.setattr(
            "ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount.fetch_externally_routable_ip",
            lambda vc: "127.0.0.1",
        )

    def check_pickleable(self, monkeypatch):
        self._set_monkeypatch_attr(monkeypatch)
        cluster = VagrantCluster()
        pickle.dumps(cluster)

    def check_one_host_parsing(self, monkeypatch):
        """check the behavior of VagrantCluster when cluster_file is not specified. VagrantCluster should read
        cluster information from _vagrant_ssh_config().
        """
        self._set_monkeypatch_attr(monkeypatch)

        cluster = make_vagrant_cluster()
        assert len(cluster) == 2
        assert cluster.num_available_nodes() == 2
        node1, node2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2))

        assert node1.account.hostname == "worker1"
        assert node1.account.user == "vagrant"
        assert node1.account.ssh_hostname == "127.0.0.1"

        assert node2.account.hostname == "worker2"
        assert node2.account.user == "vagrant"
        assert node2.account.ssh_hostname == "127.0.0.2"

    def check_cluster_file_write(self, monkeypatch):
        """check the behavior of VagrantCluster when cluster_file is specified but the file doesn't exist.
        VagrantCluster should read cluster information from _vagrant_ssh_config() and write the information to
        cluster_file.
        """
        self._set_monkeypatch_attr(monkeypatch)
        assert not os.path.exists(self.cluster_file)

        cluster = make_vagrant_cluster(cluster_file=self.cluster_file)
        cluster_json_expected = {}
        nodes = [
            {
                "externally_routable_ip": node_account.externally_routable_ip,
                "ssh_config": {
                    "host": node_account.ssh_config.host,
                    "hostname": node_account.ssh_config.hostname,
                    "user": node_account.ssh_config.user,
                    "identityfile": node_account.ssh_config.identityfile,
                    "password": node_account.ssh_config.password,
                    "port": node_account.ssh_config.port,
                    "connecttimeout": None,
                },
            }
            for node_account in cluster._available_accounts
        ]

        cluster_json_expected["nodes"] = nodes

        cluster_json_actual = json.load(open(os.path.abspath(self.cluster_file)))
        assert cluster_json_actual == cluster_json_expected

    def check_cluster_file_read(self, monkeypatch):
        """check the behavior of VagrantCluster when cluster_file is specified and the file exists.
        VagrantCluster should read cluster information from cluster_file.
        """
        self._set_monkeypatch_attr(monkeypatch)

        # To verify that VagrantCluster reads cluster information from the cluster_file, the
        # content in the file is intentionally made different from that returned by _vagrant_ssh_config().
        nodes_expected = []
        node1_expected = {
            "externally_routable_ip": "127.0.0.3",
            "ssh_config": {
                "host": "worker3",
                "hostname": "127.0.0.3",
                "user": "vagrant",
                "port": 2222,
                "password": "password",
                "identityfile": "/path/to/identfile3",
                "connecttimeout": None,
            },
        }
        nodes_expected.append(node1_expected)

        node2_expected = {
            "externally_routable_ip": "127.0.0.2",
            "ssh_config": {
                "host": "worker2",
                "hostname": "127.0.0.2",
                "user": "vagrant",
                "port": 2223,
                "password": None,
                "identityfile": "/path/to/indentfile2",
                "connecttimeout": 10,
            },
        }
        nodes_expected.append(node2_expected)

        cluster_json_expected = {}
        cluster_json_expected["nodes"] = nodes_expected
        json.dump(
            cluster_json_expected,
            open(self.cluster_file, "w+"),
            indent=2,
            separators=(",", ": "),
            sort_keys=True,
        )

        # Load the cluster from the json file we just created
        cluster = make_vagrant_cluster(cluster_file=self.cluster_file)

        assert len(cluster) == 2
        assert cluster.num_available_nodes() == 2
        node2, node3 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2))

        assert node3.account.hostname == "worker2"
        assert node3.account.user == "vagrant"
        assert node3.account.ssh_hostname == "127.0.0.2"
        assert node3.account.ssh_config.to_json() == node2_expected["ssh_config"]

        assert node2.account.hostname == "worker3"
        assert node2.account.user == "vagrant"
        assert node2.account.ssh_hostname == "127.0.0.3"
        assert node2.account.ssh_config.to_json() == node1_expected["ssh_config"]

    def check_no_valid_network_devices(self, monkeypatch):
        """
        test to make sure that a remote account error is raised when no network devices are found
        """
        monkeypatch.setattr(
            "ducktape.cluster.vagrant.VagrantCluster._vagrant_ssh_config",
            lambda vc: (TWO_HOSTS, None),
        )
        monkeypatch.setattr(
            "ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount.get_network_devices",
            lambda account: [],
        )

        with pytest.raises(RemoteAccountError):
            VagrantCluster()


================================================
FILE: tests/command_line/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/command_line/check_main.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.command_line.main import get_user_defined_globals
from ducktape.command_line.main import setup_results_directory
from ducktape.command_line.main import update_latest_symlink

import json
import os
import os.path
import pickle
import pytest
import tempfile


class CheckSetupResultsDirectory(object):
    def setup_method(self, _):
        self.results_root = tempfile.mkdtemp()
        self.results_dir = os.path.join(self.results_root, "results_directory")
        self.latest_symlink = os.path.join(self.results_root, "latest")

    def validate_directories(self):
        """Validate existence of results directory and correct symlink"""
        assert os.path.exists(self.results_dir)
        assert os.path.exists(self.latest_symlink)
        assert os.path.islink(self.latest_symlink)

        # check symlink points to correct location
        assert os.path.realpath(self.latest_symlink) == os.path.realpath(self.results_dir)

    def check_creation(self):
        """Check results and symlink from scratch"""
        assert not os.path.exists(self.results_dir)
        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()

    def check_symlink(self):
        """Check "latest" symlink behavior"""

        # if symlink already exists
        old_results = os.path.join(self.results_root, "OLD")
        os.mkdir(old_results)
        os.symlink(old_results, self.latest_symlink)
        assert os.path.islink(self.latest_symlink) and os.path.exists(self.latest_symlink)

        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()

        # Try again if symlink exists and points to nothing
        os.rmdir(self.results_dir)
        assert os.path.islink(self.latest_symlink) and not os.path.exists(self.latest_symlink)
        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()


globals_json = """
{
    "x": 200
}
"""

invalid_globals_json = """
{
    can't parse this!: ?right?
}
"""

valid_json_not_dict = """
[
    {
        "x": 200,
        "y": 300
    }
]
"""


class CheckUserDefinedGlobals(object):
    """Tests for the helper method which parses in user defined globals option"""

    def check_immutable(self):
        """Expect the user defined dict object to be immutable."""
        global_dict = get_user_defined_globals(globals_json)

        with pytest.raises(NotImplementedError):
            global_dict["x"] = -1

        with pytest.raises(NotImplementedError):
            global_dict["y"] = 3

    def check_pickleable(self):
        """Expect the user defined dict object to be pickleable"""
        globals_dict = get_user_defined_globals(globals_json)

        assert globals_dict  # Need to test non-empty dict, to ensure py3 compatibility
        assert pickle.loads(pickle.dumps(globals_dict)) == globals_dict

    def check_parseable_json_string(self):
        """Check if globals_json is parseable as JSON, we get back a dictionary view of parsed JSON."""
        globals_dict = get_user_defined_globals(globals_json)
        assert globals_dict == json.loads(globals_json)

    def check_unparseable(self):
        """If globals string is not a path to a file, and not parseable as JSON we want to raise a ValueError"""
        with pytest.raises(ValueError):
            get_user_defined_globals(invalid_globals_json)

    def check_parse_from_file(self):
        """Validate that, given a filename of a file containing valid JSON, we correctly parse the file contents."""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                fh.write(globals_json)

            global_dict = get_user_defined_globals(fname)
            assert global_dict == json.loads(globals_json)
            assert global_dict["x"] == 200
        finally:
            os.remove(fname)

    def check_bad_parse_from_file(self):
        """Validate behavior when given file containing invalid JSON"""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                # Write invalid JSON
                fh.write(invalid_globals_json)

            with pytest.raises(ValueError):
                get_user_defined_globals(fname)

        finally:
            os.remove(fname)

    def check_non_dict(self):
        """Valid JSON which does not parse as a dict should raise a ValueError"""

        # Should be able to parse this as JSON
        json.loads(valid_json_not_dict)

        with pytest.raises(ValueError):
            get_user_defined_globals(valid_json_not_dict)

    def check_non_dict_from_file(self):
        """Validate behavior when given file containing valid JSON which does not parse as a dict"""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                # Write valid JSON which does not parse as a dict
                fh.write(valid_json_not_dict)

            with pytest.raises(ValueError, match=fname):
                get_user_defined_globals(fname)

        finally:
            os.remove(fname)


================================================
FILE: tests/command_line/check_parse_args.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.command_line.parse_args import parse_args

from io import StringIO

import os
import re
import shutil
import sys
import tempfile


class Capturing(object):
    """This context manager can be used to capture stdout from a function call.
    E.g.
        with Capture() as captured:
            call_function()
        assert captured.output == expected_output
    """

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        return self

    def __exit__(self, *args):
        self.output = self._stringio.getvalue()
        sys.stdout = self._stdout


class CheckParseArgs(object):
    def check_empty_args(self):
        """Check that parsing an empty args list results in printing a usage message, followed by sys.exit(0)"""
        try:
            with Capturing() as captured:
                parse_args([])
        except SystemExit as e:
            assert e.code == 0
            assert captured.output.find("usage") >= 0

    def check_version(self):
        """If --version is present, ducktape should print version and exit"""
        try:
            with Capturing() as captured:
                parse_args(["--version"])
        except SystemExit as e:
            assert e.code == 0
            assert re.search(r"[\d]+\.[\d]+\.[\d]+", captured.output) is not None

    def check_empty_test_path(self):
        """Check that default test_path is an array consisting of cwd."""
        args = ["--collect-only"]
        parsed = parse_args(args)
        parsed_paths = [os.path.abspath(p) for p in parsed["test_path"]]

        assert parsed_paths == [os.path.abspath(".")]

    def check_multiple_test_paths(self):
        """Check that parser properly handles multiple "test paths". It should capture a list of test paths."""
        paths = ["path1"]
        args = ["--debug"] + paths + ["--collect-only"]
        parsed = parse_args(args)
        assert parsed["test_path"] == paths

        paths = ["path%d" % i for i in range(10)]
        args = ["--cluster-file", "my-cluster-file"] + paths + ["--debug", "--exit-first"]
        parsed = parse_args(args)
        assert parsed["test_path"] == paths

    def check_multiple_exclude(self):
        excluded = ["excluded1", "excluded2"]
        args = ["--collect-only", "--exclude"] + excluded + ["--debug"]
        parsed = parse_args(args)
        assert parsed["exclude"] == excluded

    def check_config_overrides(self, monkeypatch):
        """Check that parsed arguments pick up values from config files, and that overrides match precedence."""

        tmpdir = tempfile.mkdtemp(dir="/tmp")
        # Create tmp file for global config
        project_cfg_filename = os.path.join(tmpdir, "ducktape-project.cfg")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")

        project_cfg = [
            "--cluster-file CLUSTERFILE-project",
            "--results-root RESULTSROOT-project",
            "--parameters PARAMETERS-project",
        ]

        # user_cfg options should override project_cfg
        user_cfg = ["--results-root RESULTSROOT-user", "--parameters PARAMETERS-user"]

        try:
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.PROJECT_CONFIG_FILE",
                project_cfg_filename,
            )
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.USER_CONFIG_FILE",
                user_cfg_filename,
            )

            with open(project_cfg_filename, "w") as project_f:
                project_f.write("\n".join(project_cfg))

            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))

            # command-line options should override user_cfg and project_cfg
            args_dict = parse_args(["--parameters", "PARAMETERS-commandline"])

            assert args_dict["cluster_file"] == "CLUSTERFILE-project"
            assert args_dict["results_root"] == "RESULTSROOT-user"
            assert args_dict["parameters"] == "PARAMETERS-commandline"

        finally:
            shutil.rmtree(tmpdir)

    def check_config_file_option(self):
        """Check that config file option works"""
        tmpdir = tempfile.mkdtemp(dir="/tmp")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")

        user_cfg = ["--results-root RESULTSROOT-user", "--parameters PARAMETERS-user"]

        try:
            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))
            args_dict = parse_args(["--config-file", user_cfg_filename])
            assert args_dict["results_root"] == "RESULTSROOT-user"
            assert args_dict["parameters"] == "PARAMETERS-user"
        finally:
            shutil.rmtree(tmpdir)

    def check_config_overrides_for_n_args(self, monkeypatch):
        """Check that parsed arguments pick up values from config files, and that overrides match precedence."""

        tmpdir = tempfile.mkdtemp(dir="/tmp")
        # Create tmp file for global config
        project_cfg_filename = os.path.join(tmpdir, "ducktape-project.cfg")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")

        project_cfg = [
            "--exclude",
            "test1",
            "test2",
            "test3",
        ]

        # user_cfg options should override project_cfg
        user_cfg = [
            "test1",
            "test2",
            "test3",
        ]

        try:
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.PROJECT_CONFIG_FILE",
                project_cfg_filename,
            )
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.USER_CONFIG_FILE",
                user_cfg_filename,
            )

            with open(project_cfg_filename, "w") as project_f:
                project_f.write("\n".join(project_cfg))

            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))

            # command-line options should override user_cfg and project_cfg
            args_dict = parse_args(["test1", "test2", "test3", "test4", "--max-parallel", "9000"])

            assert args_dict["test_path"] == ["test1", "test2", "test3", "test4"]
            assert args_dict["exclude"] == ["test1", "test2", "test3"]
            assert args_dict["max_parallel"] == 9000

        finally:
            shutil.rmtree(tmpdir)


================================================
FILE: tests/ducktape_mock.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple
from ducktape.cluster.cluster import Cluster
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.consts import LINUX
from ducktape.cluster.node_container import NodeContainer
from ducktape.tests.session import SessionContext
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext
from ducktape.cluster.linux_remoteaccount import LinuxRemoteAccount
from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig
from unittest.mock import MagicMock


import os
import tempfile


def mock_cluster():
    return MagicMock(
        all=lambda: [MagicMock(spec=ClusterSpec)] * 3,
        max_used=lambda: 3,
        max_used_nodes=3,
    )


class FakeClusterNode(object):
    @property
    def operating_system(self):
        return LINUX


class FakeCluster(Cluster):
    """A cluster class with counters, but no actual node objects"""

    def __init__(self, num_nodes):
        super(FakeCluster, self).__init__()
        self._available_nodes = NodeContainer()
        for i in range(0, num_nodes):
            self._available_nodes.add_node(FakeClusterNode())
        self._in_use_nodes = NodeContainer()

    def do_alloc(self, cluster_spec):
        good_nodes, bad_nodes = self._available_nodes.remove_spec(cluster_spec)
        self._in_use_nodes.add_nodes(good_nodes)
        return good_nodes

    def free_single(self, node):
        self._in_use_nodes.remove_node(node)
        self._available_nodes.add_node(node)

    def available(self):
        return ClusterSpec.from_nodes(self._available_nodes)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)


def session_context(**kwargs):
    """Return a SessionContext object"""

    if "results_dir" not in kwargs.keys():
        tmp = tempfile.mkdtemp()
        session_dir = os.path.join(tmp, "test_dir")
        os.mkdir(session_dir)
        kwargs["results_dir"] = session_dir

    return SessionContext(session_id="test_session", **kwargs)


class TestMockTest(Test):
    def mock_test(self):
        pass


def test_context(session_context=session_context(), cluster=mock_cluster()):
    """Return a TestContext object"""
    return TestContext(
        session_context=session_context,
        file="tests/ducktape_mock.py",
        module=__name__,
        cls=TestMockTest,
        function=TestMockTest.mock_test,
        cluster=cluster,
    )


class MockNode(object):
    """Mock cluster node"""

    def __init__(self):
        self.account = MockAccount()


class MockAccount(LinuxRemoteAccount):
    """Mock node.account object. It's Linux because tests are run in Linux."""

    def __init__(self, **kwargs):
        ssh_config = RemoteAccountSSHConfig(host="localhost", user=None, hostname="localhost", port=22)

        super(MockAccount, self).__init__(ssh_config, externally_routable_ip="localhost", logger=None, **kwargs)


class MockSender(MagicMock):
    send_results: List[Tuple]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.send_results = []

    def send(self, *args, **kwargs):
        self.send_results.append((args, kwargs))


================================================
FILE: tests/loader/__init__.py
================================================


================================================
FILE: tests/loader/check_loader.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.loader import TestLoader, LoaderException, _requests_session

import tests.ducktape_mock

import os
import os.path
import pytest
import re
import requests
import tempfile
import yaml

from mock import Mock
from requests_testadapter import Resp


class LocalFileAdapter(requests.adapters.HTTPAdapter):
    def build_response_from_file(self, request):
        file_path = request.url[7:]
        with open(file_path, "rb") as file:
            buff = bytearray(os.path.getsize(file_path))
            file.readinto(buff)
            resp = Resp(buff)
            r = self.build_response(request, resp)

            return r

    def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None):
        return self.build_response_from_file(request)


def resources_dir():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")


def discover_dir():
    """Return the absolute path to the directory to use with discovery tests."""
    return os.path.join(resources_dir(), "loader_test_directory")


def sub_dir_a():
    return os.path.join(discover_dir(), "sub_dir_a")


def num_tests_in_file(fpath):
    """Count expected number of tests in the file.
    Search for NUM_TESTS = N

    return N if pattern is present else 0
    """
    with open(fpath, "r") as fd:
        match = re.search(r"^NUM_TESTS\s*=\s*(\d+)", fd.read(), re.MULTILINE)

        if not match:
            return 0
        return int(match.group(1))


def num_tests_in_dir(dpath):
    """Walk through directory subtree and count up expected number of tests that TestLoader should find."""
    assert os.path.exists(dpath)
    assert os.path.isdir(dpath)

    num_tests = 0
    for pwd, dirs, files in os.walk(dpath):
        for f in files:
            if not f.endswith(".py"):
                continue
            file_path = os.path.abspath(os.path.join(pwd, f))
            num_tests += num_tests_in_file(file_path)
    return num_tests


def invalid_test_suites():
    dpath = os.path.join(discover_dir(), "invalid_test_suites")
    params = []
    for pwd, dirs, files in os.walk(dpath):
        for f in files:
            if not f.endswith(".yml"):
                continue
            file_path = os.path.abspath(os.path.join(pwd, f))
            params.append(pytest.param(file_path, id=os.path.basename(file_path)))
    return params


class CheckTestLoader(object):
    def setup_method(self, method):
        self.SESSION_CONTEXT = tests.ducktape_mock.session_context()
        # To simplify unit tests, add file:// support to the test loader's functionality for loading previous
        # report.json files
        _requests_session.mount("file://", LocalFileAdapter())

    @pytest.mark.parametrize("suite_file_path", invalid_test_suites())
    def check_test_loader_raises_on_invalid_test_suite(self, suite_file_path):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException):
            loader.load([suite_file_path])

    @pytest.mark.parametrize(
        ["expected_count", "input_symbols", "excluded_symbols"],
        [
            pytest.param(
                6,
                [
                    # import a single test suite with an import statement
                    os.path.join(discover_dir(), "test_suite_with_single_import.yml")
                ],
                None,
                id="load dependency path",
            ),
            pytest.param(
                2,
                [
                    # test that a self import doesn't cause an infinite loop
                    os.path.join(discover_dir(), "test_suite_with_self_import.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test that a cyclic import doesn't cause an infinite loop
                    os.path.join(discover_dir(), "test_suite_cyclic_b.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test that a cyclic import starting with second import
                    os.path.join(discover_dir(), "test_suite_cyclic_a.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test single import statement with parent reference
                    os.path.join(discover_dir(), "test_suites", "sub_dir_test_import.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                8,
                [
                    # see test suite files for number of tests in it.
                    # decorated test suite includes 2 tests;
                    # single includes 4 tests; multiple includes 3 tests;
                    # however both single and multiple test suites include one test method of test_by.py,
                    # so total -= 1
                    os.path.join(discover_dir(), "test_suite_single.yml"),
                    os.path.join(discover_dir(), "test_suite_multiple.yml"),
                    os.path.join(discover_dir(), "test_suite_decorated.yml"),
                ],
                None,
                id="load multiple test suite files",
            ),
            pytest.param(
                5,
                [
                    # see test suite file for number of tests in it
                    os.path.join(discover_dir(), "test_suites", "test_suite_glob.yml")
                ],
                None,
                id="load test suite with globs",
            ),
            pytest.param(
                2,
                [
                    # test suite that includes sub_dir_a/test_c.py (1 test total):
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    # explicitly include test_a.yml (1 test total)
                    os.path.join(discover_dir(), "test_a.py"),
                ],
                None,
                id="load both file and suite",
            ),
            pytest.param(
                1,
                [
                    # test suite that includes sub_dir_a/test_c.py (1 test total):
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    # explicitly include test_a.yml (1 test total)
                    os.path.join(discover_dir(), "test_a.py"),
                ],
                [
                    # explicitly exclude the sub_dir_a/test_c.py (included with test suite):
                    os.path.join(sub_dir_a(), "test_c.py"),
                ],
                id="global exclude overrides test suite include",
            ),
            pytest.param(
                4,
                [
                    # sub_dir_a contains 4 total tests
                    # test suite that includes sub_dir_a/*.py but excludes sub_dir_a/test_d.py:
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_with_exclude.yml"),
                    # explicitly include sub_dir_a/test_d.py to override exclusion from test suite:
                    os.path.join(sub_dir_a(), "test_d.py"),
                ],
                None,
                id="global include overrides test suite exclude",
            ),
            pytest.param(
                1,
                [
                    # load two test suites and two files that all point to the same actual test
                    # and verify that in the end only 1 test has been loaded
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c_via_class.yml"),
                    os.path.join(sub_dir_a(), "test_c.py"),
                    os.path.join(sub_dir_a(), "test_c.py::TestC"),
                ],
                None,
                id="same test in test suites and test files",
            ),
        ],
    )
    def check_test_loader_with_test_suites_and_files(self, expected_count, input_symbols, excluded_symbols):
        """
        When both files and test suites are loaded, files (both included and excluded) are
        loaded after and separately from the test suites, so even if a test suite excludes file A,
        it will be included if it's passed directly. And if file A is excluded directly, even if any of
        the test suites includes it, it will still be excluded.
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(input_symbols, excluded_test_symbols=excluded_symbols)
        assert len(tests) == expected_count

    def check_test_loader_with_directory(self):
        """Check discovery on a directory."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()])
        assert len(tests) == num_tests_in_dir(discover_dir())

    @pytest.mark.parametrize(
        ["dir_", "file_name"],
        [
            pytest.param(discover_dir(), "test_a.py"),
            pytest.param(resources_dir(), "a.py"),
        ],
    )
    def check_test_loader_with_file(self, dir_, file_name):
        """Check discovery on a file."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        module_path = os.path.join(dir_, file_name)

        tests = loader.load([module_path])
        assert len(tests) == num_tests_in_file(module_path)

    def check_test_loader_with_glob(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_glob = os.path.join(discover_dir(), "*_a.py")  # should resolve to test_a.py only
        tests = loader.load([file_glob])
        assert len(tests) == 1

    def check_test_loader_multiple_files(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_a = os.path.join(discover_dir(), "test_a.py")
        file_b = os.path.join(discover_dir(), "test_b.py")

        tests = loader.load([file_a, file_b])
        assert len(tests) == num_tests_in_file(file_a) + num_tests_in_file(file_b)

    def check_test_loader_include_dir_exclude_file(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        excluded_file_a = os.path.join(discover_dir(), "test_a.py")
        excluded_file_b = os.path.join(discover_dir(), "test_b.py")
        num_excluded = num_tests_in_file(excluded_file_a) + num_tests_in_file(excluded_file_b)
        tests = loader.load([discover_dir()], [excluded_file_a, excluded_file_b])
        assert len(tests) == num_tests_in_dir(discover_dir()) - num_excluded

    def check_test_loader_exclude_subdir(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included_dir = discover_dir()
        excluded_dir = sub_dir_a()
        tests = loader.load([included_dir], [excluded_dir])
        assert len(tests) == num_tests_in_dir(included_dir) - num_tests_in_dir(excluded_dir)

    def check_test_loader_exclude_subdir_glob(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included_dir = discover_dir()
        excluded_dir = sub_dir_a()
        excluded_dir_glob = os.path.join(sub_dir_a(), "*.py")
        tests = loader.load([included_dir], [excluded_dir_glob])
        assert len(tests) == num_tests_in_dir(included_dir) - num_tests_in_dir(excluded_dir)

    def check_test_loader_raises_when_nothing_is_included(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_a = os.path.join(discover_dir(), "test_a.py")
        file_b = os.path.join(discover_dir(), "test_b.py")
        with pytest.raises(LoaderException):
            loader.load([file_a, file_b], [discover_dir()])

    def check_test_loader_raises_on_include_subdir_exclude_parent_dir(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException):
            loader.load([(sub_dir_a())], [(discover_dir())])

    def check_test_loader_with_nonexistent_file(self):
        """Check discovery on a non-existent path should throw LoaderException"""
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "file_that_does_not_exist.py")])

    def check_test_loader_include_dir_without_tests(self):
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "sub_dir_no_tests")])

    def check_test_loader_include_file_without_tests(self):
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "sub_dir_no_tests", "just_some_file.py")])

    def check_test_loader_allow_exclude_dir_without_tests(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()], [os.path.join(discover_dir(), "sub_dir_no_tests")])
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_allow_exclude_file_without_tests(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(
            [discover_dir()],
            [os.path.join(discover_dir(), "sub_dir_no_tests", "just_some_file.py")],
        )
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_allow_exclude_nonexistent_file(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(
            [discover_dir()],
            [os.path.join(discover_dir(), "file_that_does_not_exist.py")],
        )
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_with_class(self):
        """Check test discovery with discover class syntax."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([os.path.join(discover_dir(), "test_b.py::TestBB")])
        assert len(tests) == 2

        # Sanity check, test that it discovers two test class & 3 tests if it searches the whole module
        tests = loader.load([os.path.join(discover_dir(), "test_b.py")])
        assert len(tests) == 3

    def check_test_loader_include_dir_exclude_class(self):
        """Check test discovery with discover class syntax."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()], [os.path.join(discover_dir(), "test_b.py::TestBB")])
        # TestBB contains 2 test methods
        assert len(tests) == num_tests_in_dir(discover_dir()) - 2

    def check_test_loader_include_class_exclude_method(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [os.path.join(discover_dir(), "test_b.py::TestBB")]
        excluded = [os.path.join(discover_dir(), "test_b.py::TestBB.test_bb_one")]
        tests = loader.load(included, excluded)
        # TestBB contains 2 test methods, but 1 is excluded
        assert len(tests) == 1

    def check_test_loader_include_dir_exclude_method(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        excluded = [os.path.join(discover_dir(), "test_b.py::TestBB.test_bb_one")]
        tests = loader.load([discover_dir()], excluded)
        # excluded 1 method only
        assert len(tests) == num_tests_in_dir(discover_dir()) - 1

    def check_test_loader_with_matrix_params(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}',
            )
        ]
        tests = loader.load(included)
        # TestMatrix contains a single parametrized method. Since we only provide a single set of params, we should
        # end up with a single context
        assert len(tests) == 1
        assert tests[0].injected_args == {"x": 1, "y": "test "}

    def check_test_loader_with_params_special_chars(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [
            os.path.join(
                discover_dir(),
                r"test_decorated.py::TestParametrizdeSpecial.test_special_characters_params"
                r'@{"version": "6.1.0", "chars": "!@#$%^&*()_+::.,/? \"{}\\"}',
            )
        ]
        tests = loader.load(included)
        # TestMatrix contains a single parametrized method. Since we only provide a single set of params, we should
        # end up with a single context
        assert len(tests) == 1
        assert tests[0].injected_args == {
            "version": "6.1.0",
            "chars": '!@#$%^&*()_+::.,/? "{}\\',
        }

    def check_test_loader_with_multiple_matrix_params(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        params = '[{"x": 1,"y": "test "}, {"x": 2,"y": "I\'m"}]'
        included = [
            os.path.join(
                discover_dir(),
                "test_decorated.py::TestMatrix.test_thing@{}".format(params),
            )
        ]
        tests = loader.load(included)
        # TestMatrix contains a single parametrized method.
        # We provide two sets of params, so we should end up with two contexts
        assert len(tests) == 2
        injected_args = map(lambda t: t.injected_args, tests)
        assert {"x": 1, "y": "test "} in injected_args
        assert {"x": 2, "y": "I'm"} in injected_args

    def check_test_loader_with_parametrize(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestParametrized.test_thing@{"x":1,"y":2}',
            )
        ]
        tests = loader.load(included)
        assert len(tests) == 1
        assert tests[0].injected_args == {"x": 1, "y": 2}

    def check_test_loader_with_parametrize_with_objects(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        parameters = '{"d": {"a": "A"}, "lst": ["whatever"]}'
        included = [
            os.path.join(
                discover_dir(),
                "test_decorated.py::TestObjectParameters.test_thing@{}".format(parameters),
            )
        ]
        tests = loader.load(included)
        assert len(tests) == 1
        assert tests[0].injected_args == {"d": {"a": "A"}, "lst": ["whatever"]}

    def check_test_loader_with_injected_args(self):
        """When the --parameters command-line option is used, the loader behaves a little bit differently:

        each test method annotated with @parametrize or @matrix should only expand to a single discovered test,
        and the injected args should be those passed in from command-line.
        """
        # parameter values don't have to match any of the pre-defined parameters in the annotation
        # moreover, even parameter keys don't have to match method arguments, though if that's the case
        # the runner will complain, but the loader wouldn't care (this has been ducktape's behavior for a while now)
        injected_args = {"x": 100, "y": -100}
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args)

        file = os.path.join(discover_dir(), "test_decorated.py")
        tests = loader.load([file])
        assert len(tests) == 6

        for t in tests:
            assert t.injected_args == injected_args

    def check_test_loader_raises_with_both_injected_args_and_parameters(self):
        """One should not use both --parameters command-line flag and parameterized test symbols at the same time.
        Loader will explicitly raise in such cases to avoid confusing behavior.
        """
        injected_args = {"x": 1, "y": "test "}
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args)
        included = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}',
            )
        ]
        with pytest.raises(LoaderException, match="Cannot use both"):
            loader.load(included)

    def check_test_loader_raises_on_params_not_found(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        # parameter syntax is valid, but there is no such parameter defined in the test annotation in the code
        included = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "missing"}',
            )
        ]
        with pytest.raises(LoaderException, match="No tests to run"):
            loader.load(included)

    @pytest.mark.parametrize(
        "symbol",
        [
            # no class
            "test_decorated.py::.test_thing"
            # no method
            'test_decorated.py::TestMatrix@{"x": 1, "y": "test "}'
            # invalid json in params
            'test_decorated.py::TestMatrix.test_thing@{x: 1,"y": "test "}'
        ],
    )
    def check_test_loader_raises_on_malformed_test_discovery_symbol(self, symbol):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [os.path.join(discover_dir(), symbol)]
        with pytest.raises(LoaderException, match="Invalid discovery symbol"):
            loader.load(included)

    def check_test_loader_exclude_with_injected_args(self):
        injected_args = {"x": 1, "y": -1}
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args)

        included = [os.path.join(discover_dir(), "test_decorated.py")]
        excluded = [os.path.join(discover_dir(), "test_decorated.py::TestStackedMatrix")]
        tests = loader.load(included, excluded)
        # test_decorated.py contains 5 test methods total
        # we exclude 1 class with 1 method so should be 4
        # exclusion shouldn't care about injected args
        assert len(tests) == 5

        for t in tests:
            assert t.injected_args == injected_args

    def check_test_loader_exclude_with_params(self):
        """
        Checks behavior of exclude flag with parametrized annotations.
        Should exclude only a single permutation of the method
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        # included 8 tests
        included = [os.path.join(discover_dir(), "test_decorated.py::TestMatrix")]
        # exclude 1 test
        excluded = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}',
            )
        ]
        tests = loader.load(included, excluded)
        assert len(tests) == 7

    def check_test_loader_exclude_with_params_multiple(self):
        """
        Checks behavior of exclude flag with parametrized annotations.
        Should exclude two permutations of the method
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        # include 8 tests
        included = [os.path.join(discover_dir(), "test_decorated.py::TestMatrix")]
        # exclude 2 tests
        params = '[{"x": 1,"y": "test "}, {"x": 2,"y": "I\'m"}]'
        excluded = [
            os.path.join(
                discover_dir(),
                "test_decorated.py::TestMatrix.test_thing@{}".format(params),
            )
        ]
        tests = loader.load(included, excluded)
        assert len(tests) == 6

    def check_test_loader_with_subsets(self):
        """Check that computation of subsets work properly. This validates both that the division of tests is correct
        (i.e. as even a distribution as we can get but uneven in the expected way when necessary) and that the division
        happens after the expansion of tests marked for possible expansion (e.g. matrix, parametrize)."""

        file = os.path.join(discover_dir(), "test_decorated.py")

        # The test file contains 18 tests. With 4 subsets, first two subset should have an "extra"
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=0, subsets=4)
        tests = loader.load([file])
        assert len(tests) == 5

        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=1, subsets=4)
        tests = loader.load([file])
        assert len(tests) == 5

        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=2, subsets=4)
        tests = loader.load([file])
        assert len(tests) == 4

        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=3, subsets=4)
        tests = loader.load([file])
        assert len(tests) == 4

    def check_test_loader_with_invalid_subsets(self):
        """Check that the TestLoader throws an exception if the requests subset is larger than the number of subsets"""
        with pytest.raises(ValueError):
            TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=4, subsets=4)
        with pytest.raises(ValueError):
            TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=5, subsets=4)

    def check_test_loader_with_time_based_subsets(self):
        """Check that computation of subsets using a report with timing information correctly generates subsets that
        are optimized based on timing rather than number of tests.
        """

        file = os.path.join(discover_dir(), "test_b.py")
        report_url = "file://" + os.path.join(resources_dir(), "report.json")

        # The expected behavior of the current implementation is to add tests to each subset from largest to smallest,
        # using the least full subset each time. The test data with times of (10, 5, 1) should result in the first
        # subset containing 1 test and the second containing 2 (the opposite of the simple count-based strategy)

        loader = TestLoader(
            self.SESSION_CONTEXT,
            logger=Mock(),
            subset=0,
            subsets=2,
            historical_report=report_url,
        )
        tests = loader.load([file])
        assert len(tests) == 1

        loader = TestLoader(
            self.SESSION_CONTEXT,
            logger=Mock(),
            subset=1,
            subsets=2,
            historical_report=report_url,
        )
        tests = loader.load([file])
        assert len(tests) == 2

    def check_loader_with_non_yml_file(self):
        """
        test loading a test file as an import
        """
        file = os.path.join(discover_dir(), "test_suite_import_py.yml")
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException, match=r"Failed to load test suite from file: \S+test_a\.py"):
            loader.load([file])

    def check_loader_with_non_suite_yml_file(self):
        """
        test importing a suite that is malformed
        """
        file1 = os.path.join(discover_dir(), "test_suite_malformed.yml")
        file2 = os.path.join(discover_dir(), "test_suite_import_malformed.yml")
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException, match="No tests found in  simple_malformed_suite"):
            loader.load([file1])
        with pytest.raises(LoaderException, match="No tests found in  simple_malformed_suite"):
            loader.load([file2])

    def check_test_loader_with_absolute_path(self):
        """
        Test loading suites using absolute paths to other imported suites as well as absolute paths
        to tests in the suite
        """
        with tempfile.TemporaryDirectory() as td:
            temp_suite1 = os.path.join(td, "temp_suite1.yml")
            temp_suite2 = os.path.join(td, "temp_suite2.yml")
            with open(temp_suite1, "w") as f:
                test_yaml1 = yaml.dump(
                    {
                        "import": str(os.path.join(td, "temp_suite2.yml")),
                        "suite": [os.path.abspath(os.path.join(discover_dir(), "test_a.py"))],
                    }
                )
                f.write(test_yaml1)

            with open(temp_suite2, "w") as f:
                test_yaml2 = yaml.dump({"suite": [os.path.abspath(os.path.join(discover_dir(), "test_b.py"))]})
                f.write(test_yaml2)

            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            tests = loader.load([temp_suite1])
            assert len(tests) == 4


def join_parsed_symbol_components(parsed):
    """
    Join together a parsed symbol

    e.g.
        {
            'path': 'path/to/dir/test_file.py',
            'cls': 'ClassName',
            'method': 'method'
        },
        ->
        'path/to/dir/test_file.py::ClassName.method'
    """
    symbol = os.path.join(parsed["path"])

    if parsed["cls"] or parsed["method"]:
        symbol += "::"
        symbol += parsed["cls"]
        if parsed["method"]:
            symbol += "."
            symbol += parsed["method"]

    return symbol


def normalize_ending_slash(dirname):
    if dirname.endswith(os.path.sep):
        dirname = dirname[: -len(os.path.sep)]
    return dirname


class CheckParseSymbol(object):
    def check_parse_discovery_symbol(self):
        """Check that "test discovery symbol" parsing logic works correctly"""
        parsed_symbols = [
            {"path": "path/to/dir", "cls": "", "method": ""},
            {"path": "path/to/dir/test_file.py", "cls": "", "method": ""},
            {"path": "path/to/dir/test_file.py", "cls": "ClassName", "method": ""},
            {
                "path": "path/to/dir/test_file.py",
                "cls": "ClassName",
                "method": "method",
            },
            {"path": "path/to/dir", "cls": "ClassName", "method": ""},
            {"path": "test_file.py", "cls": "", "method": ""},
            {"path": "test_file.py", "cls": "ClassName", "method": ""},
            {"path": "test_file.py", "cls": "ClassName", "method": "method"},
        ]

        loader = TestLoader(tests.ducktape_mock.session_context(), logger=Mock())
        for parsed in parsed_symbols:
            symbol = join_parsed_symbol_components(parsed)

            expected_parsed = (
                normalize_ending_slash(parsed["path"]),
                parsed["cls"],
                parsed["method"],
            )

            actually_parsed = loader._parse_discovery_symbol(symbol)
            actually_parsed = (
                normalize_ending_slash(actually_parsed[0]),
                actually_parsed[1],
                actually_parsed[2],
            )

            assert actually_parsed == expected_parsed, "%s did not parse as expected" % symbol


================================================
FILE: tests/loader/resources/__init__.py
================================================


================================================
FILE: tests/loader/resources/loader_test_directory/README
================================================
A dummy test directory structure for checking test discovery behavior

================================================
FILE: tests/loader/resources/loader_test_directory/__init__.py
================================================


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/empty_file.yml
================================================


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/malformed_test_suite.yml
================================================
malformed_test_suite:
  included:
    should_have_been_a_list:
      - but
      - is
      - a
      - dict


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/not_yaml.yml
================================================
this is not a yaml file
No, really


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suite_refers_to_non_existent_file.yml
================================================
non_existent_file_suit:
  included:
    - file_does_not_exist.py


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suite_with_malformed_params.yml
================================================
non_existent_file_suit:
  included:
    - 'test_decorated.py::TestMatrix.test_thing@[{x: 1,y: "test "}]'  # this json won't parse because no quotes


================================================
FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suites_with_no_tests.yml
================================================
empty_test_suite_a:
empty_test_suite_b:
  excluded:
    - ../sub_dir_a


================================================
FILE: tests/loader/resources/loader_test_directory/name_does_not_match_pattern.py
================================================
from ducktape.tests.test import Test


class TestNotLoaded(Test):
    """Loader should not discover this - module name does not match default pattern."""

    def test_a(self):
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/sub_dir_a/__init__.py
================================================


================================================
FILE: tests/loader/resources/loader_test_directory/sub_dir_a/test_c.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test

NUM_TESTS = 1


class TestC(Test):
    """Loader should discover this."""

    def test(self):
        pass


class TestInvisible(object):
    """Loader should not discover this."""

    def test_invisible(self):
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/sub_dir_a/test_d.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test

NUM_TESTS = 3


class TestD(Test):
    """Loader should discover this."""

    def test_d(self):
        pass

    def test_dd(self):
        pass

    def ddd_test(self):
        pass


class TestInvisible(object):
    """Loader should not discover this."""

    def test_invisible(self):
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/sub_dir_no_tests/__init__.py
================================================


================================================
FILE: tests/loader/resources/loader_test_directory/sub_dir_no_tests/just_some_file.py
================================================
class JustSomeClass(object):
    pass


================================================
FILE: tests/loader/resources/loader_test_directory/test_a.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test

NUM_TESTS = 1


class TestA(Test):
    """Loader should discover this."""

    def test_a(self):
        pass


class TestInvisible(object):
    """Loader should not discover this."""

    def test_invisible(self):
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/test_b.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test

NUM_TESTS = 3


class TestB(Test):
    """Loader should discover this."""

    def test_b(self):
        pass


class TestBB(Test):
    """Loader should discover this with 2 tests."""

    test_not_callable = 3

    def test_bb_one(self):
        pass

    def bb_two_test(self):
        pass

    def other_method(self):
        pass


class TestInvisible(object):
    """Loader should not discover this."""

    def test_invisible(self):
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/test_decorated.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test
from ducktape.mark import matrix
from ducktape.mark import parametrize

NUM_TESTS = 18


class TestMatrix(Test):
    """8 tests here"""

    @matrix(x=[1, 2], y=["I'm", " a ", "test ", "matrix!"])
    def test_thing(self, x, y):
        pass


class TestStackedMatrix(Test):
    """4 tests"""

    @matrix(x=[1, 2], y=[-1, 0])
    def test_thing(self, x, y):
        pass


class TestParametrized(Test):
    @parametrize(x=10)
    def test_single_decorator(self, x=1, y="hi"):
        """1 test"""
        pass

    @parametrize(x=1, y=2)
    @parametrize(x="abc", y=[])
    def test_thing(self, x, y):
        """2 tests"""
        pass


class TestParametrizdeSpecial(Test):
    @parametrize(version="6.1.0", chars='!@#$%^&*()_+::.,/? "{}\\')
    def test_special_characters_params(self, version, chars):
        """1 tests"""
        pass


class TestObjectParameters(Test):
    @parametrize(d={"a": "A"}, lst=["whatever"])
    @parametrize(d={"z": "Z"}, lst=["something"])
    def test_thing(self, d, lst):
        """2 tests"""
        pass


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_cyclic_a.yml
================================================
import:
  - test_suite_cyclic_b.yml # 1 test

test_suite_cyclic_a:
  - ./sub_dir_a/test_c.py # 1 test
  - ./test_b.py # 3
# suite total: 4

# total: 5


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_cyclic_b.yml
================================================

import:
  - test_suite_cyclic_a.yml # 4 test

test_suite_cyclic_b:
  - test_a.py # 1 test
# suite total: 1 test

# total: 5 tests


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_decorated.yml
================================================
test_suite_decorated:
  - 'test_decorated.py::TestMatrix.test_thing@[{"x": 1,"y": "test "}, {"x": 2, "y": "test "}]'  # 2 tests
  - 'test_decorated.py::TestStackedMatrix.test_thing@{"x": 100, "y": 100}'  # ignored, there are no such params in code

# total: 2 tests


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_import_malformed.yml
================================================
import:
  - test_suite_malformed.yml

simple_suite:
  - test_a.py # 1


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_import_py.yml
================================================
import:
  - test_a.py
# improper import

simple_suite:
  - test_a.py
# 1 test


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_malformed.yml
================================================
simple_malformed_suite:
  cheese:
    - 'fatty milk'
    - 'salt'
  bread:
    - 'flower'
    - 'water'
    - 'yeast'


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_multiple.yml
================================================
test_suite_b:
  - sub_dir_a/test_c.py  # 1 test
  - sub_dir_a/test_d.py::TestD.test_d  # 1 test
# suite total: 2 tests

test_suite_c:
  included:
    - sub_dir_a  # +4 tests across all files
    - test_b.py  # +3 tests
  excluded:
    - sub_dir_a/test_d.py  # -3 tests
    - test_b.py::TestBB  # -2 tests
# suite total: 2 tests

# total: 4
# - test_c.py (with 1 test method) included in both test suites so total -= 1
# total: 3


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_single.yml
================================================
test_suite_a:
  included:
    - test_a.py  # 1 test
    - test_b.py::TestBB.test_bb_one  # 1 test
    - sub_dir_a
    # sub_dir_a/test_c.py = 0 (excluded)
    # sub_dir_a/test_d.py = 3 - 1 (excluded) = 2
  excluded:
    - sub_dir_a/test_c.py  # 1 test
    - sub_dir_a/test_d.py::TestD.test_dd  # 1 test

# total: 4 tests


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_with_self_import.yml
================================================
test_suite_d:
  included:
    - sub_dir_a  # +4 tests across all files
    - test_b.py  # +3 tests
  excluded:
    - sub_dir_a/test_d.py  # -3 tests
    - test_b.py::TestBB  # -2 tests
# suite total: 2 tests
import:
  - ./test_suite_with_self_import.yml #  +0 test


================================================
FILE: tests/loader/resources/loader_test_directory/test_suite_with_single_import.yml
================================================
test_suite_d:
  included:
    - sub_dir_a  # +4 tests across all files
    - test_b.py  # +3 tests
  excluded:
    - sub_dir_a/test_d.py  # -3 tests
    - test_b.py::TestBB  # -2 tests
# suite total: 6 tests

import:
  - ./test_suite_single.yml #  +4 test


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/refers_to_parent_dir.yml
================================================
test_suite_a:
  included:
    - ../test_a.py  # 1 test
# total: 1 tests


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_test_c.yml
================================================
test_suite:
  - ../sub_dir_a/test_c.py


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_test_c_via_class.yml
================================================
test_suite:
  - ../sub_dir_a/test_c.py::TestC


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_with_exclude.yml
================================================
test_suite:
  included:
    - ../sub_dir_a/*.py
  excluded:
    - ../sub_dir_a/test_d.py


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_test_import.yml
================================================
import:
  - ../test_suite_cyclic_a.yml # 5 tests
  - .././test_suites/../sub_dir_no_tests/.././test_suite_cyclic_b.yml # 0 tests

# 5 test total


================================================
FILE: tests/loader/resources/loader_test_directory/test_suites/test_suite_glob.yml
================================================
test_suite_glob:
  included:
    - ../sub_dir_a/*
    - ../test_?.py
  excluded:
    - ../sub_dir_a/*_d.py

# globs expand to:
# included:
#   sub_dir_a/test_c.py (1 test)
#   sub_dir_a/test_d.py (3 tests) - excluded
#   test_a.py (1 test)
#   test_b.py (3 tests)
# excluded:
#   sub_dir_a/test_d.py (3 tests)
# TOTAL:
#   5 tests


================================================
FILE: tests/loader/resources/report.json
================================================
{
  "results": [
    {
      "test_id": "tests.loader.resources.loader_test_directory.test_b.TestB.test_b",
      "run_time_seconds": 10
    },
    {
      "test_id": "tests.loader.resources.loader_test_directory.test_b.TestBB.test_bb_one",
      "run_time_seconds": 5
    },
    {
      "test_id": "tests.loader.resources.loader_test_directory.test_b.TestBB.bb_two_test",
      "run_time_seconds": 1
    }
  ]
}

================================================
FILE: tests/logger/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/logger/check_logger.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import psutil
import shutil
import tempfile

from ducktape.tests.loggermaker import LoggerMaker, close_logger


class DummyFileLoggerMaker(LoggerMaker):
    def __init__(self, log_dir, n_handles):
        """Create a logger with n_handles file handles, with files in log_dir"""
        self.log_dir = log_dir
        self.n_handles = n_handles

    @property
    def logger_name(self):
        return "a.b.c"

    def configure_logger(self):
        for i in range(self.n_handles):
            fh = logging.FileHandler(os.path.join(self.log_dir, "log-" + str(i)))
            self._logger.addHandler(fh)


def open_files():
    # current process
    p = psutil.Process()
    return p.open_files()


class CheckLogger(object):
    def setup_method(self, _):
        self.temp_dir = tempfile.mkdtemp()

    def check_close_logger(self):
        """Check that calling close_logger properly cleans up resources."""
        initial_open_files = open_files()

        n_handles = 100
        log_maker = DummyFileLoggerMaker(self.temp_dir, n_handles)
        # accessing logger attribute lazily triggers configuration of logger
        the_logger = log_maker.logger

        assert len(open_files()) == len(initial_open_files) + n_handles
        close_logger(the_logger)
        assert len(open_files()) == len(initial_open_files)

    def teardown_method(self, _):
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)


================================================
FILE: tests/mark/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/mark/check_cluster_use_metadata.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster import LocalhostCluster
from ducktape.mark import parametrize, matrix, ignore, defaults
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.mark.resource import cluster
from ducktape.cluster.cluster_spec import ClusterSpec

import pytest

from tests import ducktape_mock


class CheckClusterUseAnnotation(object):
    def check_basic_usage_arbitrary_metadata(self):
        cluster_use_metadata = {"a": 2, "b": "hi", "num_nodes": 300}

        @cluster(**cluster_use_metadata)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].cluster_use_metadata == cluster_use_metadata

    def check_basic_usage_cluster_spec(self):
        num_nodes = 200

        @cluster(cluster_spec=ClusterSpec.simple_linux(num_nodes))
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert len(test_context_list[0].expected_cluster_spec.nodes) == num_nodes
        # All nodes should be linux
        for node in test_context_list[0].expected_cluster_spec.nodes.elements():
            assert node.operating_system == "linux"

    def check_basic_usage_num_nodes(self):
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    @pytest.mark.parametrize("has_annotation", [True, False])
    def check_empty_cluster_annotation(self, fail_greedy_tests, has_annotation):
        @cluster()
        def function_with_annotation():
            return "hi"

        def function_no_annotation():
            return "hello"

        assert hasattr(function_with_annotation, "marks")
        assert not hasattr(function_no_annotation, "marks")

        # no annotation and empty annotation behave identically as far as this functionality is concerned
        function = function_with_annotation if has_annotation else function_no_annotation

        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)
        tc_list = MarkedFunctionExpander(
            function=function, cluster=mock_cluster, session_context=session_context
        ).expand()

        assert len(tc_list) == 1
        if fail_greedy_tests:
            assert tc_list[0].expected_num_nodes == 0
            assert tc_list[0].expected_cluster_spec is None
        else:
            assert tc_list[0].expected_num_nodes == 1000

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    def check_zero_nodes_annotation(self, fail_greedy_tests):
        @cluster(num_nodes=0)
        def function():
            return "hi"

        assert hasattr(function, "marks")
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)
        tc_list = MarkedFunctionExpander(
            function=function, cluster=mock_cluster, session_context=session_context
        ).expand()
        assert len(tc_list) == 1
        assert tc_list[0].expected_num_nodes == 0
        assert tc_list[0].expected_cluster_spec is not None
        assert len(tc_list[0].expected_cluster_spec) == 0

    def check_with_parametrize(self):
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        @parametrize(x=1)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

    def check_beneath_parametrize(self):
        """Annotations such as cluster, which add metadata, but do not create new tests, add the metadata to
        all test cases physically below the annotation.

        In the case of a parametrized test, when @cluster has no parametrize annotations below it,
        there are not any test cases to which it will apply, so none of the resulting tests should have
        associated cluster size metadata.
        """
        num_nodes = 200

        @parametrize(x=1)
        @cluster(num_nodes=num_nodes)
        def f(x, y=2, z=3):
            return x, y, z

        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=f).expand()

    def check_with_nodes_default_parametrize_matrix(self):
        default_num_nodes = 5
        parametrize_num_nodes = 3
        matrix_num_nodes = 7

        @cluster(num_nodes=default_num_nodes)
        @defaults(y=[5])
        @parametrize(x=100, y=200)
        @cluster(num_nodes=parametrize_num_nodes)
        @parametrize(x=10, y=20)
        @parametrize(x=30, y=40)
        @cluster(num_nodes=matrix_num_nodes)
        @matrix(x=[1, 2])
        def f(x, y):
            return x, y

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 5
        # {'x': 1, 'y': 5} -> using matrix with matrix_num_nodes
        assert test_context_list[0].expected_num_nodes == matrix_num_nodes
        # {'x': 2, 'y': 5} -> using matrix with matrix_num_nodes
        assert test_context_list[1].expected_num_nodes == matrix_num_nodes
        # {'x': 30, 'y': 40} -> using parametrize with cluster parametrize_num_nodes
        assert test_context_list[2].expected_num_nodes == parametrize_num_nodes
        # {'x': 10, 'y': 20} -> using parametrize with cluster parametrize_num_nodes
        assert test_context_list[3].expected_num_nodes == parametrize_num_nodes
        # {'x': 100, 'y': 200} -> using parametrize with default_num_nodes
        assert test_context_list[4].expected_num_nodes == default_num_nodes

    def check_no_override(self):
        """cluster use metadata should apply to all test cases physically below it, except for those which already
        have cluster use metadata.
        """

        num_nodes_1 = 200
        num_nodes_2 = 42

        # num_nodes_2 should *not* override num_nodes_1
        @cluster(num_nodes=num_nodes_2)
        @cluster(num_nodes=num_nodes_1)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes_1

    def check_parametrized_with_multiple_cluster_annotations(self):
        num_nodes_1 = 10
        num_nodes_2 = 20

        # num_nodes_1 should *not* override num_nodes_2
        @cluster(num_nodes=num_nodes_1)
        @parametrize(x=1)
        @parametrize(x=1.5)
        @cluster(num_nodes=num_nodes_2)
        @parametrize(x=2)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 3
        assert test_context_list[0].expected_num_nodes == num_nodes_1
        assert test_context_list[1].expected_num_nodes == num_nodes_1
        assert test_context_list[2].expected_num_nodes == num_nodes_2

    def check_matrix_with_multiple_cluster_annotations(self):
        num_nodes_1 = 10
        num_nodes_2 = 20

        # num_nodes_1 should *not* override num_nodes_2
        @cluster(num_nodes=num_nodes_1)
        @matrix(x=[1])
        @matrix(x=[1.5, 1.6], y=[-15, -16])
        @cluster(num_nodes=num_nodes_2)
        @matrix(x=[2])
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 6
        assert test_context_list[0].expected_num_nodes == num_nodes_1
        assert test_context_list[1].expected_num_nodes == num_nodes_1
        assert test_context_list[2].expected_num_nodes == num_nodes_1
        assert test_context_list[3].expected_num_nodes == num_nodes_1
        assert test_context_list[4].expected_num_nodes == num_nodes_1
        assert test_context_list[5].expected_num_nodes == num_nodes_2

    def check_with_ignore(self):
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        @ignore
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

        # order shouldn't matter here
        @ignore
        @cluster(num_nodes=num_nodes)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes


================================================
FILE: tests/mark/check_env.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.mark import env, is_env

import os


class CheckEnv(object):
    def check_does_not_raise_exception_when_key_not_exists(self):
        class C(object):
            @env(BLAH="8")
            def function(self):
                return 1

    def check_has_env_annotation(self):
        class C(object):
            @env(JAVA_HOME="blah")
            def function(self):
                return 1

        assert is_env(C.function)

    def check_is_ignored_if_env_not_correct(self):
        class C(object):
            @env(JAVA_HOME="blah")
            def function(self):
                return 1

        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert context_list[0].ignore

    def check_is_not_ignore_if_correct_env(self):
        os.environ["test_key"] = "test"

        class C(object):
            @env(test_key="test")
            def function(self):
                return 1

        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert not context_list[0].ignore


================================================
FILE: tests/mark/check_ignore.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.mark import ignore, ignored, parametrize, matrix

import pytest


class CheckIgnore(object):
    def check_simple(self):
        @ignore
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)
        context_list = MarkedFunctionExpander(function=function).expand()
        assert len(context_list) == 1
        assert context_list[0].ignore

    def check_simple_method(self):
        class C(object):
            @ignore
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 1
        assert context_list[0].ignore

    def check_ignore_all(self):
        @ignore
        @parametrize(x=100, y=200, z=300)
        @parametrize(x=100, z=300)
        @parametrize(y=200)
        @parametrize()
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)
        context_list = MarkedFunctionExpander(function=function).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            assert ctx.ignore

    def check_ignore_all_method(self):
        """Check @ignore() with no arguments used with various parametrizations on a method."""

        class C(object):
            @ignore
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @matrix(x=[1, 2, 3])
            @parametrize()
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 7
        for ctx in context_list:
            assert ctx.ignore

    def check_ignore_specific(self):
        @ignore(x=100, y=200, z=300)
        @parametrize(x=100, y=200, z=300)
        @parametrize(x=100, z=300)
        @parametrize(y=200)
        @parametrize()
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)
        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            if ctx.injected_args == {"x": 100, "y": 200, "z": 300}:
                assert ctx.ignore
            else:
                assert not ctx.ignore

    def check_ignore_specific_method(self):
        class C(object):
            @ignore(x=100, y=200, z=300)
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            if ctx.injected_args == {"x": 100, "y": 200, "z": 300}:
                assert ctx.ignore
            else:
                assert not ctx.ignore

    def check_invalid_specific_ignore(self):
        """If there are no test cases to which ignore applies, it should raise an error
        Keeping in mind annotations "point down": they only apply to test cases physically below.
        """

        class C(object):
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            @ignore(x=100, y=200, z=300)
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=C.function, cls=C).expand()

    def check_invalid_ignore_all(self):
        """If there are no test cases to which ignore applies, it should raise an error"""

        class C(object):
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            @ignore
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=C.function, cls=C).expand()


================================================
FILE: tests/mark/check_parametrize.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.mark import parametrize, parametrized, matrix, defaults
from ducktape.mark.mark_expander import MarkedFunctionExpander


class CheckParametrize(object):
    def check_simple(self):
        @parametrize(x=100, z=300)
        def function(x=1, y=2, z=3):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 1
        assert injected_args_list[0] == {"x": 100, "z": 300}

        context_list = MarkedFunctionExpander(function=function).expand()

        all_f = [cxt.function for cxt in context_list]
        assert all_f[0]() == (100, 2, 300)

    def check_simple_method(self):
        class C(object):
            @parametrize(x=100, z=300)
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert parametrized(C.function)
        injected_args_list = [m.injected_args for m in C.function.marks]
        assert len(injected_args_list) == 1
        assert injected_args_list[0] == {"x": 100, "z": 300}

        context_list = MarkedFunctionExpander(None, function=C.function).expand()
        all_f = [cxt.function for cxt in context_list]
        assert all_f[0](C()) == (100, 2, 300)

    def check_stacked(self):
        @parametrize(x=100, y=200, z=300)
        @parametrize(x=100, z=300)
        @parametrize(y=200)
        @parametrize()
        def function(x=1, y=2, z=3):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 4

        context_list = MarkedFunctionExpander(None, function=function).expand()
        all_f = [cxt.function for cxt in context_list]
        assert all_f[0]() == (100, 200, 300)
        assert all_f[1]() == (100, 2, 300)
        assert all_f[2]() == (1, 200, 3)
        assert all_f[3]() == (1, 2, 3)

    def check_stacked_method(self):
        class C(object):
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert parametrized(C.function)
        injected_args_list = [m.injected_args for m in C.function.marks]
        assert len(injected_args_list) == 4

        context_list = MarkedFunctionExpander(None, function=C.function).expand()
        all_f = [cxt.function for cxt in context_list]
        c = C()
        assert all_f[0](c) == (100, 200, 300)
        assert all_f[1](c) == (100, 2, 300)
        assert all_f[2](c) == (1, 200, 3)
        assert all_f[3](c) == (1, 2, 3)


class CheckMatrix(object):
    def check_simple(self):
        @matrix(x=[1, 2], y=[-1, -2])
        def function(x=1, y=2, z=3):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 1

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 4

        for ctx in context_list:
            f = ctx.function
            injected_args = ctx.injected_args
            assert f() == (injected_args["x"], injected_args["y"], 3)

    def check_simple_method(self):
        class C(object):
            @matrix(x=[1, 2], y=[-1, -2])
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert parametrized(C.function)
        injected_args_list = [m.injected_args for m in C.function.marks]
        assert len(injected_args_list) == 1

        context_list = MarkedFunctionExpander(None, function=C.function).expand()
        assert len(context_list) == 4

        c = C()
        for ctx in context_list:
            f = ctx.function
            injected_args = ctx.injected_args
            assert f(c) == (injected_args["x"], injected_args["y"], 3)

    def check_stacked(self):
        @matrix(x=[1, 2], y=[0])
        @matrix(x=[-1], z=[-10])
        def function(x=1, y=2, z=3):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 2

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 3

        expected_output = {(1, 0, 3), (2, 0, 3), (-1, 2, -10)}
        output = set()
        for c in context_list:
            output.add(c.function())

        assert output == expected_output

    def check_stacked_method(self):
        class C(object):
            @matrix(x=[1, 2], y=[0])
            @matrix(x=[-1], z=[-10])
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert parametrized(C.function)
        injected_args_list = [m.injected_args for m in C.function.marks]
        assert len(injected_args_list) == 2

        context_list = MarkedFunctionExpander(None, function=C.function).expand()
        assert len(context_list) == 3

        expected_output = {(1, 0, 3), (2, 0, 3), (-1, 2, -10)}
        output = set()
        for ctx in context_list:
            output.add(ctx.function(C()))

        assert output == expected_output


class CheckDefaults(object):
    def check_defaults(self):
        @defaults(z=[1, 2])
        @matrix(x=[1], y=[1, 2])
        @parametrize(x=3, y=4)
        def function(x=1, y=2, z=-1):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 3

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 6

        expected_output = {
            (1, 1, 1),
            (1, 1, 2),
            (1, 2, 1),
            (1, 2, 2),
            (3, 4, 1),
            (3, 4, 2),
        }
        output = set()
        for ctx in context_list:
            output.add(ctx.function())

        assert output == expected_output

    def check_defaults_method(self):
        class C(object):
            @defaults(z=[1, 2])
            @matrix(x=[1], y=[1, 2])
            @parametrize(x=3, y=4)
            def function(self, x=1, y=2, z=-1):
                return x, y, z

        assert parametrized(C.function)
        injected_args_list = [m.injected_args for m in C.function.marks]
        assert len(injected_args_list) == 3

        context_list = MarkedFunctionExpander(None, function=C.function).expand()
        assert len(context_list) == 6

        expected_output = {
            (1, 1, 1),
            (1, 1, 2),
            (1, 2, 1),
            (1, 2, 2),
            (3, 4, 1),
            (3, 4, 2),
        }
        output = set()

        c = C()
        for ctx in context_list:
            f = ctx.function
            injected_args = ctx.injected_args
            assert f(c) == (injected_args["x"], injected_args["y"], injected_args["z"])
            output.add(ctx.function(C()))

        assert output == expected_output

    def check_overlap_param(self):
        @defaults(y=[3, 4], z=[1, 2])
        @parametrize(w=1, x=2, y=3)
        def function(w=10, x=20, y=30, z=40):
            return w, x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 2

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 2

        expected_output = {(1, 2, 3, 1), (1, 2, 3, 2)}
        output = set()
        for ctx in context_list:
            output.add(ctx.function())

        assert output == expected_output

    def check_overlap_matrix(self):
        @defaults(y=[3, 4], z=[1, 2])
        @matrix(x=[1, 2], y=[5, 6])
        def function(x=20, y=30, z=40):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 2

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 8

        expected_output = {
            (1, 5, 1),
            (1, 5, 2),
            (2, 5, 1),
            (2, 5, 2),
            (1, 6, 1),
            (1, 6, 2),
            (2, 6, 1),
            (2, 6, 2),
        }
        output = set()
        for ctx in context_list:
            output.add(ctx.function())

        assert output == expected_output

    def check_only_defaults(self):
        @defaults(x=[3], z=[1, 2])
        def function(x=1, y=2, z=-1):
            return x, y, z

        assert parametrized(function)
        injected_args_list = [m.injected_args for m in function.marks]
        assert len(injected_args_list) == 1

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 2

        expected_output = {(3, 2, 1), (3, 2, 2)}
        output = set()
        for ctx in context_list:
            output.add(ctx.function())

        assert output == expected_output


================================================
FILE: tests/mark/resources/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/reporter/check_symbol_reporter.py
================================================
from pathlib import Path
from unittest.mock import Mock

from ducktape.tests.reporter import FailedTestSymbolReporter


def check_to_symbol_no_args(tmp_path):
    result = Mock(
        file_name="/test_folder/test_file",
        cls_name="TestClass",
        function_name="test_func",
        injected_args=None,
    )
    reporter = FailedTestSymbolReporter(Mock())
    reporter.working_dir = Path("/")
    assert reporter.to_symbol(result) == "test_folder/test_file::TestClass.test_func"


def check_to_symbol_relative_path(tmp_path):
    result = Mock(
        file_name="/test_folder/test_file",
        cls_name="TestClass",
        function_name="test_func",
        injected_args=None,
    )
    reporter = FailedTestSymbolReporter(Mock())
    reporter.working_dir = Path("/test_folder")
    assert reporter.to_symbol(result) == "test_file::TestClass.test_func"


def check_to_symbol_with_args():
    result = Mock(
        file_name="/test_folder/test_file",
        cls_name="TestClass",
        function_name="test_func",
        injected_args={"arg": "val"},
    )

    reporter = FailedTestSymbolReporter(Mock())
    reporter.working_dir = Path("/")
    assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func@{"arg":"val"}'


================================================
FILE: tests/runner/__init__.py
================================================


================================================
FILE: tests/runner/check_runner.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import patch

import pytest

from ducktape.cluster.node_container import NodeContainer, InsufficientResourcesError
from ducktape.tests.runner_client import RunnerClient
from ducktape.tests.status import PASS, FAIL
from ducktape.tests.test_context import TestContext
from ducktape.tests.runner import TestRunner
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.cluster.localhost import LocalhostCluster
from tests.ducktape_mock import FakeCluster, MockSender

import tests.ducktape_mock
from tests.runner.resources.test_fails_to_init import FailsToInitTest
from tests.runner.resources.test_fails_to_init_in_setup import FailsToInitInSetupTest
from .resources.test_bad_actor import BadActorTest
from .resources.test_thingy import ClusterTestThingy, TestThingy
from .resources.test_failing_tests import FailingTest
from ducktape.tests.reporter import JUnitReporter

from unittest.mock import Mock, MagicMock
import os
import xml.etree.ElementTree as ET

from .resources.test_various_num_nodes import VariousNumNodesTest

TEST_THINGY_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_thingy.py"))
FAILING_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_failing_tests.py"))
FAILS_TO_INIT_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_fails_to_init.py"))
FAILS_TO_INIT_IN_SETUP_TEST_FILE = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "resources/test_fails_to_init_in_setup.py")
)
VARIOUS_NUM_NODES_TEST_FILE = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "resources/test_various_num_nodes.py")
)
BAD_ACTOR_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_bad_actor.py"))


class CheckRunner(object):
    def check_insufficient_cluster_resources(self):
        """The test runner should behave sensibly when the cluster is too small to run a given test."""
        mock_cluster = FakeCluster(1)
        session_context = tests.ducktape_mock.session_context()

        test_context = TestContext(
            session_context=session_context,
            module=None,
            cls=TestThingy,
            function=TestThingy.test_pi,
            file=TEST_THINGY_FILE,
            cluster=mock_cluster,
            cluster_use_metadata={"num_nodes": 1000},
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), [test_context], 1)

        # Even though the cluster is too small, the test runner should this handle gracefully without raising an error
        results = runner.run_all_tests()
        assert len(results) == 1
        assert results.num_failed == 1
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def _do_expand(self, test_file, test_class, test_methods, cluster=None, session_context=None):
        ctx_list = []
        for f in test_methods:
            ctx_list.extend(
                MarkedFunctionExpander(
                    session_context=session_context,
                    cls=test_class,
                    function=f,
                    file=test_file,
                    cluster=cluster,
                ).expand()
            )
        return ctx_list

    def check_simple_run(self):
        """Check expected behavior when running a single test."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [
            TestThingy.test_pi,
            TestThingy.test_ignore1,
            TestThingy.test_ignore2,
            TestThingy.test_failure,
        ]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        results = runner.run_all_tests()
        assert len(results) == 4
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 1
        assert results.num_ignored == 2

        result_with_data = [r for r in results if r.data is not None][0]
        assert result_with_data.data == {"data": 3.14159}

    def check_deflake_run(self):
        """Check expected behavior when running a single test."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_flaky, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 2)

        results = runner.run_all_tests()
        assert len(results) == 2
        assert results.num_flaky == 1
        assert results.num_failed == 1
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_runner_report_junit(self):
        """Check we can serialize results into a xunit xml format. Also ensures that the XML report
        adheres to the Junit spec using xpath queries"""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [
            TestThingy.test_pi,
            TestThingy.test_ignore1,
            TestThingy.test_ignore2,
            TestThingy.test_failure,
        ]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        results = runner.run_all_tests()
        JUnitReporter(results).report()
        xml_report = os.path.join(session_context.results_dir, "report.xml")
        assert os.path.exists(xml_report)
        tree = ET.parse(xml_report)
        assert len(tree.findall("./testsuite/testcase/failure")) == 1
        assert len(tree.findall("./testsuite/testcase/skipped")) == 2
        assert len(tree.findall("./testsuite/testcase")) == 4

        passed = tree.findall("./testsuite/testcase/[@status='pass']")
        assert len(passed) == 1
        assert passed[0].get("classname") == "TestThingy"
        assert passed[0].get("name") == "test_pi"

        failures = tree.findall("./testsuite/testcase/[@status='fail']")
        assert len(failures) == 1
        assert failures[0].get("classname") == "TestThingy"
        assert failures[0].get("name") == "test_failure"

        ignores = tree.findall("./testsuite/testcase/[@status='ignore']")
        assert len(ignores) == 2
        assert ignores[0].get("classname") == "TestThingy"
        assert ignores[1].get("classname") == "TestThingy"

        assert ignores[0].get("name") == "test_ignore1"
        assert ignores[1].get("name") == "test_ignore2.x=5"

    def check_exit_first(self):
        """Confirm that exit_first in session context has desired effect of preventing any tests from running
        after the first test failure.
        """
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(**{"exit_first": True})

        test_methods = [FailingTest.test_fail]
        ctx_list = self._do_expand(
            test_file=FAILING_TEST_FILE,
            test_class=FailingTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert len(ctx_list) > 1
        assert len(results) == 1

    def check_exits_if_failed_to_initialize(self):
        """Validate that runner exits correctly when tests failed to initialize."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        ctx_list = self._do_expand(
            test_file=FAILS_TO_INIT_TEST_FILE,
            test_class=FailsToInitTest,
            test_methods=[FailsToInitTest.test_nothing],
            cluster=mock_cluster,
            session_context=session_context,
        )
        ctx_list.extend(
            self._do_expand(
                test_file=FAILS_TO_INIT_IN_SETUP_TEST_FILE,
                test_class=FailsToInitInSetupTest,
                test_methods=[FailsToInitInSetupTest.test_nothing],
                cluster=mock_cluster,
                session_context=session_context,
            )
        )

        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        # These tests fail to initialize, each class has two test methods, so should have 4 results, all failed
        assert len(results) == 4
        assert results.num_flaky == 0
        assert results.num_failed == 4
        assert results.num_passed == 0
        assert results.num_ignored == 0

    # mock an error reporting test failure - this should not prevent subsequent tests from execution and mark
    # failed test as failed correctly
    @patch.object(RunnerClient, "_exc_msg", side_effect=Exception)
    def check_sends_result_when_error_reporting_exception(self, exc_msg_mock):
        """Validates that an error when reporting an exception in the test doesn't prevent subsequent tests
        from executing"""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_failure, TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        results = runner.run_all_tests()
        assert len(results) == 2
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 1
        assert results.num_ignored == 0

    def check_run_failure_with_bad_cluster_allocation(self):
        """Check test should be marked failed if it under-utilizes the cluster resources."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(
            fail_bad_cluster_utilization="fail_bad_cluster_utilization"
        )

        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=ClusterTestThingy,
            test_methods=[ClusterTestThingy.test_bad_num_nodes],
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        results = runner.run_all_tests()

        assert len(results) == 1
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_test_failure_with_too_many_nodes_requested(self):
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(debug=True)

        ctx_list = self._do_expand(
            test_file=BAD_ACTOR_TEST_FILE,
            test_class=BadActorTest,
            test_methods=[BadActorTest.test_too_many_nodes],
            cluster=mock_cluster,
            session_context=session_context,
        )
        ctx_list.extend(
            self._do_expand(
                test_file=VARIOUS_NUM_NODES_TEST_FILE,
                test_class=VariousNumNodesTest,
                test_methods=[VariousNumNodesTest.test_one_node_a],
                cluster=mock_cluster,
                session_context=session_context,
            )
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 1
        assert results.num_ignored == 0
        passed = [r for r in results if r.test_status == PASS]
        failed = [r for r in results if r.test_status == FAIL]
        assert passed[0].test_id == "tests.runner.resources.test_various_num_nodes.VariousNumNodesTest.test_one_node_a"
        assert failed[0].test_id == "tests.runner.resources.test_bad_actor.BadActorTest.test_too_many_nodes"

    def check_runner_timeout(self):
        """Check process cleanup and error handling when runner times out.

        When timeout occurs, the runner should:
        1. Clean up all client processes
        2. Mark active tests as failed with 'Test timed out' message
        3. Mark remaining unrun tests as failed with 'Test not run' message
        4. Return results gracefully instead of raising
        """
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(max_parallel=1000, test_runner_timeout=1)

        test_methods = [TestThingy.test_delayed, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        # Should return results gracefully instead of raising
        results = runner.run_all_tests()

        # All client processes should be cleaned up
        assert not runner._client_procs

        # Both tests should be marked as failed
        assert len(results) == 2
        assert results.num_failed == 2
        assert results.num_passed == 0

        # Check that failure summaries contain timeout information
        for result in results:
            assert result.test_status == FAIL
            # Summary should mention either "timed out" or "not run" with the exception message
            assert "runner client unresponsive" in result.summary.lower()

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    def check_fail_greedy_tests(self, fail_greedy_tests):
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)

        test_methods = [
            VariousNumNodesTest.test_empty_cluster_annotation,
            VariousNumNodesTest.test_no_cluster_annotation,
            VariousNumNodesTest.test_zero_nodes,
        ]
        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert results.num_flaky == 0
        assert results.num_failed == (2 if fail_greedy_tests else 0)
        # zero-node test should always pass, whether we fail on greedy or not
        assert results.num_passed == (1 if fail_greedy_tests else 3)
        assert results.num_ignored == 0

    def check_cluster_shrink(self):
        """
        Check what happens if cluster loses a node while the runner is already running.
        SchedulerTestThingy has two 5-node tests, and one of each for 4, 3, and 2 nodes.

        Thus both 5-node tests should pass, first one failing during pre-allocation phase,
        second one shouldn't even attempt to be allocated.
        And all the other tests should pass still.
        """

        mock_cluster = ShrinkingLocalhostCluster(num_nodes=5)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)

        test_methods = [
            VariousNumNodesTest.test_five_nodes_a,
            VariousNumNodesTest.test_five_nodes_b,
            VariousNumNodesTest.test_four_nodes,
            VariousNumNodesTest.test_three_nodes_a,
            VariousNumNodesTest.test_two_nodes_a,
        ]

        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )

        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()

        assert len(results) == 5
        assert results.num_flaky == 0
        assert results.num_failed == 2  # both of the 5-node tests should fail
        assert results.num_passed == 3  # 4-node, 3-node and 2-node should all pass
        assert results.num_ignored == 0

    def check_cluster_shrink_reschedule(self):
        """
        Test that the test that failed to schedule initially due to a node going offline is not lost and is still
        scheduled when more nodes become available.

        We start with a 6-node cluster.
        First we run a long-ish 3-node test, leaving 3 nodes available.
        Then when trying to run a second 3-node test, we shrink the cluster, emulating one of the nodes
        going down - this leaves only 2 nodes available, so we cannot run this test.

        However, after the first 3-node test finishes running, it will return its 3 nodes back to the cluster,
        so the second 3-node test becomes schedulable again - this is what we test for.

        Also two two-node tests should pass too - they should be scheduled before the second 3-node test,
        while two nodes are waiting for the first 3-node test to finish.

        It's generally not a good practice to rely on sleep, but I think it's acceptable in this case,
        since we do need to rely on parallelism.
        """

        mock_cluster = ShrinkingLocalhostCluster(num_nodes=6, shrink_on=2)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)

        test_methods = [
            VariousNumNodesTest.test_three_nodes_asleep,
            VariousNumNodesTest.test_three_nodes_b,
            VariousNumNodesTest.test_two_nodes_a,
            VariousNumNodesTest.test_two_nodes_b,
        ]

        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )

        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()

        assert len(results) == 4
        assert results.num_flaky == 0
        assert results.num_failed == 0
        assert results.num_passed == 4
        assert results.num_ignored == 0

        # normal order on a 6-node cluster would be:
        #  - test_three_nodes_asleep, test_three_nodes_b, test_two_nodes_a, test_two_nodes_b
        # however the cluster would shrink to 5 nodes after scheduling the first 3-node test,
        # leaving no space for the second 3-node test to be scheduled, bumping it down the line,
        # while two 2-node tests will be scheduled alongside the
        expected_scheduling_order = [
            "VariousNumNodesTest.test_three_nodes_asleep",
            "VariousNumNodesTest.test_two_nodes_a",
            "VariousNumNodesTest.test_two_nodes_b",
            "VariousNumNodesTest.test_three_nodes_b",
        ]
        # We check the actual order the tests were scheduled in, since completion order might be different,
        # with so many fast tests running in parallel.
        actual_scheduling_order = [x.test_id for x in runner.test_schedule_log]
        assert actual_scheduling_order == expected_scheduling_order

    def check_cluster_shrink_to_zero(self):
        """
        Validates that if the cluster is shrunk to zero nodes size, no tests can run,
        but we still exit gracefully.
        """

        mock_cluster = ShrinkingLocalhostCluster(num_nodes=1, shrink_on=1)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)

        test_methods = [
            VariousNumNodesTest.test_one_node_a,
            VariousNumNodesTest.test_one_node_b,
        ]

        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )

        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()

        assert len(results) == 2
        assert results.num_flaky == 0
        assert results.num_failed == 2
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_runner_client_report(self):
        """Validates that an error when reporting an exception in the test doesn't prevent subsequent tests
        from executing"""
        mock_cluster = tests.ducktape_mock.mock_cluster()
        session_context = tests.ducktape_mock.session_context()
        test_context = tests.ducktape_mock.test_context(session_context=session_context)
        rc = RunnerClient(
            "localhost",
            22,
            test_context.test_id,
            0,
            "dummy",
            "/tmp/dummy",
            True,
            False,
            5,
        )
        rc.sender = MockSender()
        rc.cluster = mock_cluster
        rc.session_context = session_context
        rc.test_metadata = MagicMock()
        with patch("ducktape.tests.runner_client.RunnerClient._collect_test_context") as test_collect_patch:
            test_collect_patch.return_value = test_context
            rc.run()
        # get the last message, get the args send to that message, and get the first arg
        finished_result = rc.sender.send_results[-1][0][0]
        assert finished_result.get("event_type") == "FINISHED"
        assert finished_result["result"].summary == "Test Passed"

    def check_report_remaining_as_failed(self):
        """Test that _report_remaining_as_failed marks all unrun tests as failed."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_pi, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        # Call the method directly
        reason = "test reason for failure"
        runner._report_remaining_as_failed(reason)

        # All tests should be marked as failed
        assert len(runner.results) == 2
        assert runner.results.num_failed == 2
        for result in runner.results:
            assert result.test_status == FAIL
            assert f"Test not run: {reason}" in result.summary

        # Scheduler should be empty after draining
        assert len(runner.scheduler) == 0

    def check_report_active_as_failed(self):
        """Test that _report_active_as_failed marks running tests as failed."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        # Simulate an active test using the actual test_id from ctx_list
        from ducktape.tests.runner import TestKey
        import time

        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)
        runner.active_tests[test_key] = True
        runner.client_report[test_key] = {"runner_start_time": time.time() - 10}

        # Call the method
        reason = "test timeout reason"
        runner._report_active_as_failed(reason)

        # Test should be marked as failed
        assert len(runner.results) == 1
        assert runner.results.num_failed == 1
        results_list = [r for r in runner.results]
        assert len(results_list) == 1
        result = results_list[0]
        assert result.test_status == FAIL
        assert f"Test timed out: {reason}" in result.summary
        assert result.start_time < result.stop_time

        # Active tests should be cleared
        assert len(runner.active_tests) == 0

    def check_report_active_as_failed_frees_cluster(self):
        """Test that _report_active_as_failed frees cluster resources."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        # Simulate an active test with allocated cluster
        from ducktape.tests.runner import TestKey
        from ducktape.cluster.finite_subcluster import FiniteSubcluster
        import time

        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)

        # Allocate cluster nodes for this test
        allocated_nodes = mock_cluster.alloc(test_ctx.expected_cluster_spec)
        runner._test_cluster[test_key] = FiniteSubcluster(allocated_nodes)
        runner.active_tests[test_key] = True
        runner.client_report[test_key] = {"runner_start_time": time.time() - 10}

        # Check that nodes are allocated
        initial_available = mock_cluster.num_available_nodes()
        assert test_key in runner._test_cluster

        # Call the method
        reason = "test timeout reason"
        runner._report_active_as_failed(reason)

        # Cluster resources should be freed
        assert test_key not in runner._test_cluster
        assert mock_cluster.num_available_nodes() == initial_available + len(allocated_nodes)

        # Active tests should be cleared
        assert len(runner.active_tests) == 0

    def check_runner_client_shutdown_flag(self):
        """Test that runner client respects shutdown flag when sending messages."""
        mock_cluster = tests.ducktape_mock.mock_cluster()
        session_context = tests.ducktape_mock.session_context()
        test_context = tests.ducktape_mock.test_context(session_context=session_context)

        rc = RunnerClient(
            "localhost",
            22,
            test_context.test_id,
            0,
            "dummy",
            "/tmp/dummy",
            True,
            False,
            5,
        )
        rc.sender = MockSender()
        rc.cluster = mock_cluster
        rc.session_context = session_context

        # Set shutdown flag
        rc.runner_shutting_down = True

        # Try to send a message - should return None without sending
        from ducktape.tests.event import ClientEventFactory

        message_factory = ClientEventFactory(test_context.test_id, 0, "test-client")
        result = rc.send(message_factory.ready())

        assert result is None
        # No messages should have been sent
        assert len(rc.sender.send_results) == 0

    def check_duplicate_finished_message_handling(self):
        """Test that duplicate FINISHED messages are handled correctly with three distinct cases."""
        from ducktape.tests.runner import TestKey
        from ducktape.tests.result import TestResult
        from ducktape.cluster.finite_subcluster import FiniteSubcluster
        import time

        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)

        # Mock the receiver.send to avoid ZMQ socket issues in tests
        runner.receiver.send = Mock()

        # Simulate a test that has been started
        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)
        runner.active_tests[test_key] = True

        # Allocate cluster for this test
        allocated_nodes = mock_cluster.alloc(test_ctx.expected_cluster_spec)
        runner._test_cluster[test_key] = FiniteSubcluster(allocated_nodes)

        # Mock a client process
        runner._client_procs[test_key] = Mock()
        runner._client_procs[test_key].is_alive.return_value = False
        runner._client_procs[test_key].join.return_value = None
        runner._client_procs[test_key].exitcode = 0
        runner._client_procs[test_key].name = "MockProcess"

        # Create a FINISHED event
        from ducktape.tests.event import ClientEventFactory

        event_factory = ClientEventFactory(test_ctx.test_id, 1, "test-client")
        result = TestResult(test_ctx, 1, session_context, PASS, "Test passed", None, time.time(), time.time())
        event = event_factory.finished(result=result)

        # Case 1: Normal first FINISHED - should process normally
        runner._handle_finished(event)
        assert test_key not in runner.active_tests
        assert test_key in runner.finished_tests
        assert len(runner.results) == 1
        assert test_key not in runner._test_cluster
        assert runner.receiver.send.call_count == 1

        # Case 2: Duplicate FINISHED (ZMQ retry) - should be ignored gracefully
        runner._handle_finished(event)
        assert len(runner.results) == 1  # Should not add duplicate
        assert runner.receiver.send.call_count == 2  # ACK still sent

        # Case 3: FINISHED for unknown test - should raise RuntimeError
        unknown_event_factory = ClientEventFactory("unknown.test.id", 999, "test-client")
        unknown_result = TestResult(test_ctx, 999, session_context, PASS, "Test passed", None, time.time(), time.time())
        unknown_event = unknown_event_factory.finished(result=unknown_result)

        with pytest.raises(RuntimeError, match="not in active_tests"):
            runner._handle_finished(unknown_event)

    def check_timeout_exception_join_timeout_param(self):
        """Test that timeout_exception_join_timeout parameter is used correctly."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()

        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )

        # Create runner with custom timeout values
        runner = TestRunner(
            mock_cluster,
            session_context,
            Mock(),
            ctx_list,
            1,
            finish_join_timeout=10,
            timeout_exception_join_timeout=60,
        )

        assert runner.finish_join_timeout == 10
        assert runner.timeout_exception_join_timeout == 60


class ShrinkingLocalhostCluster(LocalhostCluster):
    def __init__(self, *args, shrink_on=1, **kwargs):
        super().__init__(*args, **kwargs)
        self.bad_nodes = NodeContainer()
        # which call to shrink on
        self.shrink_on = shrink_on
        self.num_alloc_calls = 0

    def do_alloc(self, cluster_spec):
        allocated = super().do_alloc(cluster_spec)
        self.num_alloc_calls += 1
        if self.shrink_on == self.num_alloc_calls:
            bad_node = allocated.pop()
            self._in_use_nodes.remove_node(bad_node)
            self.bad_nodes.add_node(bad_node)

            # simplified logic, we know all nodes are of the same OS/type
            # check if we don't have enough nodes any more
            # (which really should be true every time, since the largest test would be scheduled)
            if len(allocated) < len(cluster_spec):
                # return all good nodes back to be available
                for node in allocated:
                    self._in_use_nodes.remove_node(node)
                    self._available_nodes.add_node(node)

                raise InsufficientResourcesError("yeah")

        return allocated


================================================
FILE: tests/runner/check_runner_memory.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.runner import TestRunner
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.cluster.localhost import LocalhostCluster

from .resources.test_memory_leak import MemoryLeakTest

import math
from memory_profiler import memory_usage
import os
from queue import Queue
import statistics
from statistics import mean

import tests.ducktape_mock

from mock import Mock


N_TEST_CASES = 5


MEMORY_LEAK_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_memory_leak.py"))


class InstrumentedTestRunner(TestRunner):
    """Identical to TestRunner, except dump memory used by the current process
    before running each test.
    """

    def __init__(self, *args, **kwargs):
        self.queue = kwargs.get("queue")
        del kwargs["queue"]
        super(InstrumentedTestRunner, self).__init__(*args, **kwargs)

    def _run_single_test(self, test_context):
        # write current memory usage to file before running the test
        pid = os.getpid()
        current_memory = memory_usage(pid)[0]
        self.queue.put(current_memory)

        super(InstrumentedTestRunner, self)._run_single_test(test_context)


class CheckMemoryUsage(object):
    def setup_method(self, _):
        self.cluster = LocalhostCluster(num_nodes=100)
        self.session_context = tests.ducktape_mock.session_context()

    def check_for_inter_test_memory_leak(self):
        """Until v0.3.10, ducktape had a serious source of potential memory leaks.

        Because test_context objects held a reference to all services for the duration of a test run, the memory
        used by any individual service would not be garbage-collected until well after *all* tests had run.

        This memory leak was discovered in Kafka system tests, where many long-running system tests were enough
        to cumulatively use up the memory on the test machine, causing a cascade of test failures due to
        inability to allocate any more memory.

        This test provides a regression check against this type of memory leak; it fails without the fix, and passes
        with it.
        """
        # Get a list of test_context objects for the test runner
        ctx_list = []
        test_methods = [MemoryLeakTest.test_leak]
        for f in test_methods:
            ctx_list.extend(
                MarkedFunctionExpander(
                    session_context=self.session_context,
                    cls=MemoryLeakTest,
                    function=f,
                    file=MEMORY_LEAK_TEST_FILE,
                    cluster=self.cluster,
                ).expand()
            )
        assert len(ctx_list) == N_TEST_CASES  # Sanity check

        q = Queue()
        runner = InstrumentedTestRunner(self.cluster, self.session_context, Mock(), ctx_list, 1, queue=q)
        runner.run_all_tests()

        measurements = []
        while not q.empty():
            measurements.append(q.get())
        self.validate_memory_measurements(measurements)

    def validate_memory_measurements(self, measurements):
        """A rough heuristic to check that stair-case style memory leak is not present.

        The idea is that when well-behaved, in this specific test, the maximum memory usage should be near the "middle".
         Here we check that the maximum usage is within 5% of the median memory usage.

        What is meant by stair-case? When the leak was present in its most blatant form,
         each repetition of MemoryLeak.test_leak run by the test runner adds approximately a fixed amount of memory
         without freeing much, resulting in a memory usage profile that looks like a staircase going up to the right.
        """
        median_usage = statistics.median(measurements)
        max_usage = max(measurements)

        usage_stats = "\nmax: %s,\nmedian: %s,\nall: %s\n" % (
            max_usage,
            median_usage,
            measurements,
        )

        # we want to make sure that max usage doesn't exceed median usage by very much
        relative_diff = (max_usage - median_usage) / median_usage
        slope = self._linear_regression_slope(measurements)

        if slope > 0:
            # check max memory usage iff the memory measurements seem to be increasing overall
            assert relative_diff <= 0.05, (
                "max usage exceeded median usage by too much; there may be a memory leak: %s" % usage_stats
            )

    def _linear_regression_slope(self, arr):
        """Return the sign of the slope of the least squares fit line."""
        assert len(arr) > 0

        x_vals = [i for i in range(len(arr))]
        mean_x = mean(x_vals)
        mean_y = mean(arr)

        #            mean([x_i * y_i]) - mean_x * mean_y
        # slope =    -----------------------------------
        #                       variance([x_i])
        #
        # where variance is (1/N) * sum([(x_i - mean_x)^2])
        #
        # the denominator in regression formula is always positive, so it's enough to compute the numerator

        slope_numerator = mean([i * arr[i] for i in x_vals])
        slope_numerator = slope_numerator - (mean_x * mean_y)

        # return the sign
        return math.copysign(slope_numerator, 1)


================================================
FILE: tests/runner/check_sender_receiver.py
================================================
# Copyright 2021 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.localhost import LocalhostCluster
from tests.ducktape_mock import test_context, session_context

import logging
import pytest
from ducktape.tests.runner_client import Sender
from ducktape.tests.runner import Receiver
from ducktape.tests.event import ClientEventFactory, EventResponseFactory
from ducktape.errors import TimeoutError

import multiprocessing as mp
import os


class CheckSenderReceiver(object):
    def ready_response(self, client_id, port):
        sender_event_factory = ClientEventFactory("test_1", 0, client_id)
        sender = Sender(
            server_host="localhost",
            server_port=port,
            message_supplier=sender_event_factory,
            logger=logging,
        )
        sender.send(sender_event_factory.ready())

    def check_simple_messaging(self):
        s_context = session_context()
        cluster = LocalhostCluster(num_nodes=1000)
        t_context = test_context(s_context, cluster)

        client_id = "test-runner-{}-{}".format(os.getpid(), id(self))
        receiver_response_factory = EventResponseFactory()

        receiver = Receiver(5556, 5656)
        receiver.start()
        port = receiver.port

        try:
            p = mp.Process(target=self.ready_response, args=(client_id, port))
            p.start()

            event = receiver.recv(timeout=10000)
            assert event["event_type"] == ClientEventFactory.READY
            logging.info("replying to client")
            receiver.send(receiver_response_factory.ready(event, s_context, t_context, cluster))
        finally:
            p.join(timeout=2)  # Add timeout to prevent hanging if subprocess has issues
            receiver.close()

    def check_timeout(self):
        client_id = "test-runner-{}-{}".format(os.getpid(), id(self))

        receiver = Receiver(5556, 5656)
        receiver.start()
        port = receiver.port

        try:
            p = mp.Process(target=self.ready_response, args=(client_id, port))
            p.start()
            with pytest.raises(TimeoutError):
                receiver.recv(timeout=0)
        finally:
            # Terminate the process instead of waiting for it to timeout
            # (it will keep retrying for up to 15 seconds with default timeouts)
            p.terminate()
            p.join(timeout=1)
            receiver.close()

    def check_exponential_backoff(self):
        """Test that Sender applies exponential backoff on retries."""
        import time

        # Save original values
        original_timeout = Sender.REQUEST_TIMEOUT_MS
        original_retries = Sender.NUM_RETRIES

        # Override timeout values for faster testing BEFORE creating sender
        Sender.REQUEST_TIMEOUT_MS = 100  # 100ms instead of 3000ms
        Sender.NUM_RETRIES = 3  # 3 retries instead of 5

        try:
            client_id = "test-backoff-client"
            event_factory = ClientEventFactory("test_1", 0, client_id)

            # Create sender that will timeout (no receiver listening on this port)
            sender = Sender(
                server_host="localhost",
                server_port=9999,  # Nothing listening here
                message_supplier=event_factory,
                logger=logging,
            )

            start_time = time.time()
            try:
                sender.send(event_factory.ready())
            except RuntimeError as e:
                # Expected to fail with "Unable to receive response from driver"
                assert "Unable to receive response from driver" in str(e)

            elapsed = time.time() - start_time

            # With 2x backoff: 100ms + 200ms + 400ms = 700ms total
            # Without backoff: 100ms * 3 = 300ms total
            # We expect at least 500ms to prove backoff is working
            # Use 400ms to account for timing variance
            assert elapsed > 0.4, f"Expected > 0.4s with backoff, got {elapsed:.3f}s"
            assert elapsed < 2.0, f"Expected < 2.0s, got {elapsed:.3f}s (test taking too long)"

            sender.close()
        finally:
            # Restore original values
            Sender.REQUEST_TIMEOUT_MS = original_timeout
            Sender.NUM_RETRIES = original_retries


================================================
FILE: tests/runner/fake_remote_account.py
================================================
from ducktape.cluster.consts import LINUX, WINDOWS
from ducktape.cluster.remoteaccount import RemoteAccount


class FakeRemoteAccount(RemoteAccount):
    def __init__(self, *args, is_available=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.os = LINUX
        self.is_available = is_available

    def available(self):
        return self.is_available

    def fetch_externally_routable_ip(self, *args, **kwargs):
        return "fake ip"


class FakeWindowsRemoteAccount(FakeRemoteAccount):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.os = WINDOWS


def create_fake_remote_account(*args, **kwargs):
    return FakeRemoteAccount(*args, **kwargs)


================================================
FILE: tests/runner/resources/__init__.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/runner/resources/test_bad_actor.py
================================================
from ducktape.mark.resource import cluster
from ducktape.services.service import Service
from ducktape.tests.test import Test


class FakeService(Service):
    pass


class BadActorTest(Test):
    @cluster(num_nodes=2)
    def test_too_many_nodes(self):
        """
        This test should fail to allocate and
        should be dealt with gracefully by the framework, marking it as failed
        and moving on.
        """
        FakeService(self.test_context, num_nodes=3)


================================================
FILE: tests/runner/resources/test_failing_tests.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from ducktape.mark import matrix

"""All tests in this module fail"""


class FailingTest(Test):
    def __init__(self, test_context):
        super(FailingTest, self).__init__(test_context)

    @cluster(num_nodes=1000)
    @matrix(x=[_ for _ in range(2)])
    def test_fail(self, x):
        print("Test %s fails!" % x)
        raise RuntimeError("This test throws an error!")


================================================
FILE: tests/runner/resources/test_fails_to_init.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test
from ducktape.mark import matrix

"""All tests in this module fail"""


class FailsToInitTest(Test):
    """All tests in this class fail to initialize due to an exception in constructor"""

    def __init__(self, test_context):
        super(FailsToInitTest, self).__init__(test_context)
        x = None
        x.split(":")

    @matrix(x=[_ for _ in range(2)])
    def test_nothing(self):
        self.logger.info("NOTHING")


================================================
FILE: tests/runner/resources/test_fails_to_init_in_setup.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test
from ducktape.mark import matrix

"""All tests in this module fail"""


class FailsToInitInSetupTest(Test):
    """All tests in this class fail to initialize due to an exception in setUp() method"""

    def __init__(self, test_context):
        super(FailsToInitInSetupTest, self).__init__(test_context)

    def setUp(self):
        x = None
        x.split(":")

    @matrix(x=[_ for _ in range(2)])
    def test_nothing(self):
        self.logger.info("NOTHING")


================================================
FILE: tests/runner/resources/test_memory_leak.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from ducktape.services.service import Service
from ducktape.mark import matrix


MEMORY_EATER_LIST_SIZE = 10000000
N_TEST_CASES = 5


class MemoryEater(Service):
    """Simple service that has a reference to a list with many elements"""

    def __init__(self, context):
        super(MemoryEater, self).__init__(context, 1)
        self.items = []

    def start_node(self, node):
        self.items = [x for x in range(MEMORY_EATER_LIST_SIZE)]

    def stop_node(self, node):
        pass

    def clean_node(self, node):
        pass

    @property
    def num_nodes(self):
        return 1


class MemoryLeakTest(Test):
    """A group of identical "memory-hungry" ducktape tests.
    Each test holds a reference to a service which itself holds a reference to a large (memory intensive) object.
    """

    def __init__(self, test_context):
        super(MemoryLeakTest, self).__init__(test_context)
        self.memory_eater = MemoryEater(test_context)

    @cluster(num_nodes=100)
    @matrix(x=[i for i in range(N_TEST_CASES)])
    def test_leak(self, x):
        self.memory_eater.start()


================================================
FILE: tests/runner/resources/test_thingy.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from ducktape.tests.test import Test
from ducktape.mark import ignore, parametrize
from ducktape.mark.resource import cluster


_flake = False


class TestThingy(Test):
    """Fake ducktape test class"""

    @cluster(num_nodes=1000)
    def test_pi(self):
        return {"data": 3.14159}

    @cluster(num_nodes=1000)
    def test_delayed(self):
        time.sleep(1)

    @cluster(num_nodes=1000)
    @ignore
    def test_ignore1(self):
        pass

    @cluster(num_nodes=1000)
    @ignore(x=5)
    @parametrize(x=5)
    def test_ignore2(self, x=2):
        pass

    @cluster(num_nodes=1000)
    def test_failure(self):
        raise Exception("This failed")

    @cluster(num_nodes=1000)
    def test_flaky(self):
        global _flake
        flake, _flake = _flake, not _flake
        assert flake


class ClusterTestThingy(Test):
    """Fake ducktape test class"""

    @cluster(num_nodes=10)
    def test_bad_num_nodes(self):
        pass


================================================
FILE: tests/runner/resources/test_various_num_nodes.py
================================================
import time

from ducktape.mark.resource import cluster
from ducktape.tests.test import Test


class VariousNumNodesTest(Test):
    """
    Allocates various number of nodes.
    """

    @cluster(num_nodes=5)
    def test_five_nodes_a(self):
        assert True

    @cluster(num_nodes=5)
    def test_five_nodes_b(self):
        assert True

    @cluster(num_nodes=4)
    def test_four_nodes(self):
        assert True

    @cluster(num_nodes=3)
    def test_three_nodes_asleep(self):
        time.sleep(3)
        assert True

    @cluster(num_nodes=3)
    def test_three_nodes_a(self):
        assert True

    @cluster(num_nodes=3)
    def test_three_nodes_b(self):
        assert True

    @cluster(num_nodes=2)
    def test_two_nodes_a(self):
        assert True

    @cluster(num_nodes=2)
    def test_two_nodes_b(self):
        assert True

    @cluster(num_nodes=1)
    def test_one_node_a(self):
        assert True

    @cluster(num_nodes=1)
    def test_one_node_b(self):
        assert True

    def test_no_cluster_annotation(self):
        assert True

    @cluster()
    def test_empty_cluster_annotation(self):
        assert True

    # this one is valid regardless of
    # whether the greedy tests are allowed or not
    # because it's not greedy, quite the opposite
    @cluster(num_nodes=0)
    def test_zero_nodes(self):
        assert True


================================================
FILE: tests/scheduler/__init__.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/scheduler/check_scheduler.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import collections

from ducktape.cluster.cluster_spec import ClusterSpec
from tests.ducktape_mock import FakeCluster
from ducktape.tests.scheduler import TestScheduler
from ducktape.services.service import Service

FakeContext = collections.namedtuple("FakeContext", ["test_id", "expected_num_nodes", "expected_cluster_spec"])


class CheckScheduler(object):
    def setup_method(self, _):
        self.cluster = FakeCluster(100)
        self.tc0 = FakeContext(0, expected_num_nodes=10, expected_cluster_spec=ClusterSpec.simple_linux(10))
        self.tc1 = FakeContext(1, expected_num_nodes=50, expected_cluster_spec=ClusterSpec.simple_linux(50))
        self.tc2 = FakeContext(
            2,
            expected_num_nodes=100,
            expected_cluster_spec=ClusterSpec.simple_linux(100),
        )
        self.tc_list = [
            self.tc0,
            self.tc1,
            self.tc2,
        ]

    def check_empty(self):
        """Check expected behavior of empty scheduler."""
        scheduler = TestScheduler([], self.cluster)

        assert len(scheduler) == 0
        assert scheduler.peek() is None

    def check_non_empty_cluster_too_small(self):
        """Ensure that scheduler does not return tests if the cluster does not have enough available nodes."""

        scheduler = TestScheduler(self.tc_list, self.cluster)
        assert len(scheduler) == len(self.tc_list)
        assert scheduler.peek() is not None

        # alloc all cluster nodes so none are available
        self.cluster.alloc(Service.setup_cluster_spec(num_nodes=len(self.cluster)))
        assert self.cluster.num_available_nodes() == 0

        # peeking should not yield an object
        assert scheduler.peek() is None

    def check_simple_usage(self):
        """Check usage with fully available cluster."""

        scheduler = TestScheduler(self.tc_list, self.cluster)

        c = 2
        while len(scheduler) > 0:
            t = scheduler.peek()
            assert t.test_id == c
            assert len(scheduler) == c + 1

            scheduler.remove(t)
            assert len(scheduler) == c

            c -= 1

    def check_with_changing_cluster_availability(self):
        """Modify cluster usage in between calls to next()"""

        scheduler = TestScheduler(self.tc_list, self.cluster)

        # start with 100-node cluster (configured in setup_method())
        # allocate 60 nodes; only test_id 0 (which needs 10 nodes) should be available
        nodes = self.cluster.alloc(Service.setup_cluster_spec(num_nodes=60))
        assert self.cluster.num_available_nodes() == 40
        t = scheduler.peek()
        assert t == self.tc0
        scheduler.remove(t)
        assert scheduler.peek() is None

        # return 10 nodes, so 50 are available in the cluster
        # next test from the scheduler should be test id 1 (which needs 50 nodes)
        return_nodes = nodes[:10]
        keep_nodes = nodes[10:]
        self.cluster.free(return_nodes)
        assert self.cluster.num_available_nodes() == 50
        t = scheduler.peek()
        assert t == self.tc1
        scheduler.remove(t)
        assert scheduler.peek() is None

        # return remaining nodes, so cluster is fully available
        # next test from scheduler should be test id 2 (which needs 100 nodes)
        return_nodes = keep_nodes
        self.cluster.free(return_nodes)
        assert self.cluster.num_available_nodes() == len(self.cluster)
        t = scheduler.peek()
        assert t == self.tc2
        scheduler.remove(t)
        # scheduler should become empty now
        assert len(scheduler) == 0
        assert scheduler.peek() is None

    def check_filter_unschedulable_tests(self):
        self.cluster = FakeCluster(49)  # only test id 0 can be scheduled on this cluster
        scheduler = TestScheduler(self.tc_list, self.cluster)

        unschedulable = scheduler.filter_unschedulable_tests()
        assert set(unschedulable) == {self.tc1, self.tc2}
        # subsequent calls should return empty list and not modify scheduler
        assert not scheduler.filter_unschedulable_tests()
        assert scheduler.peek() == self.tc0

    def check_drain_remaining_tests(self):
        """Test that drain_remaining_tests returns all tests and empties the scheduler."""
        scheduler = TestScheduler(self.tc_list, self.cluster)

        # Initially scheduler should have all tests
        assert len(scheduler) == len(self.tc_list)

        # Drain all remaining tests
        remaining = scheduler.drain_remaining_tests()

        # Should return all tests
        assert len(remaining) == len(self.tc_list)
        assert set(remaining) == set(self.tc_list)

        # Scheduler should now be empty
        assert len(scheduler) == 0
        assert scheduler.peek() is None

        # Calling again should return empty list
        remaining_again = scheduler.drain_remaining_tests()
        assert len(remaining_again) == 0

    def check_drain_remaining_tests_partial(self):
        """Test drain_remaining_tests after some tests have been removed."""
        scheduler = TestScheduler(self.tc_list, self.cluster)

        # Remove one test
        t = scheduler.peek()
        scheduler.remove(t)
        initial_len = len(scheduler)

        # Drain remaining tests
        remaining = scheduler.drain_remaining_tests()

        # Should return only the remaining tests
        assert len(remaining) == initial_len
        assert t not in remaining

        # Scheduler should be empty
        assert len(scheduler) == 0


================================================
FILE: tests/services/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/services/check_background_thread_service.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.services.background_thread import BackgroundThreadService
from ducktape.errors import TimeoutError
from tests.ducktape_mock import test_context, MockNode
import pytest
import time


class DummyService(BackgroundThreadService):
    """Single node service that sleeps for self.run_time_sec seconds in a background thread."""

    def __init__(self, context, run_time_sec, exc=None):
        super(DummyService, self).__init__(context, 1)
        self.running = False
        self.run_time_sec = run_time_sec
        self._exc = exc

    def who_am_i(self, node=None):
        return "DummyService"

    def idx(self, node):
        return 1

    def allocate_nodes(self):
        self.nodes = [MockNode()]

    def _worker(self, idx, node):
        if self._exc:
            raise self._exc

        self.running = True

        end = time.time() + self.run_time_sec
        while self.running:
            time.sleep(0.1)
            if time.time() > end:
                self.running = False
                break

    def stop_node(self, node):
        self.running = False


class CheckBackgroundThreadService(object):
    def setup_method(self, method):
        self.context = test_context()

    def check_service_constructor(self):
        """Check that BackgroundThreadService constructor corresponds to the base class's one."""
        exp_spec = ClusterSpec.simple_linux(10)
        service = BackgroundThreadService(self.context, cluster_spec=exp_spec)
        assert service.cluster_spec == exp_spec

        service = BackgroundThreadService(self.context, num_nodes=20)
        assert service.cluster_spec.size() == 20

        with pytest.raises(RuntimeError):
            BackgroundThreadService(self.context, num_nodes=20, cluster_spec=exp_spec)

    def check_service_timeout(self):
        """Test that wait(timeout_sec) raise a TimeoutError in approximately the expected time."""
        self.service = DummyService(self.context, float("inf"))
        self.service.start()
        start = time.time()
        timeout_sec = 0.1
        try:
            self.service.wait(timeout_sec=timeout_sec)
            raise Exception("Expected service to timeout.")
        except TimeoutError:
            end = time.time()
            # Relative difference should be pretty small
            # within 10% should be reasonable
            actual_timeout = end - start
            relative_difference = abs(timeout_sec - actual_timeout) / timeout_sec
            assert relative_difference < 0.1, (
                "Correctly threw timeout error, but timeout doesn't match closely with expected timeout. "
                + "(expected timeout, actual timeout): (%s, %s)" % (str(timeout_sec), str(actual_timeout))
            )

    def check_no_timeout(self):
        """Run an instance of DummyService with a short run_time_sec. It should stop without
        timing out."""

        self.service = DummyService(self.context, run_time_sec=0.1)
        self.service.start()
        self.service.wait(timeout_sec=0.5)

    def check_wait_node(self):
        self.service = DummyService(self.context, run_time_sec=float("inf"))
        self.service.start()
        node = self.service.nodes[0]
        assert not self.service.wait_node(node, timeout_sec=0.1)
        self.service.stop_node(node)
        assert self.service.wait_node(node)

    def check_wait_node_no_start(self):
        self.service = DummyService(self.context, run_time_sec=float("inf"))
        node = self.service.nodes[0]
        assert self.service.wait_node(node)

    def check_background_exception(self):
        self.service = DummyService(self.context, float("inf"), Exception("failure"))
        self.service.start()
        with pytest.raises(Exception):
            self.service.wait(timeout_sec=1)
        with pytest.raises(Exception):
            self.service.stop(timeout_sec=1)
        assert hasattr(self.service, "errors")


================================================
FILE: tests/services/check_jvm_logging.py
================================================
# Copyright 2024 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.services.service import Service
from ducktape.jvm_logging import JVMLogger
from tests.ducktape_mock import test_context, session_context
from ducktape.cluster.localhost import LocalhostCluster
from unittest.mock import Mock
import os


class JavaService(Service):
    """Mock Java service for testing JVM logging."""

    def __init__(self, context, num_nodes):
        super(JavaService, self).__init__(context, num_nodes)
        self.start_called = False
        self.clean_called = False
        self.ssh_commands = []

    def idx(self, node):
        return 1

    def start_node(self, node, **kwargs):
        super(JavaService, self).start_node(node, **kwargs)
        self.start_called = True
        # Simulate a Java command being run
        node.account.ssh("java -version")

    def clean_node(self, node, **kwargs):
        super(JavaService, self).clean_node(node, **kwargs)
        self.clean_called = True


def create_mock_node():
    """Create a mock node with a mock account that tracks SSH calls."""
    node = Mock()
    node.account = Mock()
    node.account.ssh = Mock(return_value=None)
    node.account.hostname = "mock-host"
    node.account.externally_routable_ip = "127.0.0.1"
    return node


class CheckJVMLogging(object):
    def setup_method(self, _):
        self.cluster = LocalhostCluster()
        self.session_context = session_context()
        self.context = test_context(self.session_context, cluster=self.cluster)
        self.jvm_logger = JVMLogger()

    def check_enable_for_service(self):
        """Check that JVM logging can be enabled for a service."""
        service = JavaService(self.context, 1)

        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)

        # Verify logs dict was updated
        assert hasattr(service, "logs")
        assert "jvm_gc_log" in service.logs
        assert "jvm_stdout_stderr" in service.logs
        assert "jvm_heap_dump" in service.logs

        # Verify helper methods were added
        assert hasattr(service, "JVM_LOG_DIR")
        assert hasattr(service, "jvm_options")
        assert hasattr(service, "setup_jvm_logging")
        assert hasattr(service, "clean_jvm_logs")

    def check_jvm_options_format(self):
        """Check that JVM options string is properly formatted."""
        jvm_opts = self.jvm_logger._get_jvm_options()

        # Should start with -Xlog:disable to prevent console pollution
        assert jvm_opts.startswith("-Xlog:disable")

        # Should contain key logging options
        assert "gc*:file=" in jvm_opts
        assert "HeapDumpOnOutOfMemoryError" in jvm_opts
        assert "safepoint=info" in jvm_opts
        assert "class+load=info" in jvm_opts
        assert "jit+compilation=info" in jvm_opts
        assert "NativeMemoryTracking=summary" in jvm_opts

    def check_ssh_wrapping(self):
        """Check that all SSH methods are wrapped to inject JDK_JAVA_OPTIONS."""
        # Create a mock node first
        mock_node = create_mock_node()
        # Add ssh_capture and ssh_output methods to mock
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        # Enable JVM logging for a service
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)

        # Manually call the wrapped start_node with our mock node
        # This simulates what happens during service.start()
        service.start_node(mock_node)

        # Verify all SSH methods were wrapped
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")

        # The current methods should be callables (the wrapper functions)
        assert callable(mock_node.account.ssh)
        assert callable(mock_node.account.ssh_capture)
        assert callable(mock_node.account.ssh_output)
        assert service.start_called

    def check_ssh_methods_inject_options(self):
        """Check that wrapped SSH methods actually inject JDK_JAVA_OPTIONS."""
        service = JavaService(self.context, 1)
        node = service.nodes[0]

        # Track what commands are actually executed
        executed_commands = []

        def track_ssh(cmd, allow_fail=False):
            executed_commands.append(("ssh", cmd))
            return 0

        def track_ssh_capture(cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None):
            executed_commands.append(("ssh_capture", cmd))
            return iter([])

        def track_ssh_output(cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
            executed_commands.append(("ssh_output", cmd))
            return ""

        # Set the tracking functions as the original methods
        node.account.ssh = track_ssh
        node.account.ssh_capture = track_ssh_capture
        node.account.ssh_output = track_ssh_output

        # Enable JVM logging - this wraps start_node
        self.jvm_logger.enable_for_service(service)

        # Start node - this wraps the SSH methods
        service.start_node(node)

        # Clear the setup commands (from mkdir, etc)
        executed_commands.clear()

        # Now execute commands through the wrapped methods
        node.account.ssh("java -version")
        node.account.ssh_capture("java -jar app.jar")
        node.account.ssh_output("java -cp test.jar Main")

        # Verify JDK_JAVA_OPTIONS was injected in all commands
        assert len(executed_commands) == 3, f"Expected 3 commands, got {len(executed_commands)}"
        for method, cmd in executed_commands:
            assert "JDK_JAVA_OPTIONS=" in cmd, f"{method} didn't inject JDK_JAVA_OPTIONS: {cmd}"
            assert "-Xlog:disable" in cmd, f"{method} didn't include JVM options: {cmd}"
            # Verify it uses env command (more portable)
            assert cmd.startswith("env JDK_JAVA_OPTIONS="), f"{method} doesn't use env command: {cmd}"
            # Verify it appends to existing (${JDK_JAVA_OPTIONS:-})
            assert "${JDK_JAVA_OPTIONS:-}" in cmd, f"{method} doesn't preserve existing options: {cmd}"
            # Verify no extra quotes around the options (shlex.quote would add them)
            assert "'-Xlog:disable" not in cmd, f"{method} has unwanted quotes around options: {cmd}"

    def check_preserves_existing_jvm_options(self):
        """Check that existing JDK_JAVA_OPTIONS are preserved and appended to."""
        service = JavaService(self.context, 1)
        node = service.nodes[0]

        # Track what commands are executed
        executed_commands = []

        def track_ssh(cmd, allow_fail=False):
            executed_commands.append(cmd)
            return 0

        node.account.ssh = track_ssh
        node.account.ssh_capture = Mock(return_value=iter([]))
        node.account.ssh_output = Mock(return_value="")

        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)

        # Start node
        service.start_node(node)
        executed_commands.clear()

        # Execute a command - the wrapper should preserve any existing JDK_JAVA_OPTIONS
        node.account.ssh("java -version")

        # Verify the command structure
        assert len(executed_commands) == 1
        cmd = executed_commands[0]

        # Should use env command
        assert cmd.startswith("env JDK_JAVA_OPTIONS=")

        # Should preserve existing options with ${JDK_JAVA_OPTIONS:-}
        assert "${JDK_JAVA_OPTIONS:-}" in cmd

        # Should append our JVM logging options
        assert "-Xlog:disable" in cmd

    def check_ssh_wrap_idempotent(self):
        """Check that SSH wrapping is idempotent (handles restarts)."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)

        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)

        # Get the wrapped start_node method
        wrapped_start_node = service.start_node

        # Call it twice with the same node
        wrapped_start_node(mock_node)

        # Verify wrapping happened
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")

        # Call again (simulating restart)
        wrapped_start_node(mock_node)

        # SSH should still have the original_* attributes (idempotent)
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")

    def check_clean_node_behavior(self):
        """Check that clean_node properly cleans up and restores SSH methods."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)

        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)

        # Get the wrapped methods
        wrapped_start_node = service.start_node
        wrapped_clean_node = service.clean_node

        # Start node to wrap SSH methods
        wrapped_start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")

        # Clean node to restore SSH methods
        wrapped_clean_node(mock_node)

        # Verify SSH methods were restored
        assert not hasattr(mock_node.account, "original_ssh")
        assert not hasattr(mock_node.account, "original_ssh_capture")
        assert not hasattr(mock_node.account, "original_ssh_output")

    def check_log_paths(self):
        """Check that log paths are correctly configured."""
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)

        log_dir = self.jvm_logger.log_dir

        # Verify log paths
        assert service.logs["jvm_gc_log"]["path"] == os.path.join(log_dir, "gc.log")
        assert service.logs["jvm_stdout_stderr"]["path"] == os.path.join(log_dir, "jvm.log")
        assert service.logs["jvm_heap_dump"]["path"] == os.path.join(log_dir, "heap_dump.hprof")

        # Verify collection flags
        assert service.logs["jvm_gc_log"]["collect_default"] is True
        assert service.logs["jvm_stdout_stderr"]["collect_default"] is True
        assert service.logs["jvm_heap_dump"]["collect_default"] is False

    def check_kwargs_preserved(self):
        """Check that start_node and clean_node preserve kwargs after wrapping."""
        service = JavaService(self.context, 1)

        # Replace the node with a mock node
        mock_node = create_mock_node()
        service.nodes = [mock_node]

        self.jvm_logger.enable_for_service(service)

        # This should not raise an error even with extra kwargs
        service.start(timeout_sec=30, clean=False)
        assert service.start_called

    def check_setup_failure_doesnt_break_wrapping(self):
        """Check that if _setup_on_node fails, the error propagates correctly."""
        from ducktape.cluster.remoteaccount import RemoteCommandError

        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        # Make mkdir fail
        def failing_ssh(cmd, allow_fail=False):
            if "mkdir" in cmd:
                if not allow_fail:
                    raise RemoteCommandError(mock_node.account, cmd, 1, "Permission denied")
                return 1
            return 0

        mock_node.account.ssh = failing_ssh

        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)

        # start_node should fail during setup
        try:
            service.start_node(mock_node)
            assert False, "Expected RemoteCommandError"
        except RemoteCommandError as e:
            assert "Permission denied" in str(e)

    def check_wrapped_ssh_failure_propagates(self):
        """Check that SSH failures are properly propagated through the wrapper."""
        from ducktape.cluster.remoteaccount import RemoteCommandError

        mock_node = create_mock_node()

        # Track calls
        ssh_calls = []

        def tracking_ssh(cmd, allow_fail=False):
            ssh_calls.append((cmd, allow_fail))
            # Fail on "test-java" commands (not the real java from JavaService.start_node)
            if "test-java" in cmd and not allow_fail:
                raise RemoteCommandError(mock_node.account, cmd, 127, "java: command not found")
            return 0

        mock_node.account.ssh = tracking_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        service.start_node(mock_node)

        ssh_calls.clear()

        # Command that should fail
        try:
            mock_node.account.ssh("test-java -version")
            assert False, "Expected RemoteCommandError"
        except RemoteCommandError as e:
            assert "java: command not found" in str(e)

        # Command with allow_fail=True should not raise
        result = mock_node.account.ssh("test-java -version", allow_fail=True)
        assert result == 0  # Our mock returns 0 when allow_fail=True

    def check_wrapped_ssh_with_allow_fail(self):
        """Check that allow_fail parameter works correctly through the wrapper."""
        mock_node = create_mock_node()

        # Track calls and their allow_fail parameter
        ssh_calls = []

        def tracking_ssh(cmd, allow_fail=False):
            ssh_calls.append((cmd, allow_fail))
            return 1 if "fail" in cmd else 0

        mock_node.account.ssh = tracking_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        service.start_node(mock_node)

        ssh_calls.clear()

        # Test with allow_fail=False (default)
        mock_node.account.ssh("echo success")
        assert ssh_calls[-1][1] is False

        # Test with allow_fail=True
        mock_node.account.ssh("echo fail", allow_fail=True)
        assert ssh_calls[-1][1] is True

    def check_cleanup_failure_still_restores_ssh(self):
        """Check that even if cleanup fails, SSH methods are still restored."""
        from ducktape.cluster.remoteaccount import RemoteCommandError

        mock_node = create_mock_node()

        cleanup_called = []

        def tracking_ssh(cmd, allow_fail=False):
            if "rm -rf" in cmd:
                cleanup_called.append(True)
                if not allow_fail:
                    raise RemoteCommandError(mock_node.account, cmd, 1, "rm failed")
            return 0

        original_ssh = tracking_ssh
        mock_node.account.ssh = original_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)

        # Start node
        service.start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")

        # Clean node - cleanup uses allow_fail=True, so it shouldn't raise
        service.clean_node(mock_node)

        # Verify cleanup was attempted
        assert len(cleanup_called) > 0

        # Verify SSH was restored despite cleanup "failure"
        assert not hasattr(mock_node.account, "original_ssh")

    def check_double_cleanup_is_safe(self):
        """Check that calling clean_node twice doesn't cause errors."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)

        # Start node
        service.start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")

        # Clean node first time
        service.clean_node(mock_node)
        assert not hasattr(mock_node.account, "original_ssh")

        # Clean node second time - should not raise
        service.clean_node(mock_node)
        assert not hasattr(mock_node.account, "original_ssh")


================================================
FILE: tests/services/check_service.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.services.service import Service
from tests.ducktape_mock import test_context, session_context
from ducktape.cluster.localhost import LocalhostCluster


class DummyService(Service):
    """Simple fake service class."""

    def __init__(self, context, num_nodes):
        super(DummyService, self).__init__(context, num_nodes)
        self.started_count = 0
        self.cleaned_count = 0
        self.stopped_count = 0

        self.started_kwargs = {}
        self.cleaned_kwargs = {}
        self.stopped_kwargs = {}

    def idx(self, node):
        return 1

    def start_node(self, node, **kwargs):
        super(DummyService, self).start_node(node, **kwargs)
        self.started_count += 1
        self.started_kwargs = kwargs

    def clean_node(self, node, **kwargs):
        super(DummyService, self).clean_node(node, **kwargs)
        self.cleaned_count += 1
        self.cleaned_kwargs = kwargs

    def stop_node(self, node, **kwargs):
        super(DummyService, self).stop_node(node, **kwargs)
        self.stopped_count += 1
        self.stopped_kwargs = kwargs


class DifferentDummyService(Service):
    """Another fake service class."""

    def __init__(self, context, num_nodes):
        super(DifferentDummyService, self).__init__(context, num_nodes)

    def idx(self, node):
        return 1


class CheckAllocateFree(object):
    def setup_method(self, _):
        self.cluster = LocalhostCluster()
        self.session_context = session_context()
        self.context = test_context(self.session_context, cluster=self.cluster)

    def check_allocate_free(self):
        """Check that allocating and freeing nodes works.

        This regression test catches the error with Service.free() introduced in v0.3.3 and fixed in v0.3.4
        """

        # Node allocation takes place during service instantiation
        initial_cluster_size = len(self.cluster)
        self.service = DummyService(self.context, 10)
        assert self.cluster.num_available_nodes() == initial_cluster_size - 10

        self.service.free()
        assert self.cluster.num_available_nodes() == initial_cluster_size

    def check_order(self):
        """Check expected behavior with service._order method"""
        self.dummy0 = DummyService(self.context, 4)
        self.diffDummy0 = DifferentDummyService(self.context, 100)
        self.dummy1 = DummyService(self.context, 1)
        self.diffDummy1 = DifferentDummyService(self.context, 2)
        self.diffDummy2 = DifferentDummyService(self.context, 5)

        assert self.dummy0._order == 0
        assert self.dummy1._order == 1
        assert self.diffDummy0._order == 0
        assert self.diffDummy1._order == 1
        assert self.diffDummy2._order == 2


class CheckStartStop(object):
    def setup_method(self, _):
        self.cluster = LocalhostCluster()
        self.session_context = session_context()
        self.context = test_context(self.session_context, cluster=self.cluster)

    def check_start_stop_clean(self):
        """
        Checks that start, stop, and clean invoke the expected per-node calls, and that start also runs stop and
        clean
        """
        service = DummyService(self.context, 2)

        service.start()
        assert service.started_count == 2
        assert service.stopped_count == 2
        assert service.cleaned_count == 2

        service.stop()
        assert service.stopped_count == 4

        service.start(clean=False)
        assert service.started_count == 4
        assert service.stopped_count == 6
        assert service.cleaned_count == 2

        service.stop()
        assert service.stopped_count == 8

        service.clean()
        assert service.cleaned_count == 4

    def check_kwargs_support(self):
        """Check that start, stop, and clean, and their per-node versions, can accept keyword arguments"""
        service = DummyService(self.context, 2)

        kwargs = {"foo": "bar"}
        service.start(**kwargs)
        assert service.started_kwargs == kwargs
        service.stop(**kwargs)
        assert service.stopped_kwargs == kwargs
        service.clean(**kwargs)
        assert service.cleaned_kwargs == kwargs


================================================
FILE: tests/templates/__init__.py
================================================


================================================
FILE: tests/templates/service/__init__.py
================================================


================================================
FILE: tests/templates/service/check_render.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.services.service import Service
from tests.ducktape_mock import test_context


class CheckTemplateRenderingService(object):
    """
    Tests rendering of templates, using input from a Service
    """

    def new_instance(self):
        return TemplateRenderingService()

    def check_simple(self):
        self.new_instance().render_simple()

    def check_single_variable(self):
        self.new_instance().render_single_variable()

    def check_overload(self):
        self.new_instance().render_overload()

    def check_class_template(self):
        self.new_instance().render_class_template()

    def check_file_template(self):
        self.new_instance().render_file_template()


class TemplateRenderingService(Service):
    NO_VARIABLE = "fixed content"
    SIMPLE_VARIABLE = "Hello {{a_field}}!"
    OVERLOAD_VARIABLES = "{{normal}} {{overload}}"

    CLASS_CONSTANT_TEMPLATE = "{{ CLASS_CONSTANT }}"
    CLASS_CONSTANT = "constant"

    def __init__(self):
        super(TemplateRenderingService, self).__init__(test_context(), 1)

    def render_simple(self):
        """Test that a trivial template works"""
        assert self.render_template(self.NO_VARIABLE) == self.NO_VARIABLE

    def render_single_variable(self):
        """Test that fields on the object are available to templates"""
        self.a_field = "world"
        assert self.render_template(self.SIMPLE_VARIABLE) == "Hello world!"

    def render_overload(self):
        self.normal = "normal"
        assert self.render_template(self.OVERLOAD_VARIABLES, overload="overloaded") == "normal overloaded"

    def render_class_template(self):
        assert self.render_template(self.CLASS_CONSTANT_TEMPLATE) == self.CLASS_CONSTANT
        self.CLASS_CONSTANT = "instance override"
        assert self.render_template(self.CLASS_CONSTANT_TEMPLATE) == "instance override"

    def render_file_template(self):
        self.a_field = "world"
        assert "Sample world" == self.render("sample")


================================================
FILE: tests/templates/service/templates/sample
================================================
Sample {{a_field}}

================================================
FILE: tests/templates/test/__init__.py
================================================


================================================
FILE: tests/templates/test/check_render.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests import Test
from ducktape.tests import TestContext
from ducktape.template import TemplateRenderer

from tests.ducktape_mock import session_context

import os
import tempfile


class CheckTemplateRenderingTest(object):
    """
    Minimal test to verify template rendering functionality
    """

    def setup(self):
        dir = tempfile.gettempdir()
        session_ctx = session_context(results_dir=dir)
        test_ctx = TestContext(session_context=session_ctx)
        return TemplateRenderingTest(test_ctx)

    def check_string_template(self):
        test = self.setup()
        test.other = "goodbye"
        result = test.render_template("Hello {{name}} and {{other}}", name="World")
        assert "Hello World and goodbye" == result

    def check_file_template(self):
        test = self.setup()
        test.name = "world"
        assert "Sample world" == test.render("sample")


class CheckPackageSearchPath(object):
    """
    Simple check on extracting package and search path based on module name.
    """

    def check_package_search_path(self):
        package, path = TemplateRenderer._package_search_path("a.b.c")
        # search path should be b/templates since templates is by convention a sibling of c
        assert package == "a" and path == os.path.join("b", "templates")

        package, path = TemplateRenderer._package_search_path("hi")
        assert package == "hi" and path == "templates"

        package, path = TemplateRenderer._package_search_path("")
        assert package == "" and path == "templates"

    def check_get_ctx(self):
        class A(TemplateRenderer):
            x = "xxx"

        class B(A):
            y = "yyy"

        b = B()
        b.instance = "b instance"

        ctx_a = A()._get_ctx()
        assert ctx_a["x"] == "xxx"
        assert "yyy" not in ctx_a
        assert "instance" not in ctx_a

        ctx_b = b._get_ctx()
        assert ctx_b["x"] == "xxx"
        assert ctx_b["y"] == "yyy"
        assert ctx_b["instance"] == "b instance"


class TemplateRenderingTest(Test):
    pass


================================================
FILE: tests/templates/test/templates/sample
================================================
Sample {{name}}

================================================
FILE: tests/test_utils.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import socket


def find_available_port(min_port=8000, max_port=9000):
    """Return first available port in the range [min_port, max_port], inclusive.

    Note that this actually isn't a 100% reliable way of getting a port, but it's probably good enough -- once
    you close a socket you cannot be sure of its availability. This was the source of a bunch of issues in
    the Apache Kafka unit tests.
    """
    for p in range(min_port, max_port + 1):
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.bind(("localhost", p))
            s.close()
            return p
        except socket.error:
            pass

    raise Exception("No available port found in range [%d, %d]" % (min_port, max_port))


================================================
FILE: tests/tests/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/tests/check_session.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.session import generate_results_dir, SessionContext
from ducktape.cluster.localhost import LocalhostCluster

import os.path
import pickle
import shutil
import tempfile


class CheckGenerateResultsDir(object):
    def setup_method(self, _):
        self.tempdir = tempfile.mkdtemp()

    def check_generate_results_root(self):
        """Check the generated results directory has the specified path as its root"""
        results_root = os.path.abspath(tempfile.mkdtemp())
        results_dir = generate_results_dir(results_root, "my_session_id")
        assert results_dir.find(results_root) == 0

    def check_pickleable(self):
        """Check that session_context object is pickleable
        This is necessary so that SessionContext objects can be shared between processes when using python
        multiprocessing module.
        """

        kwargs = {
            "session_id": "hello-123",
            "results_dir": self.tempdir,
            "cluster": LocalhostCluster(),
            "globals": {},
        }
        session_context = SessionContext(**kwargs)
        pickle.dumps(session_context)

    def teardown_method(self, _):
        if os.path.exists(self.tempdir):
            shutil.rmtree(self.tempdir)


================================================
FILE: tests/tests/check_test.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import random
import shutil
import sys
import tempfile

from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.tests.test import Test, _compress_cmd, in_dir, in_temp_dir
from ducktape.tests.test_context import _escape_pathname, TestContext
from tests import ducktape_mock


class DummyTest(Test):
    """class description"""

    def test_class_description(self):
        pass

    def test_function_description(self):
        """function description"""
        pass


class DummyTestNoDescription(Test):
    def test_this(self):
        pass


class CheckLifecycle(object):
    def check_test_context_double_close(self):
        context = TestContext(
            session_context=ducktape_mock.session_context(),
            cls=DummyTest,
            function=DummyTest.test_function_description,
        )
        context.close()
        context.close()
        assert not hasattr(context, "services")

    def check_cluster_property(self):
        exp_cluster = ClusterSpec.simple_linux(5)
        tc = TestContext(
            session_context=ducktape_mock.session_context(),
            cluster=exp_cluster,
            cls=DummyTest,
            function=DummyTest.test_function_description,
        )
        test_obj = tc.cls(tc)
        assert test_obj.cluster == exp_cluster


class CheckEscapePathname(object):
    def check_illegal_path(self):
        path = "\\/.a=2,   b=x/y/z"
        assert _escape_pathname(path) == "a=2.b=x.y.z"

    def check_negative(self):
        # it's better if negative numbers are preserved
        path = "x= -2, y=-50"
        assert _escape_pathname(path) == "x=-2.y=-50"

    def check_many_dots(self):
        path = "..a.....b.c...d."
        assert _escape_pathname(path) == "a.b.c.d"


class CheckDescription(object):
    """Check that pulling a description from a test works as expected."""

    def check_from_function(self):
        """If the function has a docstring, the description should come from the function"""
        context = TestContext(
            session_context=ducktape_mock.session_context(),
            cls=DummyTest,
            function=DummyTest.test_function_description,
        )
        assert context.description == "function description"

    def check_from_class(self):
        """If the test method has no docstring, description should come from the class docstring"""
        context = TestContext(
            session_context=ducktape_mock.session_context(),
            cls=DummyTest,
            function=DummyTest.test_class_description,
        )
        assert context.description == "class description"

    def check_no_description(self):
        """If nobody has a docstring, there shouldn't be an error, and description should be empty string"""
        context = TestContext(
            session_context=ducktape_mock.session_context(),
            cls=DummyTestNoDescription,
            function=DummyTestNoDescription.test_this,
        )
        assert context.description == ""


class CheckCompressCmd(object):
    """Check expected behavior of compress command used before collecting service logs"""

    def setup_method(self, _):
        self.tempdir = tempfile.mkdtemp()

    def _make_random_file(self, dir, num_chars=10000):
        """Populate filename with random characters."""
        filename = os.path.join(dir, "f-%d" % random.randint(1, 2**63 - 1))
        content = "".join([random.choice("0123456789abcdefghijklmnopqrstuvwxyz\n") for _ in range(num_chars)])
        with open(filename, "w") as f:
            f.writelines(content)
        return filename

    def _make_files(self, dir, num_files=10):
        """Populate dir with several files with random characters."""
        for i in range(num_files):
            self._make_random_file(dir)

    def _validate_compressed(self, uncompressed_path):
        if uncompressed_path.endswith(os.path.sep):
            uncompressed_path = uncompressed_path[: -len(os.path.sep)]

        compressed_path = uncompressed_path + ".tgz"

        # verify original file is replaced by filename.tgz
        assert os.path.exists(compressed_path)
        assert not os.path.exists(uncompressed_path)

        # verify that uncompressing gets us back the original
        with in_dir(self.tempdir):
            os.system("tar xzf %s" % (compressed_path))
            assert os.path.exists(uncompressed_path)

    def check_compress_service_logs_swallow_error(self):
        """Try compressing a non-existent service log, and check that it logs a message without throwing an error."""
        from tests.ducktape_mock import session_context

        tc = TestContext(
            session_context=session_context(),
            module=sys.modules[DummyTestNoDescription.__module__],
            cls=DummyTestNoDescription,
            function=DummyTestNoDescription.test_this,
        )

        tc._logger = logging.getLogger(__name__)
        temp_log_file = tempfile.NamedTemporaryFile(delete=False).name

        try:
            tmp_log_handler = logging.FileHandler(temp_log_file)
            tc._logger.addHandler(tmp_log_handler)

            test_obj = tc.cls(tc)

            # Expect an error to be triggered but swallowed
            test_obj.compress_service_logs(node=None, service=None, node_logs=["hi"])

            tmp_log_handler.close()
            with open(temp_log_file, "r") as f:
                s = f.read()
                assert s.find("Error compressing log hi") >= 0
        finally:
            if os.path.exists(temp_log_file):
                os.remove(temp_log_file)

    def check_abs_path_file(self):
        """Check compress command on an absolute path to a file"""
        with in_temp_dir() as dir1:
            filename = self._make_random_file(self.tempdir)
            abspath_filename = os.path.abspath(filename)

            # since we're using absolute path to file, we should be able to run the compress command from anywhere
            with in_temp_dir() as dir2:
                assert dir1 != dir2

                os.system(_compress_cmd(abspath_filename))
                self._validate_compressed(abspath_filename)

    def check_relative_path_file(self):
        """Check compress command on a relative path to a file"""
        with in_temp_dir():
            filename = self._make_random_file(self.tempdir)
            with in_dir(self.tempdir):
                filename = os.path.basename(filename)
                assert len(filename.split(os.path.sep)) == 1

                # compress it!
                os.system(_compress_cmd(filename))
                self._validate_compressed(filename)

    def check_abs_path_dir(self):
        """Validate compress command with absolute path to a directory"""
        dirname = tempfile.mkdtemp(dir=self.tempdir)
        dirname = os.path.abspath(dirname)
        self._make_files(dirname)

        # compress it!
        if not dirname.endswith(os.path.sep):
            # extra check - ensure all of this works with trailing '/'
            dirname += os.path.sep

        os.system(_compress_cmd(dirname))
        self._validate_compressed(dirname)

    def check_relative_path_dir(self):
        """Validate tarball compression of a directory"""
        dirname = tempfile.mkdtemp(dir=self.tempdir)
        self._make_files(dirname)

        dirname = os.path.basename(dirname)
        assert len(dirname.split(os.path.sep)) == 1

        # compress it!
        with in_dir(self.tempdir):
            os.system(_compress_cmd(dirname))
            self._validate_compressed(dirname)

    def teardown_method(self, _):
        if os.path.exists(self.tempdir):
            shutil.rmtree(self.tempdir)


================================================
FILE: tests/tests/check_test_context.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ducktape.tests.test import Test
from ducktape.services.service import Service
from ducktape.mark import parametrize
from ducktape.mark.mark_expander import MarkedFunctionExpander

from tests.ducktape_mock import session_context

from mock import MagicMock


class CheckTestContext(object):
    def check_copy_constructor(self):
        """Regression test against a bug introduced in 0.3.7
        The TestContext copy constructor was copying the ServiceRegistry object by reference.
        As a result, services registering themselves with one test context would be registered with the copied
        context as well, resulting in the length of the service registry to grow additively from test to test.

        This problem cropped up in particular with parametrized tests.
        """
        expander = MarkedFunctionExpander(
            session_context=session_context(),
            cls=DummyTest,
            function=DummyTest.test_me,
            cluster=MagicMock(),
        )
        ctx_list = expander.expand()

        for ctx in ctx_list:
            # Constructing an instance of the test class causes a service to be registered with the test context
            ctx.cls(ctx)

        # Ensure that each context.services object is a unique reference
        assert len(set(id(ctx.services) for ctx in ctx_list)) == len(ctx_list)


class DummyTest(Test):
    def __init__(self, test_context):
        super(DummyTest, self).__init__(test_context)
        self.service = DummyService(test_context)

    @parametrize(x=1)
    @parametrize(x=2)
    def test_me(self):
        pass


class DummyService(Service):
    def __init__(self, context):
        super(DummyService, self).__init__(context, 1)


================================================
FILE: tests/utils/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: tests/utils/check_util.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from ducktape.errors import TimeoutError
from ducktape.utils.util import wait_until
import time


class CheckUtils(object):
    def check_wait_until(self):
        """Check normal wait until behavior"""
        start = time.time()

        wait_until(lambda: time.time() > start + 0.5, timeout_sec=2, backoff_sec=0.1)

    def check_wait_until_timeout(self):
        """Check that timeout throws exception"""
        start = time.time()

        with pytest.raises(TimeoutError, match="Hello world"):
            wait_until(
                lambda: time.time() > start + 5,
                timeout_sec=0.5,
                backoff_sec=0.1,
                err_msg="Hello world",
            )

    def check_wait_until_timeout_callable_msg(self):
        """Check that timeout throws exception and the error message is generated via a callable"""
        start = time.time()

        with pytest.raises(TimeoutError, match="Hello world"):
            wait_until(
                lambda: time.time() > start + 5,
                timeout_sec=0.5,
                backoff_sec=0.1,
                err_msg=lambda: "Hello world",
            )

    def check_wait_until_with_exception(self):
        def condition_that_raises():
            raise Exception("OG")

        with pytest.raises(TimeoutError) as exc_info:
            wait_until(
                condition_that_raises,
                timeout_sec=0.5,
                backoff_sec=0.1,
                err_msg="Hello world",
                retry_on_exc=True,
            )
        exc_chain = exc_info.getrepr(chain=True).chain
        # 2 exceptions in the chain - OG and Hello world
        assert len(exc_chain) == 2

        # each element of a chain is a tuple of traceback, "crash" (which is the short message)
        # and optionally descr, which is None for the bottom of the chain
        # and "The exception above is ..." for all the others
        # We're interested in crash, since that one is a one-liner with the actual exception message.
        og_message = str(exc_chain[0][1])
        hello_message = str(exc_chain[1][1])
        assert "OG" in og_message
        assert "Hello world" in hello_message

    def check_wait_until_with_exception_on_first_step_only_but_still_fails(self):
        start = time.time()

        def condition_that_raises_before_3():
            if time.time() < start + 0.3:
                raise Exception("OG")
            else:
                return False

        with pytest.raises(TimeoutError) as exc_info:
            wait_until(
                condition_that_raises_before_3,
                timeout_sec=0.5,
                backoff_sec=0.1,
                err_msg="Hello world",
                retry_on_exc=True,
            )
        exc_chain = exc_info.getrepr(chain=True).chain
        assert len(exc_chain) == 1
        hello_message = str(exc_chain[0][1])
        assert "Hello world" in hello_message

    def check_wait_until_exception_which_succeeds_eventually(self):
        start = time.time()

        def condition_that_raises_before_3_but_then_succeeds():
            if time.time() < start + 0.3:
                raise Exception("OG")
            else:
                return True

        wait_until(
            condition_that_raises_before_3_but_then_succeeds,
            timeout_sec=0.5,
            backoff_sec=0.1,
            err_msg="Hello world",
            retry_on_exc=True,
        )

    def check_wait_until_breaks_early_on_exception(self):
        def condition_that_raises():
            raise Exception("OG")

        with pytest.raises(Exception, match="OG") as exc_info:
            wait_until(
                condition_that_raises,
                timeout_sec=0.5,
                backoff_sec=0.1,
                err_msg="Hello world",
            )
        assert "Hello world" not in str(exc_info)


================================================
FILE: tox.ini
================================================
[tox]
envlist = py38, py39, py310, py311, py312, py313, cover, style, docs

[testenv]
# Consolidate all deps here instead of separately in test/style/cover so we
# have a single env to work with, which makes debugging easier (like which env?).
# Not as clean but easier to work with during development, which is better.
deps =
    -r requirements.txt
    -r requirements-test.txt
install_command =
    pip install -U {packages}
recreate = False
skipsdist = True
usedevelop = True
setenv =
    PIP_PROCESS_DEPENDENCY_LINKS=1
    PIP_DEFAULT_TIMEOUT=60
    ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future
envdir = {package_root}/.virtualenvs/ducktape_{envname}
commands =
    pytest {env:PYTESTARGS:} {posargs}

[testenv:py38]
envdir = {package_root}/.virtualenvs/ducktape-py38

[testenv:py39]
envdir = {package_root}/.virtualenvs/ducktape-py39

[testenv:py310]
envdir = {package_root}/.virtualenvs/ducktape-py310

[testenv:py311]
envdir = {package_root}/.virtualenvs/ducktape-py311

[testenv:py312]
envdir = {package_root}/.virtualenvs/ducktape-py312

[testenv:py313]
envdir = {package_root}/.virtualenvs/ducktape-py313

[testenv:style]
basepython = python3.13
envdir = {package_root}/.virtualenvs/ducktape
commands =
    ruff format --check
    ruff check

[testenv:ruff]
basepython = python3.13
envdir = {package_root}/.virtualenvs/ducktape
commands =
    ruff {posargs}

[testenv:cover]
basepython = python3.13
envdir = {package_root}/.virtualenvs/ducktape
commands =
    pytest {env:PYTESTARGS:} --cov ducktape --cov-report=xml --cov-report=html --cov-report=term --cov-report=annotate:textcov \
                             --cov-fail-under=70

[testenv:docs]
basepython = python3.13
deps =
    -r {toxinidir}/docs/requirements.txt
changedir = {toxinidir}/docs
commands = sphinx-build -M {env:SPHINX_BUILDER:html} . _build  {posargs}