Repository: confluentinc/ducktape Branch: master Commit: 7b1e2132b337 Files: 193 Total size: 650.7 KB Directory structure: gitextract_muj9xuou/ ├── .coveragerc ├── .dockerignore ├── .github/ │ └── ISSUE_TEMPLATE/ │ └── bug_report.md ├── .gitignore ├── .readthedocs.yaml ├── .semaphore/ │ └── semaphore.yml ├── CODEOWNERS ├── Dockerfile ├── Jenkinsfile.disabled ├── README.md ├── Vagrantfile ├── docs/ │ ├── Makefile │ ├── README.md │ ├── _static/ │ │ └── theme_overrides.css │ ├── api/ │ │ ├── clusters.rst │ │ ├── remoteaccount.rst │ │ ├── services.rst │ │ ├── templates.rst │ │ └── test.rst │ ├── api.rst │ ├── changelog.rst │ ├── conf.py │ ├── debug_tests.rst │ ├── index.rst │ ├── install.rst │ ├── make.bat │ ├── misc.rst │ ├── new_services.rst │ ├── new_tests.rst │ ├── requirements.txt │ ├── run_tests.rst │ └── test_clusters.rst ├── ducktape/ │ ├── __init__.py │ ├── __main__.py │ ├── cluster/ │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── cluster_node.py │ │ ├── cluster_spec.py │ │ ├── consts.py │ │ ├── finite_subcluster.py │ │ ├── json.py │ │ ├── linux_remoteaccount.py │ │ ├── localhost.py │ │ ├── node_container.py │ │ ├── node_spec.py │ │ ├── remoteaccount.py │ │ ├── vagrant.py │ │ └── windows_remoteaccount.py │ ├── command_line/ │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── main.py │ │ └── parse_args.py │ ├── errors.py │ ├── json_serializable.py │ ├── jvm_logging.py │ ├── mark/ │ │ ├── __init__.py │ │ ├── _mark.py │ │ ├── consts.py │ │ ├── mark_expander.py │ │ └── resource.py │ ├── services/ │ │ ├── __init__.py │ │ ├── background_thread.py │ │ ├── service.py │ │ └── service_registry.py │ ├── template.py │ ├── templates/ │ │ └── report/ │ │ ├── report.css │ │ └── report.html │ ├── tests/ │ │ ├── __init__.py │ │ ├── event.py │ │ ├── loader.py │ │ ├── loggermaker.py │ │ ├── reporter.py │ │ ├── result.py │ │ ├── runner.py │ │ ├── runner_client.py │ │ ├── scheduler.py │ │ ├── serde.py │ │ ├── session.py │ │ ├── status.py │ │ ├── test.py │ │ └── test_context.py │ └── 
utils/ │ ├── __init__.py │ ├── http_utils.py │ ├── local_filesystem_utils.py │ ├── persistence.py │ ├── terminal_size.py │ └── util.py ├── requirements-test.txt ├── requirements.txt ├── ruff.toml ├── service.yml ├── setup.cfg ├── setup.py ├── systests/ │ ├── __init__.py │ └── cluster/ │ ├── __init__.py │ ├── test_debug.py │ ├── test_no_cluster.py │ ├── test_remote_account.py │ └── test_runner_operations.py ├── tests/ │ ├── __init__.py │ ├── cluster/ │ │ ├── __init__.py │ │ ├── check_cluster.py │ │ ├── check_cluster_spec.py │ │ ├── check_finite_subcluster.py │ │ ├── check_json.py │ │ ├── check_localhost.py │ │ ├── check_node_container.py │ │ ├── check_remoteaccount.py │ │ └── check_vagrant.py │ ├── command_line/ │ │ ├── __init__.py │ │ ├── check_main.py │ │ └── check_parse_args.py │ ├── ducktape_mock.py │ ├── loader/ │ │ ├── __init__.py │ │ ├── check_loader.py │ │ └── resources/ │ │ ├── __init__.py │ │ ├── loader_test_directory/ │ │ │ ├── README │ │ │ ├── __init__.py │ │ │ ├── invalid_test_suites/ │ │ │ │ ├── empty_file.yml │ │ │ │ ├── malformed_test_suite.yml │ │ │ │ ├── not_yaml.yml │ │ │ │ ├── test_suite_refers_to_non_existent_file.yml │ │ │ │ ├── test_suite_with_malformed_params.yml │ │ │ │ └── test_suites_with_no_tests.yml │ │ │ ├── name_does_not_match_pattern.py │ │ │ ├── sub_dir_a/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_c.py │ │ │ │ └── test_d.py │ │ │ ├── sub_dir_no_tests/ │ │ │ │ ├── __init__.py │ │ │ │ └── just_some_file.py │ │ │ ├── test_a.py │ │ │ ├── test_b.py │ │ │ ├── test_decorated.py │ │ │ ├── test_suite_cyclic_a.yml │ │ │ ├── test_suite_cyclic_b.yml │ │ │ ├── test_suite_decorated.yml │ │ │ ├── test_suite_import_malformed.yml │ │ │ ├── test_suite_import_py.yml │ │ │ ├── test_suite_malformed.yml │ │ │ ├── test_suite_multiple.yml │ │ │ ├── test_suite_single.yml │ │ │ ├── test_suite_with_self_import.yml │ │ │ ├── test_suite_with_single_import.yml │ │ │ └── test_suites/ │ │ │ ├── refers_to_parent_dir.yml │ │ │ ├── sub_dir_a_test_c.yml │ │ │ ├── 
sub_dir_a_test_c_via_class.yml │ │ │ ├── sub_dir_a_with_exclude.yml │ │ │ ├── sub_dir_test_import.yml │ │ │ └── test_suite_glob.yml │ │ └── report.json │ ├── logger/ │ │ ├── __init__.py │ │ └── check_logger.py │ ├── mark/ │ │ ├── __init__.py │ │ ├── check_cluster_use_metadata.py │ │ ├── check_env.py │ │ ├── check_ignore.py │ │ ├── check_parametrize.py │ │ └── resources/ │ │ └── __init__.py │ ├── reporter/ │ │ └── check_symbol_reporter.py │ ├── runner/ │ │ ├── __init__.py │ │ ├── check_runner.py │ │ ├── check_runner_memory.py │ │ ├── check_sender_receiver.py │ │ ├── fake_remote_account.py │ │ └── resources/ │ │ ├── __init__.py │ │ ├── test_bad_actor.py │ │ ├── test_failing_tests.py │ │ ├── test_fails_to_init.py │ │ ├── test_fails_to_init_in_setup.py │ │ ├── test_memory_leak.py │ │ ├── test_thingy.py │ │ └── test_various_num_nodes.py │ ├── scheduler/ │ │ ├── __init__.py │ │ └── check_scheduler.py │ ├── services/ │ │ ├── __init__.py │ │ ├── check_background_thread_service.py │ │ ├── check_jvm_logging.py │ │ └── check_service.py │ ├── templates/ │ │ ├── __init__.py │ │ ├── service/ │ │ │ ├── __init__.py │ │ │ ├── check_render.py │ │ │ └── templates/ │ │ │ └── sample │ │ └── test/ │ │ ├── __init__.py │ │ ├── check_render.py │ │ └── templates/ │ │ └── sample │ ├── test_utils.py │ ├── tests/ │ │ ├── __init__.py │ │ ├── check_session.py │ │ ├── check_test.py │ │ └── check_test_context.py │ └── utils/ │ ├── __init__.py │ └── check_util.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] omit = .git/* .tox/* docs/* setup.py test/* tests/* ================================================ FILE: .dockerignore ================================================ * ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md 
================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- @confluentinc/devprod-apac-n-frameworks-eng is tagged for visibility **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior **Expected behavior** A clear and concise description of what you expected to happen. **Additional context** Add any other context about the problem here. ================================================ FILE: .gitignore ================================================ # Duckape systests/.ducktape/ systests/results/ results .ducktape .vagrant # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ textcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ .virtualenvs/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .idea /.vagrant/ .vscode ================================================ FILE: .readthedocs.yaml ================================================ # .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 build: os: ubuntu-20.04 tools: python: "3.13" sphinx: configuration: docs/conf.py python: install: - requirements: docs/requirements.txt - method: setuptools path: . 
================================================ FILE: .semaphore/semaphore.yml ================================================ version: v1.0 name: pr-test-job agent: machine: type: s1-prod-ubuntu24-04-amd64-1 execution_time_limit: hours: 1 global_job_config: prologue: commands: - checkout blocks: - name: Test dependencies: [] task: jobs: - name: Test Python 3.8 commands: - sem-version python 3.8 - pip install tox - export PYTESTARGS='--junitxml=test/results-py38.xml' - tox -e py38 - name: Test Python 3.9 commands: - sem-version python 3.9 - pip install tox - export PYTESTARGS='--junitxml=test/results-py39.xml' - tox -e py39 - name: Test Python 3.10 commands: - sem-version python 3.10 - pip install tox - export PYTESTARGS='--junitxml=test/results-py310.xml' - tox -e py310 - name: Test Python 3.11 commands: - sem-version python 3.11 - pip install tox - export PYTESTARGS='--junitxml=test/results-py311.xml' - tox -e py311 - name: Test Python 3.12 commands: - sem-version python 3.12 - pip install tox - export PYTESTARGS='--junitxml=test/results-py312.xml' - tox -e py312 - name: Test Python 3.13 commands: - sem-version python 3.13 - pip install tox - export PYTESTARGS='--junitxml=test/results-py313.xml' - tox ================================================ FILE: CODEOWNERS ================================================ * @confluentinc/cp-test-frameworks-and-readiness ================================================ FILE: Dockerfile ================================================ # An image of ducktape that can be used to setup a Docker cluster where ducktape is run inside the container. FROM ubuntu:14.04 RUN apt-get update && \ apt-get install -y libffi-dev libssl-dev openssh-server python-dev python-pip python-virtualenv && \ virtualenv /opt/ducktape && \ . 
/opt/ducktape/bin/activate && \ pip install -U pip setuptools wheel && \ pip install bcrypt cryptography==2.2.2 pynacl && \ mkdir /var/run/sshd && \ mkdir /root/.ssh && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ARG DUCKTAPE_VERSION=0.7.3 RUN . /opt/ducktape/bin/activate && \ pip install ducktape==$DUCKTAPE_VERSION && \ ln -s /opt/ducktape/bin/ducktape /usr/local/bin/ducktape && \ deactivate && \ /usr/local/bin/ducktape --version EXPOSE 22 CMD ["/usr/sbin/sshd", "-D"] ================================================ FILE: Jenkinsfile.disabled ================================================ python { publish = false // Release is done manually to PyPI and not supported yet. } ================================================ FILE: README.md ================================================ [![Documentation Status](https://readthedocs.org/projects/ducktape/badge/?version=latest)](https://ducktape.readthedocs.io/en/latest/?badge=latest) Distributed System Integration & Performance Testing Library ============================================================ Overview -------- Ducktape contains tools for running system integration and performance tests. It provides the following features: * Isolation by default so system tests are as reliable as possible. * Utilities for pulling up and tearing down services easily in clusters in different environments (e.g. local, custom cluster, Vagrant, K8s, Mesos, Docker, cloud providers, etc.) * Easy to write unit tests for distributed systems * Trigger special events (e.g. bouncing a service) * Collect results (e.g. logs, console output) * Report results (e.g. expected conditions met, performance results, etc.) 
Documentation ------------- For detailed documentation on how to install, run, create new tests please refer to: http://ducktape.readthedocs.io/ Contribute ---------- - Source Code: https://github.com/confluentinc/ducktape - Issue Tracker: https://github.com/confluentinc/ducktape/issues License ------- The project is licensed under the Apache 2 license. ================================================ FILE: Vagrantfile ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # -*- mode: ruby -*- # vi: set ft=ruby : require 'socket' # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! VAGRANTFILE_API_VERSION = "2" # General config enable_dns = false num_workers = 3 ram_megabytes = 300 base_box = "ubuntu/focal64" local_config_file = File.join(File.dirname(__FILE__), "Vagrantfile.local") if File.exists?(local_config_file) then eval(File.read(local_config_file), binding, "Vagrantfile.local") end Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.hostmanager.enabled = true config.hostmanager.manage_host = enable_dns config.hostmanager.include_offline = false ## Provider-specific global configs config.vm.provider :virtualbox do |vb,override| override.vm.box = base_box override.hostmanager.ignore_private_ip = false # Brokers started with the standard script currently set Xms and Xmx to 1G, # plus we need some extra head room. 
vb.customize ["modifyvm", :id, "--memory", ram_megabytes.to_s] end ## Cluster definition (1..num_workers).each { |i| name = "ducktape" + i.to_s config.vm.define name do |worker| worker.vm.hostname = name worker.vm.network :private_network, ip: "192.168.56." + (150 + i).to_s end } end ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = ducktape SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/README.md ================================================ Ducktape documentation quick start guide ======================================== Build the documentation ----------------------- To render the pages run:: ```shell tox -e docs ``` The rendered pages will be in ``docs/_build/html`` Specify documentation format ---------------------------- Documentation is built using [sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command. 
You can select which builder to use using SPHINX_BUILDER command: ```shell SPHINX_BUILDER=man tox -e docs ``` All available values: https://www.sphinx-doc.org/en/master/man/sphinx-build.html#cmdoption-sphinx-build-M Pass options to sphinx-build ---------------------------- Any argument after `--` will be passed to the [sphinx-build](https://www.sphinx-doc.org/en/master/man/sphinx-build.html) command directly: ```shell tox -e docs -- -E ``` ================================================ FILE: docs/_static/theme_overrides.css ================================================ /* override table width restrictions */ @media screen and (min-width: 767px) { .wy-table-responsive table td { /* !important prevents the common CSS stylesheets from overriding this as on RTD they are loaded after this stylesheet */ white-space: normal !important; } .wy-table-responsive { overflow: visible !important; } } ================================================ FILE: docs/api/clusters.rst ================================================ Clusters ======== .. autoclass:: ducktape.cluster.cluster.Cluster :members: .. autoclass:: ducktape.cluster.vagrant.VagrantCluster :members: .. autoclass:: ducktape.cluster.localhost.LocalhostCluster :members: .. autoclass:: ducktape.cluster.json.JsonCluster :members: ================================================ FILE: docs/api/remoteaccount.rst ================================================ Remote Account ============== .. autoclass:: ducktape.cluster.remoteaccount.RemoteAccount :members: .. autoclass:: ducktape.cluster.remoteaccount.LogMonitor :members: .. autoclass:: ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount :members: .. autoclass:: ducktape.cluster.windows_remoteaccount.WindowsRemoteAccount :members: ================================================ FILE: docs/api/services.rst ================================================ Services ======== .. autoclass:: ducktape.services.service.Service :members: .. 
autoclass:: ducktape.services.background_thread.BackgroundThreadService :members: ================================================ FILE: docs/api/templates.rst ================================================ Template ======== .. autoclass:: ducktape.template.TemplateRenderer :members: ================================================ FILE: docs/api/test.rst ================================================ Test ==== .. autoclass:: ducktape.tests.test.Test :members: .. autoclass:: ducktape.tests.test.TestContext :members: ================================================ FILE: docs/api.rst ================================================ .. _topics-api: ======= API Doc ======= .. toctree:: api/test api/services api/remoteaccount api/clusters api/templates ================================================ FILE: docs/changelog.rst ================================================ .. _topics-changelog: ==== Changelog ==== 0.14.0 ====== Tuesday, March 10th, 2026 ------------------------- - Ensure log collection in case of runner client unresponsive issue - Enable jvm logging for java based services - Graceful shutdown and reporting fix for runner client unresponsive issue - Add heterogeneous cluster support - Add types to ducktape - Call JUnitReporter after each test completes - Update dependency requests to v2.32.4 [security] 0.13.0 ====== Monday, June 09th, 2025 ----------------------- - Report expected test count in the summary - Add python 3.13 support and run pr test job with all supported python versions - Upgrade PyYAML and fix style error - Add support for historical report in loader - Update dependency requests to v2.32.2 - Update dependency pycryptodome to v3.19.1 - Removing generated internal project.yml, public project.yml 0.12.0 ====== Friday, October 04th, 2024 -------------------------- - Store summary of previous runs when deflaking - Runner Client Minor Refactor and Test - Adding nodes used in test summary for HTML report - Parse args from config files 
independently - add support to python 3.10, 3.11 and 3.12 0.11.4 ====== Friday, August 18th, 2023 ------------------------- - Updated `requests` version to 2.31.0 0.11.3 ====== Wednesday, November 30th, 2022 ------------------------------ - Bugfix: fixed an edge case when BackgroundThread wait() method errors out if start() method has never been called. 0.11.2 ====== Wednesday, November 30th, 2022 ------------------------------ - Bugfix: fixed an edge case when BackgroundThread wait() method errors out if start() method has never been called. 0.11.1 ====== - Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`. - Bumped `jinja2` to `3.0.x` 0.11.0 ====== - Option to fail tests without `@cluster` annotation. Deprecate ``min_cluster_spec()`` method in the ``Test`` class - `#336 `_ 0.10.3 ====== Friday, August 18th, 2023 ------------------------- - Updated `requests` version to 2.31.0 0.10.2 ====== - Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`. 0.10.1 ====== - Disable health checks for nodes, effectively disabling `#325 `_. See github issue for details - `#339 `_ 0.10.0 ====== - **DO NOT USE**, this release has a nasty bug - `#339 `_ - Do not schedule tests on unresponsive nodes - `#325 `_ 0.9.4 ===== Friday, August 18th, 2023 ------------------------- - Updated `requests` version to 2.31.0 0.9.3 ===== - Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`. 0.9.2 ===== - Service release, no ducktape changes, simply fixed readthedocs configs. 
0.9.1 ===== - use a generic network device based on the devices found on the remote machine rather than a hardcoded one - `#314 `_ and `#328 `_ - clean up process properly after an exception during test runner execution - `#323 `_ - log ssh errors - `#319 `_ - update vagrant tests to use ubuntu20 - `#328 `_ - added command to print the total number of nodes the tests run will require - `#320 `_ - drop support for python 3.6 and add support for python 3.9 - `#317 `_ 0.9.0 ===== - Upgrade paramiko version to 2.10.0 - `#312 `_ - Support SSH timeout - `#311 `_ 0.8.18 ====== - Updated `requests` version to `2.31.0` 0.8.17 ====== - Removed `tox` from requirements. It was not used, but was breaking our builds due to recent pushes to `virtualenv`. 0.8.x ===== - Support test suites - Easier way to rerun failed tests - generate test suite with all the failed tests and also print them in the log so that user can copy them and paste as ducktape command line arguments - Python 2 is no longer supported, minimum supported version is 3.6 - Added `--deflake N` flag - if provided, it will attempt to rerun each failed test up to N times, and if it eventually passes, it will be marked as Flaky - `#299 `_ - [backport, also in 0.9.1] - use a generic network device based on the devices found on the remote machine rather than a hardcoded one - `#314 `_ and `#328 `_ - [backport, also in 0.9.1] - clean up process properly after an exception during test runner execution - `#323 `_ - [backport, also in 0.9.1] - log ssh errors - `#319 `_ - [backport, also in 0.9.1] - update vagrant tests to use ubuntu20 - `#328 `_ - [backport, also in 0.9.1] - added command to print the total number of nodes the tests run will require - `#320 `_ ================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # ducktape documentation build configuration file, created by # sphinx-quickstart on Mon Mar 13 14:06:07 2017. 
# # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys import sphinx_rtd_theme from ducktape import __version__ sys.path.insert(0, os.path.abspath("..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ["sphinx.ext.viewcode", "sphinx.ext.autodoc", "sphinxarg.ext"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = ".rst" # The master toctree document. master_doc = "index" # General information about the project. project = "Ducktape" copyright = "2017, Confluent Inc." author = "Confluent Inc." # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = __version__ # The full version, including alpha/beta/rc tags. release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. 
# Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = "ducktapedoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. 
List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, "ducktape.tex", "Ducktape Documentation", "Confluent Inc.", "manual"), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, "ducktape", "Ducktape Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, "ducktape", "Ducktape Documentation", author, "ducktape", "One line description of project.", "Miscellaneous", ), ] # ---- Options for Autodoc ------------------------------------------------ autodoc_default_flags = ["show-inheritance"] def skip(app, what, name, obj, skip, options): if name == "__init__": return False return skip def setup(app): app.connect("autodoc-skip-member", skip) ================================================ FILE: docs/debug_tests.rst ================================================ .. _topics-debug_tests: =========== Debug Tests =========== The test results go in ``results/``. For results from a particular test, look for ``results//test_class_name//`` directory. The ``test_log.debug`` file will contain the log output from the python driver, and logs of services used in the test will be in ``service_name/node_name`` sub-directory. If there is not enough information in the logs, you can re-run the test with ``--no-teardown`` argument. .. code-block:: bash ducktape dir/tests/my_test.py::TestA.test_a --no-teardown This will run the test but will not kill any running processes or remove log files when the test finishes running. 
Then, you can examine the state of a running service or the machine when the service process is running by logging into that machine. Suppose you suspect a particular service being the cause of the test failure. You can find out which machine was allocated to that service by either looking at ``test_log.debug`` or at directory names under ``results//test_class_name//service_name/``. It could be useful to add an explicit debug log to ``start_node`` method with a node ID and node’s hostname information for easy debugging: .. code-block:: python def start_node(self, node): idx = self.idx(node) self.logger.info("Starting ZK node %d on %s", idx, node.account.hostname) The log statement will look something like this:: [INFO - 2017-03-28 22:07:25,222 - zookeeper - start_node - lineno:50]: Starting ZK node 1 on worker1 If you are using Vagrant for example, you can then log into that node via: .. code-block:: bash vagrant ssh worker1 Use Logging =========== Distributed system tests can be difficult to debug. You want to add a lot of logging for debugging and tracking progress of the test. A good approach would be to log an intention of an operation with some useful information before any operation that can fail. It could be a good idea to use a higher logging level than you would in production so more info is available. For example, make your log levels default to DEBUG instead of INFO. Also, put enough information to a message of ``assert`` to help figure out what went wrong as well as log messages. Consider an example of testing ElasticSearch service: .. code-block:: python res = es.search(index="test-index", body={"query": {"match_all": {}}}) self.logger.debug("result: %s" % res['hits']) assert res['hits']['total'] == 1, "Expected total 1 hit, but got %d" % res['hits']['total'] for hit in res['hits']['hits']: assert 'kimchy' == hit['_source']['author'], "Expected author kimchy but got %s" % hit['_source']['author'] assert 'Elasticsearch: cool.'
== hit['_source']['text'], "Expected text Elasticsearch: cool. but got %s" % hit['_source']['text'] First, the test outputs the result of a search, so that if any of the following assertions fail, we can see the whole result in ``test_log.debug``. Assertion messages help to quickly see the difference in expected and retrieved results. Fail early ========== Try to avoid a situation where a test fails because of an uncaught failure earlier in the test. Suppose we write a ``start_node`` method that does not check if the service starts successfully. The service fails to start, but we get a test failure indication that there was a problem querying the service. It would be much faster to debug the issue if the test failure pointed to the issue with starting the service. So make sure to add checks for operations that may fail, and fail the test earlier than later. Flaky tests ============ Flaky tests are hard to debug due to their non-determinism, they waste time, and sometimes hide real bugs: developers tend to ignore those failures, and thus could miss real bugs. Flakiness can come from the test itself, the system it is testing, or the environmental issues. Waiting on Conditions ^^^^^^^^^^^^^^^^^^^^^ A common cause of a flaky test is asynchronous wait on conditions. A test makes an asynchronous call and does not properly wait for the result of the call to become available before using it:: node.account.kill_process("zookeeper", allow_fail=False) time.sleep(2) assert not self.alive(node), "Expected Zookeeper service to stop" In this example, the test terminates a zookeeper service via ``kill_process`` and then uses ``time.sleep`` to wait for it to stop. If terminating the process takes longer, the test will fail. The test may intermittently fail based on how fast a process terminates. Of course, there should be a timeout for termination to ensure that test does not run indefinitely. You could increase sleep time, but that also increases the test run length.
A more explicit way to express this condition is to use :meth:`~ducktape.utils.util.wait_until` with a timeout:: node.account.kill_process("zookeeper", allow_fail=False) wait_until(lambda: not self.alive(node), timeout_sec=5, err_msg="Timed out waiting for zookeeper to stop.") The test will progress as soon as the condition is met, and the timeout ensures that the test does not run indefinitely if termination never completes. Think carefully about the condition to check. A common source of issues is an incorrect choice of the condition for a successful service start in the ``start_node`` implementation. One way to check that a service starts successfully is to wait for some specific log output. However, make sure that this specific log message is always printed after the things run successfully. If there is still a chance that the service may fail to start after the log is printed, this may cause race conditions and flaky tests. Sometimes it could be better to check if the service runs successfully by querying a service or checking some metrics if they are available. Test Order Dependency ^^^^^^^^^^^^^^^^^^^^^ Make sure that your services properly clean up the state in the ``clean_node`` implementation. Failure to properly clean up the state can cause the next run of the test to fail or fail intermittently if other tests happen to clean the same directories, for example. One of the benefits of isolation that ducktape assumes is that you can assume you have complete control of the machine. It is ok to delete the entire working space. It is also safe to kill all java processes you can find rather than being more targeted. So, clean up aggressively. Incorrect Assumptions ^^^^^^^^^^^^^^^^^^^^^ It is possible that our assumptions about how the system under test works are incorrect. One way to help debug this is to use more detailed comments explaining why certain checks are made. Tools for Managing Logs ======================= Analyzing and matching up logs from a distributed service could be time consuming. 
There are many good tools for working with logs. Examples include http://lnav.org/, http://list.xmodulo.com/multitail.html, and http://glogg.bonnefon.org/. Validating SSH Issues ======================= Ducktape supports running custom validators when an ssh error occurs, allowing you to run your own validation against a host. This is done simply by running ducktape with the `--ssh-checker-function` flag, followed by the module path to your function, so for instance:: ducktape my-test.py --ssh-checker-function my.module.validator.validate_ssh This function will take in the ssh error raised as its first argument, and the remote account object as its second. ================================================ FILE: docs/index.rst ================================================ .. _topics-index: ============================================================ Distributed System Integration & Performance Testing Library ============================================================ Ducktape contains tools for running system integration and performance tests. It provides the following features: * Write tests for distributed systems in a simple unit test-like style * Isolation by default so system tests are as reliable as possible. * Utilities for pulling up and tearing down services easily in clusters in different environments (e.g. local, custom cluster, Vagrant, K8s, Mesos, Docker, cloud providers, etc.) * Trigger special events (e.g. bouncing a service) * Collect results (e.g. logs, console output) * Report results (e.g. expected conditions met, performance results, etc.) .. toctree:: install test_clusters run_tests new_tests new_services debug_tests api misc changelog Contribute ========== - Source Code: https://github.com/confluentinc/ducktape - Issue Tracker: https://github.com/confluentinc/ducktape/issues License ======= The project is licensed under the Apache 2 license. 
================================================ FILE: docs/install.rst ================================================ .. _topics-install: ======= Install ======= 1. Ducktape requires python 3.7 or later. 2. Install `cryptography`_ (used by `paramiko` which Ducktape depends on), this may have non-python external requirements .. _cryptography: https://cryptography.io/en/latest/installation * OSX (if needed):: brew install openssl * Ubuntu:: sudo apt-get install build-essential libssl-dev libffi-dev python-dev * Fedora and RHEL-derivatives:: sudo yum install gcc libffi-devel python-devel openssl-devel 3. As a general rule, it's recommended to use an isolation tool such as ``virtualenv`` 4. Install Ducktape:: pip install ducktape .. note:: On OSX you may need to:: C_INCLUDE_PATH=/usr/local/opt/openssl/include LIBRARY_PATH=/usr/local/opt/openssl/lib pip install ducktape If you are not using a virtualenv and get the error message `failed with error code 1`, you may need to install ducktape to your user directory instead with :: pip install --user ducktape ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build set SPHINXPROJ=ducktape if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. 
echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% :end popd ================================================ FILE: docs/misc.rst ================================================ .. _topics-misc: ==== Misc ==== Developer Install ================= If you are a ducktape developer, consider using the develop command instead of install. This allows you to make code changes without constantly reinstalling ducktape (see http://stackoverflow.com/questions/19048732/python-setup-py-develop-vs-install for more information):: cd ducktape python setup.py develop To uninstall:: cd ducktape python setup.py develop --uninstall Unit Tests ========== You can run the tests with code coverage and style check using `tox `_:: tox Alternatively, you can activate the virtualenv and run pytest and ruff directly:: source ~/.virtualenvs/ducktape/bin/activate pytest tests ruff check ruff format --check System Tests ============ System tests are included under the `systests/` directory. These tests are end-to-end tests that run across multiple VMs, testing ducktape in an environment similar to how it would be used in practice to test other projects. The system tests run against virtual machines managed by `Vagrant `_. With Vagrant installed, start the VMs (3 by default):: vagrant up From a developer install, running the system tests now looks the same as using ducktape on your own project:: ducktape systests/ You should see the tests running, and then results and logs will be in the default directory, `results/`. By using a developer install, you can make modifications to the ducktape code and iterate on system tests without having to re-install after each modification. 
When you're done running tests, you can destroy the VMs:: vagrant destroy Windows ======= Ducktape supports Services that run on Windows, but only in EC2. When a ``Service`` requires a Windows machine, AWS credentials must be configured on the machine running ducktape. Ducktape uses the `boto3`_ Python module to connect to AWS. And ``boto3`` supports many different `configuration options`_ .. _boto3: https://aws.amazon.com/sdk-for-python/ .. _configuration options: https://boto3.readthedocs.io/en/latest/guide/configuration.html#guide-configuration Here's an example bare minimum configuration using environment variables:: export AWS_ACCESS_KEY_ID="ABC123" export AWS_SECRET_ACCESS_KEY="secret" export AWS_DEFAULT_REGION="us-east-1" The region can be any AWS region, not just ``us-east-1``. ================================================ FILE: docs/new_services.rst ================================================ .. _topics-new_services: =================== Create New Services =================== Writing ducktape services ============================= ``Service`` refers generally to multiple processes, possibly long-running, which you want to run on the test cluster. These can be services you would actually deploy (e.g., Kafka brokers, ZK servers, REST proxy) or processes used during testing (e.g. producer/consumer performance processes). Services that are distributed systems can support a variable number of nodes which allow them to handle a variety of tests. Each service is implemented as a class and should at least implement the following: * :meth:`~ducktape.services.service.Service.start_node` - start the service (possibly waiting to ensure it started successfully) * :meth:`~ducktape.services.service.Service.stop_node` - kill processes on the given node * :meth:`~ducktape.services.service.Service.clean_node` - remove persistent state leftover from testing, e.g. 
log files These may block to ensure services start or stop properly, but must *not* block for the full lifetime of the service. If you need to run a blocking process (e.g. run a process via SSH and iterate over its output), this should be done in a background thread. For services that exit after completing a fixed operation (e.g. produce N messages to topic foo), you should also implement ``wait``, which will usually just wait for background worker threads to exit. The ``Service`` base class provides a helper method ``run`` which wraps ``start``, ``wait``, and ``stop`` for tests that need to start a service and wait for it to finish. You can also provide additional helper methods for common test functionality. Normal services might provide a ``bounce`` method. Most of the code you'll write for a service will just be series of SSH commands and tests of output. You should request the number of nodes you'll need using the ``num_nodes`` or ``cluster_spec`` parameter to the Service base class's constructor. Then, in your Service's methods you'll have access to ``self.nodes`` to access the nodes allocated to your service. Each node has an associated :class:`~ducktape.cluster.remoteaccount.RemoteAccount` instance which lets you easily perform remote operations such as running commands via SSH or creating files. By default, these operations try to hide output (but provide it to you if you need to extract some subset of it) and *checks status codes for errors* so any operations that fail cause an obvious failure of the entire test. .. _service-example-ref: New Service Example =================== Let’s walk through an example of writing a simple Zookeeper service. .. 
code-block:: python class ZookeeperService(Service): PERSISTENT_ROOT = "/mnt" LOG_FILE = os.path.join(PERSISTENT_ROOT, "zk.log") DATA_DIR = os.path.join(PERSISTENT_ROOT, "zookeeper") CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "zookeeper.properties") logs = { "zk_log": { "path": LOG_FILE, "collect_default": True}, "zk_data": { "path": DATA_DIR, "collect_default": False} } def __init__(self, context, num_nodes): super(ZookeeperService, self).__init__(context, num_nodes) ``logs`` is a member of ``Service`` that provides a mechanism for locating and collecting log files produced by the service on its nodes. ``logs`` is a dict with entries that look like ``log_name: {"path": log_path, "collect_default": boolean}``. In our example, log files will be collected on both successful and failed test runs, while files from the data directory will be collected only on failed test runs. Zookeeper service requests the number of nodes passed to its constructor by passing ``num_nodes`` parameters to the Service base class’s constructor. .. 
code-block:: python def start_node(self, node): idx = self.idx(node) self.logger.info("Starting ZK node %d on %s", idx, node.account.hostname) node.account.ssh("mkdir -p %s" % self.DATA_DIR) node.account.ssh("echo %d > %s/myid" % (idx, self.DATA_DIR)) prop_file = """\n dataDir=%s\n clientPort=2181""" % self.DATA_DIR for server_idx, server_node in enumerate(self.nodes, 1): prop_file += "\n server.%d=%s:2888:3888" % (server_idx, server_node.account.hostname) self.logger.info("zookeeper.properties: %s" % prop_file) node.account.create_file(self.CONFIG_FILE, prop_file) start_cmd = "/opt/kafka/bin/zookeeper-server-start.sh %s 1>> %s 2>> %s &" % \ (self.CONFIG_FILE, self.LOG_FILE, self.LOG_FILE) with node.account.monitor_log(self.LOG_FILE) as monitor: node.account.ssh(start_cmd) monitor.wait_until( "binding to port", timeout_sec=100, backoff_sec=7, err_msg="Zookeeper service didn't finish startup" ) self.logger.debug("Zookeeper service is successfully started.") The ``start_node`` method first creates directories and the config file on the given node, and then invokes the start script to start a Zookeeper service. In this simple example, the config file is created from manually constructed ``prop_file`` string, because it has only a couple of easy to construct lines. More complex config files can be created with templates, as described in :ref:`using-templates-ref`. A service may take time to start and get to a usable state. Using sleeps to wait for a service to start often leads to a flaky test. The sleep time may be too short, or the service may fail to start altogether. It is useful to verify that the service starts properly before returning from the ``start_node``, and fail the test if the service fails to start. Otherwise, the test will likely fail later, and it would be harder to find the root cause of the failure. 
One way to check that the service starts successfully is to check whether a service’s process is alive and one additional check that the service is usable such as querying the service or checking some metrics if they are available. Our example checks whether a Zookeeper service is started successfully by searching for a particular output in a log file. The :class:`~ducktape.cluster.remoteaccount.RemoteAccount` instance associated with each node provides you with :class:`~ducktape.cluster.remoteaccount.LogMonitor` that let you check or wait for a pattern to appear in the log. Our example waits for 100 seconds for “binding to port” string to appear in the ``self.LOG_FILE`` log file, and raises an exception if it does not. .. code-block:: python def pids(self, node): try: cmd = "ps ax | grep -i zookeeper | grep java | grep -v grep | awk '{print $1}'" pid_arr = [pid for pid in node.account.ssh_capture(cmd, allow_fail=True, callback=int)] return pid_arr except (RemoteCommandError, ValueError) as e: return [] def alive(self, node): return len(self.pids(node)) > 0 def stop_node(self, node): idx = self.idx(node) self.logger.info("Stopping %s node %d on %s" % (type(self).__name__, idx, node.account.hostname)) node.account.kill_process("zookeeper", allow_fail=False) def clean_node(self, node): self.logger.info("Cleaning Zookeeper node %d on %s", self.idx(node), node.account.hostname) if self.alive(node): self.logger.warn("%s %s was still alive at cleanup time. Killing forcefully..." % (self.__class__.__name__, node.account)) node.account.kill_process("zookeeper", clean_shutdown=False, allow_fail=True) node.account.ssh("rm -rf /mnt/zookeeper /mnt/zookeeper.properties /mnt/zk.log", allow_fail=False) The ``stop_node`` method uses :meth:`~ducktape.cluster.remoteaccount.RemoteAccount.kill_process` to terminate the service process on the given node. 
If the remote command to terminate the process fails, :meth:`~ducktape.cluster.remoteaccount.RemoteAccount.kill_process` will raise a ``RemoteCommandError`` exception. The ``clean_node`` method forcefully kills the process if it is still alive, and then removes persistent state leftover from testing. Make sure to properly clean up the state to avoid test order dependency and flaky tests. You can assume complete control of the machine, so it is safe to delete an entire temporary working space and kill all java processes, etc. .. _using-templates-ref: Using Templates =============== Both ``Service`` and ``Test`` subclass :class:`~ducktape.template.TemplateRenderer` that lets you render templates directly from strings or from files loaded from *templates/* directory relative to the class. A template contains variables and/or expressions, which are replaced with values when a template is rendered. :class:`~ducktape.template.TemplateRenderer` renders templates using `Jinja2 `_ template engine. A good use-case for templates is a properties file that needs to be passed to a service process. In :ref:`service-example-ref`, the properties file is created by building a string and using it as contents as follows:: prop_file = """\n dataDir=%s\n clientPort=2181""" % self.DATA_DIR for server_idx, server_node in enumerate(self.nodes, 1): prop_file += "\n server.%d=%s:2888:3888" % (server_idx, server_node.account.hostname) node.account.create_file(self.CONFIG_FILE, prop_file) A template approach is to add a properties file in *templates/* directory relative to the ZookeeperService class: .. code-block:: rst dataDir={{ DATA_DIR }} clientPort=2181 {% for node in nodes %} server.{{ loop.index }}={{ node.account.hostname }}:2888:3888 {% endfor %} Suppose we named the file zookeeper.properties. The creation of the config file will look like this: .. 
code-block:: python prop_file = self.render('zookeeper.properties') node.account.create_file(self.CONFIG_FILE, prop_file) ================================================ FILE: docs/new_tests.rst ================================================ .. _topics-new_tests: ================ Create New Tests ================ Writing ducktape Tests ====================== Subclass :class:`~ducktape.tests.test.Test` and implement as many ``test`` methods as you want. The name of each test method must start or end with ``test``, e.g. ``test_functionality`` or ``example_test``. Typically, a test will start a few services, collect and/or validate some data, and then finish. If the test method finishes with no exceptions, the test is recorded as successful, otherwise it is recorded as a failure. Here is an example of a test that just starts a Zookeeper cluster with 2 nodes, and a Kafka cluster with 3 nodes:: class StartServicesTest(Test): """Make sure we can start Kafka and Zookeeper services.""" def __init__(self, test_context): super(StartServicesTest, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=2) self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk) def test_services_start(self): self.zk.start() self.kafka.start() Test Parameters =============== Use test decorators to parametrize tests, examples are provided below .. autofunction:: ducktape.mark.parametrize .. autofunction:: ducktape.mark.matrix .. autofunction:: ducktape.mark.resource.cluster .. autofunction:: ducktape.mark.ignore Logging ======= The :class:`~ducktape.tests.test.Test` base class sets up a logger you can use which is tagged by class name, so adding some logging for debugging or to track the progress of tests is easy:: self.logger.debug("End-to-end latency %d: %s", idx, line.strip()) These types of tests can be difficult to debug, so err toward more rather than less logging. .. 
note:: Logs are collected at multiple log levels, and only higher log levels are displayed to the console while the test runs. Make sure you log at the appropriate level. JVM Logging ----------- For Java-based services, ducktape can automatically collect JVM diagnostic logs without requiring any code changes to services or tests. Enable it with the ``--enable-jvm-logs`` flag:: ducktape --enable-jvm-logs When enabled, ducktape wraps the service's ``start_node`` and ``clean_node`` methods to: - Create a log directory (``/mnt/jvm_logs``) on each worker node before the service starts. - Prepend ``JDK_JAVA_OPTIONS`` with the JVM logging flags to every SSH command sent to the node, so the options are inherited by any Java process the service launches. - Remove the log directory after ``clean_node`` runs and restore the original SSH methods. The following JVM options are injected automatically: .. list-table:: :header-rows: 1 :widths: 40 60 * - Option - Purpose * - ``-Xlog:disable`` - Suppress default JVM console output to avoid polluting test logs * - ``-Xlog:gc*:file=/mnt/jvm_logs/gc.log`` - GC activity with timestamps, uptime, level, and tags * - ``-XX:+HeapDumpOnOutOfMemoryError`` - Generate a heap dump when an OOM error occurs * - ``-XX:HeapDumpPath=/mnt/jvm_logs/heap_dump.hprof`` - Location for the heap dump file * - ``-Xlog:safepoint=info:file=/mnt/jvm_logs/jvm.log`` - Safepoint pause events * - ``-Xlog:class+load=info:file=/mnt/jvm_logs/jvm.log`` - Class loading events * - ``-XX:ErrorFile=/mnt/jvm_logs/hs_err_pid%p.log`` - Fatal error log (JVM crashes) * - ``-XX:NativeMemoryTracking=summary`` - Native memory usage tracking * - ``-Xlog:jit+compilation=info:file=/mnt/jvm_logs/jvm.log`` - JIT compilation events The following log files are collected from each node: .. 
list-table:: :header-rows: 1 :widths: 30 20 50 * - File - Collected by default - Contents * - ``gc.log`` - Yes - Garbage collection activity * - ``jvm.log`` - Yes - Safepoint, class loading, and JIT compilation events * - ``heap_dump.hprof`` - No (failure only) - Heap dump generated on OutOfMemoryError .. note:: If a service or test injects its own ``-Xlog`` options as part of the command, those options will override the ones injected by JVM logging, since ducktape prepends ``JDK_JAVA_OPTIONS`` before the command. In practice, services should behave as expected. New test example ================ Lets expand on the StartServicesTest example. The test starts a Zookeeper cluster with 2 nodes, and a Kafka cluster with 3 nodes, and then bounces a kafka broker node which is either a special controller node or a non-controller node, depending on the `bounce_controller_broker` test parameter. .. code-block:: python class StartServicesTest(Test): def __init__(self, test_context): super(StartServicesTest, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=2) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk) def setUp(self): self.zk.start() self.kafka.start() @matrix(bounce_controller_broker=[True, False]) def test_broker_bounce(self, bounce_controller_broker=False): controller_node = self.kafka.controller() self.logger.debug("Found controller broker %s", controller_node.account) if bounce_controller_broker: bounce_node = controller_node else: bounce_node = self.kafka.nodes[(self.kafka.idx(controller_node) + 1) % self.kafka.num_nodes] self.logger.debug("Will hard kill broker %s", bounce_node.account) self.kafka.signal_node(bounce_node, sig=signal.SIGKILL) wait_until(lambda: not self.kafka.is_registered(bounce_node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of hard-killed broker %s" % bounce_node.account) self.kafka.start_node(bounce_node) This will run two tests, 
one with ‘bounce_controller_broker’: False and another with 'bounce_controller_broker': True arguments. We moved start of Zookeeper and Kafka services to :meth:`~ducktape.tests.test.Test.setUp`, which is called before every test run. The test finds which of Kafka broker nodes is a special controller node via provided ``controller`` method in KafkaService. The ``controller`` method in KafkaService will raise an exception if the controller node is not found. Make sure to check the behavior of methods provided by a service or other helper classes and fail the test as soon as an issue is found. That way, it will be much easier to find the cause of the test failure. The test then finds the node to bounce based on `bounce_controller_broker` test parameter and then forcefully terminates the service process on that node via ``signal_node`` method of KafkaService. This method just sends a signal to forcefully kill the process, and does not do any further check. Thus, our test needs to check that the hard killed kafka broker is not part of the Kafka cluster anymore, before restarting the killed broker process. We do this by waiting on ``is_registered`` method provided by KafkaService to return False with a timeout, since de-registering the broker may take some time. Notice the use of ``wait_until`` method instead of a check after ``time.sleep``. This allows the test to continue as soon as de-registration happens. We don’t check if the restarted broker is registered, because this is already done in KafkaService ``start_node`` implementation, which will raise an exception if the service is not started successfully on a given node. 
================================================ FILE: docs/requirements.txt ================================================ Sphinx~=8.2.3 sphinx-argparse~=0.5.2 sphinx-rtd-theme~=3.0.2 boto3==1.33.13 pycryptodome==3.23.0 pywinrm==0.4.3 jinja2~=3.1.6 MarkupSafe~=2.1.5 ================================================ FILE: docs/run_tests.rst ================================================ .. _topics-run_tests: ========= Run Tests ========= Running Tests ============= ducktape discovers and runs tests in the path(s) provided. You can specify a folder with tests (all tests in Python modules named with "test\_" prefix or "_test" suffix will be run), a specific test file (with any name) or even a specific class or test method, via absolute or relative paths. You can optionally specify a specific set of parameters for tests with ``@parametrize`` or ``@matrix`` annotations:: ducktape # e.g. ducktape dir/tests ducktape # e.g. ducktape dir/tests/my_test.py ducktape [::SomeTestClass] # e.g. ducktape dir/tests/my_test.py::TestA ducktape [::SomeTestClass[.test_method]] # e.g. ducktape dir/tests/my_test.py::TestA.test_a ducktape [::TestClass[.method[@params_json]]] # e.g. ducktape 'dir/tests/my_test.py::TestA.test_a@{"x": 100}' Excluding Tests =============== Pass ``--exclude`` flag to exclude certain test(s) from the run, using the same syntax:: ducktape ./my_tests_dir --exclude ./my_tests_dir/test_a.py ./my_tests_dir/test_b.py::TestB.test_b Test Suites =========== Test suite is a collection of tests to run, optionally also specifying which tests to exclude. Test suites are specified via YAML file .. 
code-block:: yaml # list all tests that are part of the suite under the test suite name: my_test_suite: - ./my_tests_dir/ # paths are relative to the test suite file location - ./another_tests_dir/test_file.py::TestClass.test_method # same syntax as passing tests directly to ducktape - './another_tests_dir/test_file.py::TestClass.parametrized_method@{"x": 100}' # params are supported too - ./third_tests_dir/prefix_*.py # basic globs are supported (* and ? characters) # each YAML file can contain one or more test suites: another_test_suite: # you can optionally specify excluded tests in the suite as well using the following syntax: included: - ./some_tests_dir/ excluded: - ./some_tests_dir/*_large_test.py Running Test Suites =================== Test suites are run in the same fashion as separate tests. Run a single test suite:: ducktape ./path/to/test_suite.yml Run multiple test suites:: ducktape ./path/to/test_suite_1.yml ./test_suite_2.yml You can specify both tests and test suites at the same time:: ducktape ./my_test.py ./my_test_suite.yml ./another_test.py::TestClass.test_method If the same test method is effectively specified more than once, it will only be executed once. For example, if ``test_suite.yml`` lists ``test_a.py`` then running the following command will execute ``test_a.py`` only once:: ducktape test_suite.yml test_a.py If you specify a folder, all tests (i.e. Python files) under that folder will be discovered, but test suites will not be. For example, if ``test_dir`` contains ``my_test.py`` and ``my_test_suite.yml``, then running:: ducktape ./test_dir will execute ``my_test.py`` but skip ``my_test_suite.yml``. To execute both ``my_test.py`` and ``my_test_suite.yml`` you need to specify test suite path explicitly:: ducktape ./test_dir/ ./test_dir/my_test_suite.yml Exclude and Test Suites ======================= Exclude section in the test suite applies only to that test suite. 
``--exclude`` parameter passed to ducktape applies to all loaded tests and test suites. For example, if ``test_dir`` contains ``test_a.py``, ``test_b.py`` and ``test_c.py``, and ``test_suite.yml`` is: .. code-block:: yaml suite_one: included: - ./test_dir/*.py excluded: - ./test_dir/test_a.py suite_two: included: - ./test_dir/ excluded: - ./test_dir/test_b.py Then running:: ducktape test_suite.yml runs each of ``test_a.py``, ``test_b.py`` and ``test_c.py`` once But running:: ducktape test_suite.yml --exclude test_dir/test_a.py runs only ``test_b.py`` and ``test_c.py`` once, and skips ``test_a.py``. Options ======= To see a complete listing of options run:: ducktape --help .. argparse:: :module: ducktape.command_line.parse_args :func: create_ducktape_parser :prog: ducktape Configuration File ================== You can configure options in three locations: on the command line (highest priority), in a user configuration file in ``~/.ducktape/config``, and in a project-specific configuration ``/.ducktape/config`` (lowest priority). Configuration files use the same syntax as command line arguments and may split arguments across multiple lines:: --debug --exit-first --cluster=ducktape.cluster.json.JsonCluster Output ====== Test results go in ``results/.`` which looks like ``--``. For example: ``results/2015-03-28--002`` ducktape does its best to group test results and log files in a sensible way. The output directory is structured like so:: session_log.info session_log.debug report.txt # Summary report of all tests run in this session report.html # Open this to see summary report in a browser report.css test_log.info test_log.debug report.txt # Report on this single test [data.json] # Present if the test returns data some_logs some_logs ... To see an example of the output structure, go `here`_ and click on one of the details links. .. 
_here: http://testing.confluent.io/confluent-kafka-system-test-results/ ================================================ FILE: docs/test_clusters.rst ================================================ .. _topics-test_clusters: =================== Test Clusters =================== Ducktape runs on a test cluster with several nodes. Ducktape will take ownership of the nodes and handle starting, stopping, and running services on them. Many test environments are possible. The nodes may be local nodes, running inside Docker. Or they could be virtual machines running on a public cloud. Cluster Specifications ====================== A cluster specification-- also called a ClusterSpec-- describes a particular cluster configuration. Originally, cluster specifications could only express the number of nodes of each operating system. Now, with heterogeneous cluster support, specifications can also include node types (e.g., "small", "large") for more fine-grained resource allocation. See `Heterogeneous Clusters`_ for more details. Cluster specifications give us a vocabulary to express what a particular service or test needs to run. For example, a service might require a cluster with three Linux nodes and one Windows node. We could express that with a ClusterSpec containing three Linux NodeSpec objects and one Windows NodeSpec object. Heterogeneous Clusters ====================== Ducktape supports heterogeneous clusters where nodes can have different types (e.g., "small", "large", "arm64"). This allows tests to request specific node types while maintaining backward compatibility with existing tests. 
Using Node Types in Tests ------------------------- Use the ``@cluster`` decorator with ``node_type`` to request specific node types:: from ducktape.mark.resource import cluster @cluster(num_nodes=3, node_type="large") def test_with_large_nodes(self): # This test requires 3 large nodes pass @cluster(num_nodes=5) def test_any_nodes(self): # This test accepts any 5 nodes (backward compatible) pass Cluster Configuration --------------------- Node types are defined in your ``cluster.json`` file:: { "nodes": [ { "ssh_config": {"host": "worker1", ...}, "node_type": "small" }, { "ssh_config": {"host": "worker2", ...}, "node_type": "large" } ] } Backward Compatibility ---------------------- - Tests without ``node_type`` will match **any available node** - Existing tests and cluster configurations continue to work unchanged - Node type is optional in both test annotations and cluster configuration ================================================ FILE: ducktape/__init__.py ================================================ __version__ = "0.14.0" ================================================ FILE: ducktape/__main__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class Cluster(object):
    """Interface for a cluster -- a collection of nodes with login credentials.

    This interface doesn't define any mapping of roles/services to nodes. It only
    interacts with some underlying system that can describe available resources and
    mediates reservations of those resources.
    """

    def __init__(self):
        # High-water mark of simultaneously allocated nodes; updated on each alloc().
        self.max_used_nodes = 0

    def __len__(self) -> int:
        """Size of this cluster object, i.e. number of 'nodes' in the cluster."""
        return self.available().size() + self.used().size()

    def alloc(self, cluster_spec) -> "Union[ClusterNode, List[ClusterNode], 'Cluster']":
        """
        Allocate some nodes.

        :param cluster_spec: A ClusterSpec describing the nodes to be allocated.
        :throws InsufficientResources: If the nodes cannot be allocated.
        :return: Allocated nodes spec
        """
        nodes = self.do_alloc(cluster_spec)
        self.max_used_nodes = max(self.max_used_nodes, len(self.used()))
        return nodes

    def do_alloc(self, cluster_spec) -> "Union[ClusterNode, List[ClusterNode], 'Cluster']":
        """
        Subclasses should implement actual allocation here.

        :param cluster_spec: A ClusterSpec describing the nodes to be allocated.
        :throws InsufficientResources: If the nodes cannot be allocated.
        :return: Allocated nodes spec
        """
        raise NotImplementedError

    def free(self, nodes: "Union[Iterable[ClusterNode], ClusterNode]") -> None:
        """Free the given node or list of nodes."""
        if not isinstance(nodes, collections.abc.Iterable):
            self.free_single(nodes)
            return
        for node in nodes:
            self.free_single(node)

    def free_single(self, node: "ClusterNode") -> None:
        raise NotImplementedError()

    def __eq__(self, other):
        return other is not None and self.__dict__ == other.__dict__

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))

    def num_available_nodes(self) -> int:
        return self.available().size()

    def available(self) -> "ClusterSpec":
        """Return a ClusterSpec object describing the currently available nodes."""
        raise NotImplementedError

    def used(self) -> "ClusterSpec":
        """Return a ClusterSpec object describing the currently in use nodes."""
        raise NotImplementedError

    def max_used(self) -> int:
        """Peak number of nodes that were simultaneously in use."""
        return self.max_used_nodes

    def all(self):
        """
        Return a ClusterSpec object describing all nodes (available plus used).
        """
        return self.available().clone().add(self.used())
""" return self.available().clone().add(self.used()) ================================================ FILE: ducktape/cluster/cluster_node.py ================================================ from typing import Optional from ducktape.cluster.remoteaccount import RemoteAccount class ClusterNode(object): def __init__(self, account: RemoteAccount, **kwargs): self.account = account for k, v in kwargs.items(): setattr(self, k, v) @property def name(self) -> Optional[str]: return self.account.hostname @property def operating_system(self) -> Optional[str]: return self.account.operating_system ================================================ FILE: ducktape/cluster/cluster_spec.py ================================================ # Copyright 2017 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import import json import typing from ducktape.cluster.node_container import NodeContainer from .consts import LINUX from .node_spec import NodeSpec class ClusterSpec(object): """ The specification for a ducktape cluster. """ nodes: typing.Optional[NodeContainer] = None @staticmethod def empty(): return ClusterSpec([]) @staticmethod def simple_linux(num_nodes, node_type=None): """ Create a ClusterSpec for Linux nodes, optionally of a specific type. 
class ClusterSpec(object):
    """
    The specification for a ducktape cluster: a multiset of NodeSpec objects.
    """

    nodes: "typing.Optional[NodeContainer]" = None

    @staticmethod
    def empty():
        """Return a ClusterSpec containing no nodes."""
        return ClusterSpec([])

    @staticmethod
    def simple_linux(num_nodes, node_type=None):
        """
        Create a ClusterSpec for Linux nodes, optionally of a specific type.

        Examples:
            ClusterSpec.simple_linux(5)           # 5 nodes, any type
            ClusterSpec.simple_linux(3, "large")  # 3 large nodes

        :param num_nodes: Number of Linux nodes
        :param node_type: Optional node type label (e.g., "large", "small")
        """
        return ClusterSpec([NodeSpec(LINUX, node_type)] * num_nodes)

    @staticmethod
    def from_nodes(nodes):
        """
        Create a ClusterSpec describing a list of nodes.
        """
        specs = [NodeSpec(node.operating_system, getattr(node, "node_type", None)) for node in nodes]
        return ClusterSpec(specs)

    def __init__(self, nodes=None):
        """
        Initialize the ClusterSpec.

        :param nodes: A collection of NodeSpecs, or None to create an empty cluster spec.
        """
        self.nodes = NodeContainer(nodes)

    def __len__(self):
        return self.size()

    def __iter__(self):
        return self.nodes.elements()

    def size(self):
        """Return the total size of this cluster spec, including all types of nodes."""
        return self.nodes.size()

    def add(self, other):
        """
        Add another ClusterSpec to this one.

        :param other: The other cluster spec. This will not be modified.
        :return: This ClusterSpec (for chaining).
        """
        for spec in other.nodes:
            self.nodes.add_node(spec)
        return self

    def clone(self):
        """
        Returns a deep copy of this object.
        """
        return ClusterSpec(self.nodes.clone())

    def __str__(self):
        # Count identical node specs, keyed by their canonical JSON string.
        counts = {}
        for spec in self.nodes.elements():
            spec_str = str(spec)
            counts[spec_str] = counts.get(spec_str, 0) + 1
        summary = []
        for spec_str in sorted(counts.keys()):
            entry = json.loads(spec_str)
            entry["num_nodes"] = counts[spec_str]
            summary.append(entry)
        return json.dumps(summary, sort_keys=True)
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. LINUX = "linux" WINDOWS = "windows" SUPPORTED_OS_TYPES = [LINUX, WINDOWS] ================================================ FILE: ducktape/cluster/finite_subcluster.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class FiniteSubcluster(Cluster):
    """This cluster class gives us a mechanism for allocating finite blocks of nodes from another cluster."""

    def __init__(self, nodes: typing.Iterable[ClusterNode]):
        super(FiniteSubcluster, self).__init__()
        self.nodes = nodes
        self._available_nodes = NodeContainer(nodes)
        self._in_use_nodes = NodeContainer()

    def do_alloc(self, cluster_spec) -> typing.List[ClusterNode]:
        """Allocate nodes satisfying ``cluster_spec`` from the free pool.

        FiniteSubcluster holds ClusterNode objects, which NodeContainer.remove_spec
        never health-checks, so no bad nodes can appear here. Over-allocation
        (requesting more nodes than remain available) is reported by the container
        itself, for consistency with the other Cluster implementations; earlier
        ducktape versions raised that error from remove_spec directly.
        """
        healthy, _unhealthy = self._available_nodes.remove_spec(cluster_spec)
        self._in_use_nodes.add_nodes(healthy)
        return healthy

    def free_single(self, node):
        """Return a single node to the free pool."""
        self._in_use_nodes.remove_node(node)
        self._available_nodes.add_node(node)

    def available(self):
        """ClusterSpec describing the nodes currently free for allocation."""
        return ClusterSpec.from_nodes(self._available_nodes)

    def used(self):
        """ClusterSpec describing the nodes currently allocated."""
        return ClusterSpec.from_nodes(self._in_use_nodes)
def make_remote_account(ssh_config, *args, **kwargs):
    """Factory function for creating the correct RemoteAccount implementation.

    Hosts whose ssh-config host name contains the windows marker get a
    WindowsRemoteAccount; everything else is treated as Linux.
    """
    host = ssh_config.host
    if host and WINDOWS in host:
        return WindowsRemoteAccount(ssh_config, *args, **kwargs)
    return LinuxRemoteAccount(ssh_config, *args, **kwargs)
class JsonCluster(Cluster):
    """An implementation of Cluster that uses static settings specified in a cluster file
    or json-serializable dict"""

    def __init__(
        self,
        cluster_json=None,
        *args,
        make_remote_account_func=make_remote_account,
        **kwargs,
    ):
        """Initialize JsonCluster

        JsonCluster can be initialized from:

            - a json-serializable dict
            - a "cluster_file" containing json

        :param cluster_json: a json-serializable dict containing node information.
            If ``cluster_json`` is None, load from file
        :param cluster_file (optional): Overrides the default location of the json cluster file

        Example json with a local Vagrant cluster::

            {
              "nodes": [
                {
                  "externally_routable_ip": "192.168.50.151",
                  "ssh_config": {
                    "host": "worker1",
                    "hostname": "127.0.0.1",
                    "identityfile": "/path/to/private_key",
                    "password": null,
                    "port": 2222,
                    "user": "vagrant"
                  }
                },
                {
                  "externally_routable_ip": "192.168.50.151",
                  "ssh_config": {
                    "host": "worker2",
                    "hostname": "127.0.0.1",
                    "identityfile": "/path/to/private_key",
                    "password": null,
                    "port": 2223,
                    "user": "vagrant"
                  }
                }
              ]
            }
        """
        super(JsonCluster, self).__init__()
        self._available_accounts: NodeContainer = NodeContainer()
        self._bad_accounts: NodeContainer = NodeContainer()
        self._in_use_nodes: NodeContainer = NodeContainer()
        if cluster_json is None:
            # This is a direct instantiation of JsonCluster rather than from a subclass
            # (e.g. VagrantCluster), so load the cluster definition from the cluster file.
            cluster_file = kwargs.get("cluster_file")
            if cluster_file is None:
                cluster_file = ConsoleDefaults.CLUSTER_FILE
            # Use a context manager so the file handle is closed promptly;
            # json.load(open(...)) leaked the handle.
            with open(os.path.abspath(cluster_file)) as f:
                cluster_json = json.load(f)
        try:
            for ninfo in cluster_json["nodes"]:
                ssh_config_dict = ninfo.get("ssh_config")
                if ssh_config_dict is None:
                    # Explicit raise instead of `assert`, which is stripped under -O.
                    # The surrounding except re-wraps this into the ValueError callers see.
                    raise ValueError(
                        "Cluster json has a node without a ssh_config field: %s\n Cluster json: %s"
                        % (ninfo, cluster_json)
                    )
                ssh_config = RemoteAccountSSHConfig(**ssh_config_dict)
                # Extract node_type from JSON (optional field)
                node_type = ninfo.get("node_type")
                remote_account = make_remote_account_func(
                    ssh_config,
                    ninfo.get("externally_routable_ip"),
                    node_type=node_type,
                    ssh_exception_checks=kwargs.get("ssh_exception_checks"),
                )
                if remote_account.externally_routable_ip is None:
                    remote_account.externally_routable_ip = self._externally_routable_ip(remote_account)
                self._available_accounts.add_node(remote_account)
        except Exception as e:
            # Narrowed from `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit and converted them to ValueError.
            msg = "JSON cluster definition invalid: %s: %s" % (
                e,
                traceback.format_exc(limit=16),
            )
            raise ValueError(msg) from e
        self._id_supplier = 0

    def do_alloc(self, cluster_spec):
        """Allocate accounts matching cluster_spec, wrapping each in a ClusterNode.

        Unhealthy accounts discovered during allocation are quarantined in
        self._bad_accounts, whether or not the allocation ultimately succeeds.
        """
        try:
            good_nodes, bad_nodes = self._available_accounts.remove_spec(cluster_spec)
        except InsufficientHealthyNodesError as e:
            self._bad_accounts.add_nodes(e.bad_nodes)
            raise e
        # even in case of no exceptions, we can still run into bad nodes, so let's track them
        if bad_nodes:
            self._bad_accounts.add_nodes(bad_nodes)
        # now let's gather all the good ones and convert them into ClusterNode objects
        allocated_nodes = []
        for account in good_nodes:
            allocated_nodes.append(ClusterNode(account, slot_id=self._id_supplier))
            self._id_supplier += 1
        self._in_use_nodes.add_nodes(allocated_nodes)
        return allocated_nodes

    def free_single(self, node):
        """Return the node's account to the available pool and close its connections."""
        self._in_use_nodes.remove_node(node)
        self._available_accounts.add_node(node.account)
        node.account.close()

    def _externally_routable_ip(self, account):
        # Subclasses (e.g. VagrantCluster) may override to discover the IP.
        return None

    def available(self):
        return ClusterSpec.from_nodes(self._available_accounts)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)
class LinuxRemoteAccount(RemoteAccount):
    def __init__(self, *args, **kwargs):
        super(LinuxRemoteAccount, self).__init__(*args, **kwargs)
        self._ssh_client: Optional[SSHClient] = None
        self._sftp_client: Optional[SFTPClient] = None
        self.os = LINUX

    @property
    def local(self):
        """Returns True if this 'remote' account is probably local.
        This is an imperfect heuristic, but should work for simple local testing."""
        return self.hostname == "localhost" and self.user is None and self.ssh_config is None

    def get_network_devices(self):
        """
        Utility to get all network devices on a linux account
        """
        # listdir already returns a list; no need to rebuild it element by element.
        return self.sftp_client.listdir("/sys/class/net")

    def get_external_accessible_network_devices(self):
        """
        Gets the subset of devices accessible through an external connection
        """
        return [
            device
            for device in self.get_network_devices()
            if device != "lo"  # do not include local device
            and (device.startswith("en") or device.startswith("eth"))
            # filter out other devices; "en" means ethernet
            # eth0 can also sometimes happen, see https://unix.stackexchange.com/q/134483
        ]

    # deprecated, please use the self.externally_routable_ip that is set in your cluster,
    # not explicitly deprecating it as it's used by vagrant cluster
    def fetch_externally_routable_ip(self, is_aws=None):
        """Best-effort discovery of this host's externally routable IP via ifconfig.

        :param is_aws: deprecated; ignored except for a warning.
        :raises RemoteAccountError: if no candidate network devices are found.
        """
        if is_aws is not None:
            self.logger.warning("fetch_externally_routable_ip: is_aws is a deprecated flag, and does nothing")
        devices = self.get_external_accessible_network_devices()
        self.logger.debug("found devices: {}".format(devices))
        if not devices:
            raise RemoteAccountError(self, "Couldn't find any network devices")
        fmt_cmd = (
            "/sbin/ifconfig {device} | "
            "grep 'inet ' | "
            "tail -n 1 | "
            r"egrep -o '[0-9\.]+' | "
            "head -n 1 2>&1"
        )
        ips = ["".join(self.ssh_capture(fmt_cmd.format(device=device))).strip() for device in devices]
        self.logger.debug("found ips: {}".format(ips))
        self.logger.debug("returning the first ip found")
        return next(iter(ips))
""" def __init__(self, *args, **kwargs): super(LocalhostCluster, self).__init__() num_nodes = kwargs.get("num_nodes", 1000) self._available_nodes = NodeContainer() for i in range(num_nodes): ssh_config = RemoteAccountSSHConfig("localhost%d" % i, hostname="localhost", port=22) self._available_nodes.add_node( ClusterNode( LinuxRemoteAccount( ssh_config, ssh_exception_checks=kwargs.get("ssh_exception_checks"), ) ) ) self._in_use_nodes = NodeContainer() def do_alloc(self, cluster_spec): # there shouldn't be any bad nodes in localhost cluster # since ClusterNode object does not implement `available()` method good_nodes, bad_nodes = self._available_nodes.remove_spec(cluster_spec) self._in_use_nodes.add_nodes(good_nodes) return good_nodes def free_single(self, node): self._in_use_nodes.remove_node(node) self._available_nodes.add_node(node) node.account.close() def available(self): return ClusterSpec.from_nodes(self._available_nodes) def used(self): return ClusterSpec.from_nodes(self._in_use_nodes) ================================================ FILE: ducktape/cluster/node_container.py ================================================ # Copyright 2017 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class NodeNotPresentError(Exception):
    """Raised when asked to remove a node that is not in the container."""

    pass


class InsufficientResourcesError(Exception):
    """Raised when a cluster spec asks for more nodes than the container holds."""

    pass


class InsufficientHealthyNodesError(InsufficientResourcesError):
    """Raised when enough nodes exist in total, but too many failed their health check.

    ``bad_nodes`` holds the nodes that failed during allocation so the caller can
    quarantine them (see JsonCluster.do_alloc).
    """

    def __init__(self, bad_nodes: List, *args):
        self.bad_nodes = bad_nodes
        super().__init__(*args)


def _get_node_key(node: NodeType) -> NodeGroupKey:
    """Extract the (os, node_type) key from a node.

    Both attributes are read defensively with getattr, since plain ClusterNode
    objects may not define node_type.
    """
    os = getattr(node, "operating_system", None)
    node_type = getattr(node, "node_type", None)
    return (os, node_type)


class NodeContainer(object):
    """
    Container for cluster nodes, grouped by (operating_system, node_type).

    This enables efficient lookup and allocation of nodes matching specific
    requirements. Nodes with node_type=None are grouped under (os, None) and
    can match any request when no specific type is required.
    """

    # Key: (os, node_type) tuple, Value: list of nodes
    node_groups: Dict[NodeGroupKey, List[NodeType]]

    def __init__(self, nodes: Optional[Iterable[NodeType]] = None) -> None:
        """
        Create a NodeContainer with the given nodes.

        Node objects should implement at least an operating_system property,
        and optionally a node_type property.

        :param nodes: A collection of node objects to add, or None to add nothing.
        """
        self.node_groups = {}
        if nodes is not None:
            for node in nodes:
                self.add_node(node)

    def size(self) -> int:
        """
        Returns the total number of nodes in the container, across every group.
        """
        return sum([len(val) for val in self.node_groups.values()])

    def __len__(self):
        return self.size()

    def __iter__(self) -> Iterator[Any]:
        return self.elements()

    def elements(self, operating_system: Optional[str] = None, node_type: Optional[str] = None) -> Iterator[NodeType]:
        """
        Yield the elements in this container.

        :param operating_system: If this is non-None, we will iterate only over elements
            which have this operating system.
        :param node_type: If this is non-None, we will iterate only over elements
            which have this node type.
        """
        for (os, nt), node_list in self.node_groups.items():
            # Filter by OS if specified
            if operating_system is not None and os != operating_system:
                continue
            # Filter by node_type if specified
            if node_type is not None and nt != node_type:
                continue
            for node in node_list:
                yield node

    def grouped_by_os_and_type(self) -> Dict[Tuple[Optional[str], Optional[str]], int]:
        """
        Returns nodes grouped by (operating_system, node_type) with counts.

        This is a pure data method that groups nodes without any ordering.
        The caller is responsible for determining processing order
        (see remove_spec's allocation_order).

        :return: Dictionary mapping (os, node_type) tuples to counts
        """
        result: Dict[Tuple[Optional[str], Optional[str]], int] = {}
        for node in self.elements():
            key = (getattr(node, "operating_system", None), getattr(node, "node_type", None))
            result[key] = result.get(key, 0) + 1
        return result

    def add_node(self, node: Union[ClusterNode, RemoteAccount]) -> None:
        """
        Add a node to this collection, grouping by (os, node_type).

        :param node: The node to add.
        """
        key = _get_node_key(node)
        self.node_groups.setdefault(key, []).append(node)

    def add_nodes(self, nodes):
        """
        Add a collection of nodes to this collection.

        :param nodes: The nodes to add.
        """
        for node in nodes:
            self.add_node(node)

    def remove_node(self, node):
        """
        Removes a node from this collection.

        :param node: The node to remove.
        :returns: None. (``list.remove`` returns None; earlier docs claimed the
            removed node was returned, which was never the case.)
        :throws NodeNotPresentError: If the node is not in the collection.
        """
        key = _get_node_key(node)
        try:
            # list.remove raises ValueError when the node is absent, including
            # when there is no group for this key at all (the `.get(key, [])`).
            return self.node_groups.get(key, []).remove(node)
        except ValueError:
            raise NodeNotPresentError

    def remove_nodes(self, nodes):
        """
        Remove a collection of nodes from this collection.

        :param nodes: The nodes to remove.
        """
        for node in nodes:
            self.remove_node(node)

    def _find_matching_nodes(
        self, required_os: str, required_node_type: Optional[str], num_needed: int
    ) -> Tuple[List[NodeType], List[NodeType], int]:
        """
        Find (and remove from this container) nodes matching the required OS and node_type.

        Matching rules:
        - OS must match exactly
        - If required_node_type is None, match nodes of ANY type for this OS
        - If required_node_type is specified, match only nodes with that exact type

        RemoteAccount candidates are health-checked via ``available()``; failures go
        to bad_nodes and are NOT returned to the container. ClusterNode candidates
        are accepted without a health check.

        :param required_os: The required operating system
        :param required_node_type: The required node type (None means any)
        :param num_needed: Number of nodes needed
        :return: Tuple of (good_nodes, bad_nodes, shortfall) where shortfall is how many more we need
        """
        good_nodes: List[NodeType] = []
        bad_nodes: List[NodeType] = []

        # Collect candidate keys - keys in node_groups that can satisfy this requirement
        candidate_keys: List[NodeGroupKey] = []
        for os, nt in self.node_groups.keys():
            if os != required_os:
                continue
            # If no specific type required, any node of this OS matches
            # If specific type required, only exact match
            if required_node_type is None or nt == required_node_type:
                candidate_keys.append((os, nt))

        # Try to allocate from candidate pools
        for key in candidate_keys:
            if len(good_nodes) >= num_needed:
                break
            avail_nodes = self.node_groups.get(key, [])
            # pop(0) preserves FIFO order within a group (at O(n) per pop).
            while avail_nodes and len(good_nodes) < num_needed:
                node = avail_nodes.pop(0)
                if isinstance(node, RemoteAccount):
                    if node.available():
                        good_nodes.append(node)
                    else:
                        bad_nodes.append(node)
                else:
                    good_nodes.append(node)

        shortfall = max(0, num_needed - len(good_nodes))
        return good_nodes, bad_nodes, shortfall

    def remove_spec(self, cluster_spec: ClusterSpec) -> Tuple[List[NodeType], List[NodeType]]:
        """
        Remove nodes matching a ClusterSpec from this NodeContainer.

        Allocation strategy:
        - Specific node_type requirements are allocated BEFORE any-type (None) requirements
        - For each (os, node_type) in the spec:
          - If node_type is specified, allocate from that exact pool
          - If node_type is None, allocate from any pool matching the OS

        On an insufficient-healthy-nodes failure, the good nodes already taken are
        returned to the container, but bad nodes stay removed (the caller receives
        them via the exception's ``bad_nodes``).

        :param cluster_spec: The cluster spec.  This will not be modified.
        :returns: Tuple of (good_nodes, bad_nodes).
        :raises: InsufficientResourcesError when there aren't enough total nodes
                 InsufficientHealthyNodesError when there aren't enough healthy nodes
        """
        # Feasibility check first: if total capacity can't satisfy the spec,
        # fail before removing anything from the container.
        err = self.attempt_remove_spec(cluster_spec)
        if err:
            raise InsufficientResourcesError(err)
        good_nodes: List[NodeType] = []
        bad_nodes: List[NodeType] = []
        msg = ""
        # Get requirements grouped by (os, node_type) with counts
        grouped_counts = cluster_spec.nodes.grouped_by_os_and_type()

        # Sort so specific types (node_type != None) are allocated before any-type (node_type == None)
        # This prevents any-type requests from "stealing" nodes needed by specific types
        def allocation_order(item: Tuple[Tuple[str, Optional[str]], int]) -> Tuple[int, str, str]:
            (os, node_type), _ = item
            # Specific types first (0), any-type last (1)
            type_order = 1 if node_type is None else 0
            return (type_order, os or "", node_type or "")

        sorted_requirements = sorted(grouped_counts.items(), key=allocation_order)
        for (os, node_type), num_needed in sorted_requirements:
            found_good, found_bad, shortfall = self._find_matching_nodes(os, node_type, num_needed)
            good_nodes.extend(found_good)
            bad_nodes.extend(found_bad)
            if shortfall > 0:
                type_desc = f"{os}" if node_type is None else f"{os}/{node_type}"
                msg += f"{type_desc} nodes requested: {num_needed}. Healthy nodes available: {len(found_good)}. "
        if msg:
            # Return good nodes back to the container
            for node in good_nodes:
                self.add_node(node)
            raise InsufficientHealthyNodesError(bad_nodes, msg)
        return good_nodes, bad_nodes

    def can_remove_spec(self, cluster_spec: ClusterSpec) -> bool:
        """
        Determine if we can remove nodes matching a ClusterSpec from this
        NodeContainer. This container will not be modified.

        :param cluster_spec: The cluster spec.  This will not be modified.
        :returns: True if we could remove the nodes; false otherwise
        """
        msg = self.attempt_remove_spec(cluster_spec)
        return len(msg) == 0

    def _count_nodes_by_os(self, target_os: str) -> int:
        """
        Count total nodes available for a given OS (regardless of node_type).

        :param target_os: The operating system to count nodes for
        :return: Total number of nodes with the given OS
        """
        count = 0
        for (os, _), nodes in self.node_groups.items():
            if os == target_os:
                count += len(nodes)
        return count

    def _count_nodes_by_os_and_type(self, target_os: str, target_type: str) -> int:
        """
        Count nodes available for a specific (os, node_type) combination.

        :param target_os: The operating system
        :param target_type: The specific node type (not None)
        :return: Number of nodes matching both OS and type
        """
        return len(self.node_groups.get((target_os, target_type), []))

    def attempt_remove_spec(self, cluster_spec: ClusterSpec) -> str:
        """
        Attempt to remove a cluster_spec from this node container (dry run; the
        container is never modified).

        Uses holistic per-OS validation to correctly handle mixed typed and
        untyped requirements without double-counting shared capacity.

        Validation strategy:
        1. Check total OS capacity >= total OS demand
        2. Check each specific type has enough nodes
        3. Check remaining capacity (after specific types) >= any-type demand

        :param cluster_spec: The cluster spec.  This will not be modified.
        :returns: An empty string if we can remove the nodes; an error string otherwise.
        """
        # if cluster_spec is None this means the test cannot be run at all
        # e.g. users didn't specify `@cluster` annotation on it but the session context has a flag to fail
        # on such tests or any other state where the test deems its cluster spec incorrect.
        if cluster_spec is None:
            return "Invalid or missing cluster spec"
        # cluster spec may be empty and that's ok, shortcut to returning no error messages
        elif len(cluster_spec) == 0 or cluster_spec.nodes is None:
            return ""
        msg = ""

        # Build requirements_by_os: {os -> {node_type -> count}} in a single pass
        requirements_by_os: Dict[str, Dict[Optional[str], int]] = {}
        for node_spec in cluster_spec.nodes.elements():
            os = node_spec.operating_system
            node_type = node_spec.node_type
            requirements_by_os.setdefault(os, {})
            requirements_by_os[os][node_type] = requirements_by_os[os].get(node_type, 0) + 1

        # Validate each OS holistically
        for os, type_requirements in requirements_by_os.items():
            total_available = self._count_nodes_by_os(os)
            total_required = sum(type_requirements.values())

            # Check 1: Total capacity for this OS
            if total_available < total_required:
                msg += f"{os} nodes requested: {total_required}. {os} nodes available: {total_available}. "
                continue  # Already failed, no need for detailed checks

            # Check 2: Each specific type has enough nodes
            for node_type, count_needed in type_requirements.items():
                if node_type is None:
                    continue  # Handle any-type separately
                type_available = self._count_nodes_by_os_and_type(os, node_type)
                if type_available < count_needed:
                    msg += f"{os}/{node_type} nodes requested: {count_needed}. {os}/{node_type} nodes available: {type_available}. "

            # Check 3: After reserving specific types, is there capacity for any-type?
            any_type_demand = type_requirements.get(None, 0)
            if any_type_demand > 0:
                specific_demand = sum(c for t, c in type_requirements.items() if t is not None)
                remaining_capacity = total_available - specific_demand
                if remaining_capacity < any_type_demand:
                    msg += (
                        f"{os} (any type) nodes requested: {any_type_demand}. "
                        f"{os} nodes remaining after typed allocations: {remaining_capacity}. "
                    )

        return msg

    def clone(self) -> "NodeContainer":
        """
        Returns a deep copy of this object.

        NOTE(review): "deep" refers to the grouping dict and its lists only --
        the node objects themselves are shared with the original container.
        """
        container = NodeContainer()
        for key, nodes in self.node_groups.items():
            for node in nodes:
                container.node_groups.setdefault(key, []).append(node)
        return container
class NodeSpec(object):
    """
    Specification for a single cluster node.

    The node_type field is a generic label that can represent size, architecture,
    or any classification scheme defined by the cluster configuration. When None,
    it matches any available node (backward compatible behavior).

    :param operating_system: The operating system of the node.
    :param node_type: Node type label (e.g., "large", "small"). None means "match any".
    """

    def __init__(self, operating_system: str = LINUX, node_type: Optional[str] = None):
        self.operating_system = operating_system
        self.node_type = node_type
        if self.operating_system not in SUPPORTED_OS_TYPES:
            raise RuntimeError("Unsupported os type %s" % self.operating_system)

    def matches(self, available_node_spec: "NodeSpec") -> bool:
        """
        Check if this requirement can be satisfied by an available node.

        Matching rules:
        - OS must match exactly
        - If requested node_type is None, match any type
        - If requested node_type is specified, must match exactly

        :param available_node_spec: The specification of an available node
        :return: True if this requirement matches the available node
        """
        if self.operating_system != available_node_spec.operating_system:
            return False
        # A requirement without a node_type accepts any node of the right OS.
        return self.node_type is None or self.node_type == available_node_spec.node_type

    def __str__(self):
        payload = {"os": self.operating_system}
        if self.node_type is not None:
            payload["node_type"] = self.node_type
        return json.dumps(payload, sort_keys=True)

    def __eq__(self, other):
        if not isinstance(other, NodeSpec):
            return False
        return (self.operating_system, self.node_type) == (other.operating_system, other.node_type)

    def __hash__(self):
        return hash((self.operating_system, self.node_type))
import logging
import os
import shutil
import signal
import socket
import stat
import tempfile
import warnings
from contextlib import contextmanager
from typing import Callable, List, Optional, Union

from paramiko import MissingHostKeyPolicy, SFTPClient, SSHClient, SSHConfig
from paramiko.ssh_exception import NoValidConnectionsError, SSHException

from ducktape.errors import DucktapeError
from ducktape.utils.http_utils import HttpMixin
from ducktape.utils.util import wait_until


def check_ssh(method: Callable) -> Callable:
    """Decorator for RemoteAccount methods that perform SSH operations.

    On SSH-layer failures (paramiko exceptions or socket errors) it invokes any
    user-supplied diagnostic callbacks registered on the account
    (``_custom_ssh_exception_checks``), passing each ``(exception, account)``,
    then re-raises the original exception unchanged.
    """
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        except (SSHException, NoValidConnectionsError, socket.error) as e:
            if self._custom_ssh_exception_checks:
                self._log(logging.DEBUG, "caught ssh error", exc_info=True)
                self._log(logging.DEBUG, "starting ssh checks:")
                self._log(
                    logging.DEBUG,
                    "\n".join(repr(f) for f in self._custom_ssh_exception_checks),
                )
                for func in self._custom_ssh_exception_checks:
                    # Callbacks are diagnostic only; the exception is always re-raised.
                    func(e, self)
            raise

    return wrapper


class RemoteAccountSSHConfig(object):
    def __init__(
        self,
        host: Optional[str] = None,
        hostname: Optional[str] = None,
        user: Optional[str] = None,
        port: Optional[int] = None,
        password: Optional[str] = None,
        identityfile: Optional[str] = None,
        connecttimeout: Optional[Union[int, float]] = None,
        **kwargs,
    ) -> None:
        """Wrapper for ssh configs used by ducktape to connect to remote machines.

        The fields in this class are lowercase versions of a small selection of ssh config properties
        (see man page: "man ssh_config")

        :param host: the "Host" alias from the ssh config (may differ from hostname).
        :param hostname: the address actually connected to; defaults to "localhost".
        :param port: ssh port; defaults to 22 and is coerced to int.
        :param connecttimeout: TCP connect timeout in seconds; None means use the default TCP timeout.

        Unrecognized ssh config properties are accepted (and ignored) via **kwargs.
        """
        self.host = host
        self.hostname: str = hostname or "localhost"
        self.user = user
        # Coerce to int since values parsed from ssh config text arrive as strings.
        self.port = port or 22
        self.port = int(self.port)
        self.password = password
        self.identityfile = identityfile

        # None is default, and it means default TCP timeout will be used.
        self.connecttimeout = int(connecttimeout) if connecttimeout is not None else None

    @staticmethod
    def from_string(config_str):
        """Construct RemoteAccountSSHConfig object from a string that looks like

            Host the-host
              Hostname the-hostname
              Port 22
              User ubuntu
              IdentityFile /path/to/key
        """
        config = SSHConfig()
        config.parse(config_str.split("\n"))
        hostnames = config.get_hostnames()
        # paramiko always includes the wildcard entry; we only want the concrete host.
        if "*" in hostnames:
            hostnames.remove("*")
        assert len(hostnames) == 1, "Expected hostnames to have single entry: %s" % hostnames
        host = hostnames.pop()
        config_dict = config.lookup(host)

        if config_dict.get("identityfile") is not None:
            # paramiko.SSHConfig parses this in as a list, but we only want a single string
            config_dict["identityfile"] = config_dict["identityfile"][0]
        return RemoteAccountSSHConfig(host, **config_dict)

    def to_json(self):
        # The instance dict contains only JSON-serializable scalars.
        return self.__dict__

    def __repr__(self):
        return str(self.to_json())

    def __eq__(self, other):
        return other and other.__dict__ == self.__dict__

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))


class RemoteAccountError(DucktapeError):
    """This exception is raised when an attempted action on a remote node fails."""

    def __init__(self, account, msg):
        # Keep only the string form of the account so the error is picklable/loggable.
        self.account_str = str(account)
        self.msg = msg

    def __str__(self):
        return "%s: %s" % (self.account_str, self.msg)


class RemoteCommandError(RemoteAccountError):
    """This exception is raised when a process run by ssh*() returns a non-zero exit status."""

    def __init__(self, account, cmd, exit_status, msg):
        self.account_str = str(account)
        self.exit_status = exit_status
        self.cmd = cmd
        self.msg = msg

    def __str__(self):
        msg = "%s: Command '%s' returned non-zero exit status %d." % (
            self.account_str,
            self.cmd,
            self.exit_status,
        )
        if self.msg:
            msg += " Remote error message: %s" % self.msg
        return msg


class RemoteAccount(HttpMixin):
    """RemoteAccount is the heart of interaction with cluster nodes,
    and every allocated cluster node has a reference to an instance of RemoteAccount.
It wraps metadata such as ssh configs, and provides methods for file system manipulation and shell commands. Each operating system has its own RemoteAccount implementation. The node_type attribute stores the classification label from the cluster configuration, enabling type-aware node allocation. """ def __init__( self, ssh_config: RemoteAccountSSHConfig, externally_routable_ip: Optional[str] = None, node_type: Optional[str] = None, logger: Optional[logging.Logger] = None, ssh_exception_checks: List[Callable] = [], ) -> None: # Instance of RemoteAccountSSHConfig - use this instead of a dict, because we need the entire object to # be hashable self.ssh_config = ssh_config # We don't want to rely on the hostname (e.g. 'worker1') having been added to the driver host's /etc/hosts file. # But that means we need to distinguish between the hostname and the value of hostname we use for SSH commands. # We try to satisfy all use cases and keep things simple by # a) storing the hostname the user probably expects (the "Host" value in .ssh/config) # b) saving the real value we use for running the SSH command self.hostname = ssh_config.host self.ssh_hostname = ssh_config.hostname self.user = ssh_config.user self.externally_routable_ip = externally_routable_ip self.node_type = node_type # Node type label (e.g., "large", "small") self._logger = logger self.os: Optional[str] = None self._ssh_client: Optional[SSHClient] = None self._sftp_client: Optional[SFTPClient] = None self._custom_ssh_exception_checks = ssh_exception_checks @property def operating_system(self) -> Optional[str]: return self.os @property def logger(self): if self._logger: return self._logger else: return logging.getLogger(__name__) @logger.setter def logger(self, logger): self._logger = logger def _log(self, level, msg, *args, **kwargs): msg = "%s: %s" % (str(self), msg) self.logger.log(level, msg, *args, **kwargs) @check_ssh def _set_ssh_client(self): client = SSHClient() 
client.set_missing_host_key_policy(IgnoreMissingHostKeyPolicy()) self._log(logging.DEBUG, "ssh_config: %s" % str(self.ssh_config)) client.connect( hostname=self.ssh_config.hostname, port=self.ssh_config.port, username=self.ssh_config.user, password=self.ssh_config.password, key_filename=self.ssh_config.identityfile, look_for_keys=False, timeout=self.ssh_config.connecttimeout, ) if self._ssh_client: self._ssh_client.close() self._ssh_client = client self._set_sftp_client() @property def ssh_client(self): if self._ssh_client and self._ssh_client.get_transport() and self._ssh_client.get_transport().is_active(): try: transport = self._ssh_client.get_transport() transport.send_ignore() except Exception as e: self._log( logging.DEBUG, "exception getting ssh_client (creating new client): %s" % str(e), ) self._set_ssh_client() else: self._set_ssh_client() return self._ssh_client def _set_sftp_client(self): if self._sftp_client: self._sftp_client.close() self._sftp_client = self.ssh_client.open_sftp() @property def sftp_client(self): if not self._sftp_client: self._set_sftp_client() else: self.ssh_client # test connection return self._sftp_client def close(self) -> None: """Close/release any outstanding network connections to remote account.""" if self._ssh_client: self._ssh_client.close() self._ssh_client = None if self._sftp_client: self._sftp_client.close() self._sftp_client = None def __str__(self): r = "" if self.user: r += self.user + "@" r += self.hostname return r def __repr__(self): return str(self.__dict__) def __eq__(self, other): return other is not None and self.__dict__ == other.__dict__ def __hash__(self): return hash(tuple(sorted(self.__dict__.items()))) def wait_for_http_service(self, port, headers, timeout=20, path="/"): """Wait until this service node is available/awake.""" url = "http://%s:%s%s" % (self.externally_routable_ip, str(port), path) err_msg = ( "Timed out trying to contact service on %s. 
" % url + "Either the service failed to start, or there is a problem with the url." ) wait_until( lambda: self._can_ping_url(url, headers), timeout_sec=timeout, backoff_sec=0.25, err_msg=err_msg, ) def _can_ping_url(self, url, headers): """See if we can successfully issue a GET request to the given url.""" try: self.http_request(url, "GET", None, headers, timeout=0.75) return True except Exception: return False def available(self): # TODO: https://github.com/confluentinc/ducktape/issues/339 # try: # self.ssh_client # except Exception: # return False # else: # return True # finally: # self.close() return True @check_ssh def ssh(self, cmd, allow_fail=False): """Run the given command on the remote host, and block until the command has finished running. :param cmd: The remote ssh command :param allow_fail: If True, ignore nonzero exit status of the remote command, else raise an ``RemoteCommandError`` :return: The exit status of the command. :raise RemoteCommandError: If allow_fail is False and the command returns a non-zero exit status """ self._log(logging.DEBUG, "Running ssh command: %s" % cmd) client = self.ssh_client stdin, stdout, stderr = client.exec_command(cmd) # Unfortunately we need to read over the channel to ensure that recv_exit_status won't hang. See: # http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.recv_exit_status stdout.read() exit_status = stdout.channel.recv_exit_status() try: if exit_status != 0: if not allow_fail: raise RemoteCommandError(self, cmd, exit_status, stderr.read()) else: self._log( logging.DEBUG, "Running ssh command '%s' exited with status %d and message: %s" % (cmd, exit_status, stderr.read()), ) finally: stdin.close() stdout.close() stderr.close() return exit_status @check_ssh def ssh_capture( self, cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None, ): """Run the given command asynchronously via ssh, and return an SSHOutputIter object. 
        Does *not* block

        :param cmd: The remote ssh command
        :param allow_fail: If True, ignore nonzero exit status of the remote command,
               else raise an ``RemoteCommandError``
        :param callback: If set, the iterator returns ``callback(line)`` for each line of output instead of the
               raw output
        :param combine_stderr: If True, return output from both stderr and stdout of the remote process.
        :param timeout_sec: Set timeout on blocking reads/writes. Default None. For more details see
            http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.settimeout

        :return SSHOutputIter: object which allows iteration through each line of output.
        :raise RemoteCommandError: If ``allow_fail`` is False and the command returns a non-zero exit status
        """
        self._log(logging.DEBUG, "Running ssh command: %s" % cmd)

        client = self.ssh_client
        chan = client.get_transport().open_session(timeout=timeout_sec)
        chan.settimeout(timeout_sec)
        chan.exec_command(cmd)
        chan.set_combine_stderr(combine_stderr)
        stdin = chan.makefile("wb", -1)  # set bufsize to -1
        stdout = chan.makefile("r", -1)
        stderr = chan.makefile_stderr("r", -1)

        def output_generator():
            # Stream lines lazily; exit-status checking and channel cleanup happen
            # only after the remote output is fully consumed.
            for line in iter(stdout.readline, ""):
                if callback is None:
                    yield line
                else:
                    yield callback(line)
            try:
                exit_status = stdout.channel.recv_exit_status()
                if exit_status != 0:
                    if not allow_fail:
                        raise RemoteCommandError(self, cmd, exit_status, stderr.read())
                    else:
                        self._log(
                            logging.DEBUG,
                            "Running ssh command '%s' exited with status %d and message: %s"
                            % (cmd, exit_status, stderr.read()),
                        )
            finally:
                stdin.close()
                stdout.close()
                stderr.close()

        return SSHOutputIter(output_generator, stdout)

    @check_ssh
    def ssh_output(self, cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
        """Runs the command via SSH and captures the output, returning it as a string.

        :param cmd: The remote ssh command.
        :param allow_fail: If True, ignore nonzero exit status of the remote command,
               else raise an ``RemoteCommandError``
        :param combine_stderr: If True, return output from both stderr and stdout of the remote process.
        :param timeout_sec: Set timeout on blocking reads/writes. Default None. For more details see
            http://docs.paramiko.org/en/2.0/api/channel.html#paramiko.channel.Channel.settimeout

        :return: The stdout output from the ssh command.
        :raise RemoteCommandError: If ``allow_fail`` is False and the command returns a non-zero exit status
        """
        self._log(logging.DEBUG, "Running ssh command: %s" % cmd)

        client = self.ssh_client
        chan = client.get_transport().open_session(timeout=timeout_sec)
        chan.settimeout(timeout_sec)
        chan.exec_command(cmd)
        chan.set_combine_stderr(combine_stderr)
        stdin = chan.makefile("wb", -1)  # set bufsize to -1
        stdout = chan.makefile("r", -1)
        stderr = chan.makefile_stderr("r", -1)
        try:
            # Read all output before asking for the exit status so recv_exit_status can't hang.
            stdoutdata = stdout.read()
            # stdin/stdout/stderr share the same underlying channel.
            exit_status = stdin.channel.recv_exit_status()
            if exit_status != 0:
                if not allow_fail:
                    raise RemoteCommandError(self, cmd, exit_status, stderr.read())
                else:
                    self._log(
                        logging.DEBUG,
                        "Running ssh command '%s' exited with status %d and message: %s"
                        % (cmd, exit_status, stderr.read()),
                    )
        finally:
            stdin.close()
            stdout.close()
            stderr.close()
        self._log(logging.DEBUG, "Returning ssh command output:\n%s" % stdoutdata)
        return stdoutdata

    def alive(self, pid):
        """Return True if and only if process with given pid is alive."""
        try:
            # "kill -0" signals nothing but fails if the process doesn't exist.
            self.ssh("kill -0 %s" % str(pid), allow_fail=False)
            return True
        except Exception:
            return False

    def signal(self, pid, sig, allow_fail=False):
        """Send signal number ``sig`` to the remote process ``pid``."""
        cmd = "kill -%d %s" % (int(sig), str(pid))
        self.ssh(cmd, allow_fail=allow_fail)

    def kill_process(self, process_grep_str, clean_shutdown=True, allow_fail=False):
        """Kill every remote process whose ps listing matches process_grep_str.

        :param clean_shutdown: if True send SIGTERM, else SIGKILL.
        :param allow_fail: passed through to ``signal`` for each pid.
        """
        cmd = """ps ax | grep -i """ + process_grep_str + """ | grep -v grep | awk '{print $1}'"""
        pids = [pid for pid in self.ssh_capture(cmd, allow_fail=True)]

        if clean_shutdown:
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        for pid in pids:
            self.signal(pid, sig, allow_fail=allow_fail)

    def java_pids(self, match):
        """
        Get all the Java process IDs matching 'match'.

        :param match: The AWK expression to match
        """
        cmd = """jcmd | awk '/%s/ { print $1 }'""" % match
        return [int(pid) for pid in self.ssh_capture(cmd, allow_fail=True)]

    def kill_java_processes(self, match, clean_shutdown=True, allow_fail=False):
        """
        Kill all the java processes matching 'match'.

        :param match: The AWK expression to match
        :param clean_shutdown: True if we should shut down cleanly with SIGTERM;
               false if we should shut down with SIGKILL.
        :param allow_fail: True if we should throw exceptions if the ssh commands fail.
        """
        cmd = """jcmd | awk '/%s/ { print $1 }'""" % match
        pids = [pid for pid in self.ssh_capture(cmd, allow_fail=True)]
        if clean_shutdown:
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL
        for pid in pids:
            self.signal(pid, sig, allow_fail=allow_fail)

    def copy_between(self, src, dest, dest_node):
        """Copy src to dest on dest_node

        :param src: Path to the file or directory we want to copy
        :param dest: The destination path
        :param dest_node: The node to which we want to copy the file/directory

        Note that if src is a directory, this will automatically copy recursively.
        """
        # Stage through a local temp dir: remote(src) -> driver -> remote(dest).
        # TODO: if dest is an existing file, what is the behavior?
        temp_dir = tempfile.mkdtemp()
        try:
            # TODO: deal with very unlikely case that src_name matches temp_dir name?
            # TODO: I think this actually works
            local_dest = self._re_anchor_basename(src, temp_dir)
            self.copy_from(src, local_dest)
            dest_node.account.copy_to(local_dest, dest)
        finally:
            # Always clean up the staging directory, even on copy failure.
            if os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir)

    def scp_from(self, src, dest, recursive=False):
        """Deprecated alias for ``copy_from``; the recursive flag is ignored."""
        warnings.warn("scp_from is now deprecated. Please use copy_from")
        self.copy_from(src, dest)

    def _re_anchor_basename(self, path, directory):
        """Anchor the basename of path onto the given directory

        Helper for the various copy_* methods.

        :param path: Path to a file or directory. Could be on the driver machine or a worker machine.
        :param directory: Path to a directory. Could be on the driver machine or a worker machine.

        Example::

            path/to/the_basename, another/path/ -> another/path/the_basename
        """
        path_basename = path

        # trim off path separator from end of path
        # this is necessary because os.path.basename of a path ending in a separator is an empty string
        # For example:
        #   os.path.basename("the/path/") == ""
        #   os.path.basename("the/path") == "path"
        if path_basename.endswith(os.path.sep):
            path_basename = path_basename[: -len(os.path.sep)]
        path_basename = os.path.basename(path_basename)

        return os.path.join(directory, path_basename)

    @check_ssh
    def copy_from(self, src, dest):
        """Recursively copy src on the remote host to dest on the driver machine."""
        if os.path.isdir(dest):
            # dest is an existing directory, so assuming src looks like path/to/src_name,
            # in this case we'll copy as:
            #   path/to/src_name -> dest/src_name
            dest = self._re_anchor_basename(src, dest)

        if self.isfile(src):
            self.sftp_client.get(src, dest)
        elif self.isdir(src):
            # we can now assume dest path looks like: path_that_exists/new_directory
            os.mkdir(dest)

            # for obj in `ls src`, if it's a file, copy with copy_file_from, elif its a directory, call again
            for obj in self.sftp_client.listdir(src):
                obj_path = os.path.join(src, obj)

                if self.isfile(obj_path) or self.isdir(obj_path):
                    self.copy_from(obj_path, dest)
                else:
                    # TODO what about uncopyable file types?
                    pass

    def scp_to(self, src, dest, recursive=False):
        """Deprecated alias for ``copy_to``; the recursive flag is ignored."""
        warnings.warn("scp_to is now deprecated. Please use copy_to")
        self.copy_to(src, dest)

    @check_ssh
    def copy_to(self, src, dest):
        """Recursively copy src on the driver machine to dest on the remote host."""
        if self.isdir(dest):
            # dest is an existing directory, so assuming src looks like path/to/src_name,
            # in this case we'll copy as:
            #   path/to/src_name -> dest/src_name
            dest = self._re_anchor_basename(src, dest)

        if os.path.isfile(src):
            # local to remote
            self.sftp_client.put(src, dest)
        elif os.path.isdir(src):
            # we can now assume dest path looks like: path_that_exists/new_directory
            self.mkdir(dest)

            # for obj in `ls src`, if it's a file, copy with copy_file_from, elif its a directory, call again
            for obj in os.listdir(src):
                obj_path = os.path.join(src, obj)
                if os.path.isfile(obj_path) or os.path.isdir(obj_path):
                    self.copy_to(obj_path, dest)
                else:
                    # TODO what about uncopyable file types?
                    pass

    @check_ssh
    def islink(self, path):
        """Return True if path is a symlink on the remote host."""
        try:
            # lstat does NOT follow symlinks, which lets us see the link itself
            path_stat = self.sftp_client.lstat(path)
            return stat.S_ISLNK(path_stat.st_mode)
        except Exception:
            return False

    @check_ssh
    def isdir(self, path):
        """Return True if path is (or is a symlink to) a directory on the remote host."""
        try:
            # stat follows symlinks
            path_stat = self.sftp_client.stat(path)
            return stat.S_ISDIR(path_stat.st_mode)
        except Exception:
            return False

    @check_ssh
    def exists(self, path):
        """Test that the path exists, but don't follow symlinks."""
        try:
            # lstat does not follow symlinks, so a dangling symlink still counts as existing
            self.sftp_client.lstat(path)
            return True
        except IOError:
            return False

    @check_ssh
    def isfile(self, path):
        """Imitates semantics of os.path.isfile

        :param path: Path to the thing to check
        :return: True if path is a file or a symlink to a file, else False.
                 Note False can mean path does not exist.
        """
        try:
            # stat follows symlinks
            path_stat = self.sftp_client.stat(path)
            return stat.S_ISREG(path_stat.st_mode)
        except Exception:
            return False

    def open(self, path, mode="r"):
        """Open a remote file via SFTP; returns a file-like object."""
        return self.sftp_client.open(path, mode)

    @check_ssh
    def create_file(self, path, contents):
        """Create file at path, with the given contents.

        If the path already exists, it will be overwritten.
""" # TODO: what should semantics be if path exists? what actually happens if it already exists? # TODO: what happens if the base part of the path does not exist? with self.sftp_client.open(path, "w") as f: f.write(contents) _DEFAULT_PERMISSIONS = int("755", 8) @check_ssh def mkdir(self, path, mode=_DEFAULT_PERMISSIONS): self.sftp_client.mkdir(path, mode) def mkdirs(self, path, mode=_DEFAULT_PERMISSIONS): self.ssh("mkdir -p %s && chmod %o %s" % (path, mode, path)) def remove(self, path, allow_fail=False): """Remove the given file or directory""" if allow_fail: cmd = "rm -rf %s" % path else: cmd = "rm -r %s" % path self.ssh(cmd, allow_fail=allow_fail) @contextmanager def monitor_log(self, log): """ Context manager that returns an object that helps you wait for events to occur in a log. This checks the size of the log at the beginning of the block and makes a helper object available with convenience methods for checking or waiting for a pattern to appear in the log. This will commonly be used to start a process, then wait for a log message indicating the process is in a ready state. See ``LogMonitor`` for more usage information. """ try: offset = int(self.ssh_output("wc -c %s" % log).split()[0]) except Exception: offset = 0 yield LogMonitor(self, log, offset) class SSHOutputIter(object): """Helper class that wraps around an iterable object to provide has_next() in addition to next()""" def __init__(self, iter_obj_func, channel_file=None): """ :param iter_obj_func: A generator that returns an iterator over stdout from the remote process :param channel_file: A paramiko ``ChannelFile`` object """ self.iter_obj_func = iter_obj_func self.iter_obj = iter_obj_func() self.channel_file = channel_file # sentinel is used as an indicator that there is currently nothing cached # If self.cached is self.sentinel, then next object from ier_obj is not yet cached. 
self.sentinel = object() self.cached = self.sentinel def __iter__(self): return self def next(self): if self.cached is self.sentinel: return next(self.iter_obj) next_obj = self.cached self.cached = self.sentinel return next_obj __next__ = next def has_next(self, timeout_sec=None): """Return True if next(iter_obj) would return another object within timeout_sec, else False. If timeout_sec is None, next(iter_obj) may block indefinitely. """ assert timeout_sec is None or self.channel_file is not None, "should have descriptor to enforce timeout" prev_timeout = None if self.cached is self.sentinel: if self.channel_file is not None: prev_timeout = self.channel_file.channel.gettimeout() # when timeout_sec is None, next(iter_obj) will block indefinitely self.channel_file.channel.settimeout(timeout_sec) try: self.cached = next(self.iter_obj, self.sentinel) except socket.timeout: self.iter_obj = self.iter_obj_func() self.cached = self.sentinel finally: if self.channel_file is not None: # restore preexisting timeout self.channel_file.channel.settimeout(prev_timeout) return self.cached is not self.sentinel class LogMonitor(object): """ Helper class returned by monitor_log. Should be used as:: with remote_account.monitor_log("/path/to/log") as monitor: remote_account.ssh("/command/to/start") monitor.wait_until("pattern.*to.*grep.*for", timeout_sec=5) to run the command and then wait for the pattern to appear in the log. """ def __init__(self, acct, log, offset): self.acct = acct self.log = log self.offset = offset def wait_until(self, pattern, **kwargs): """ Wait until the specified pattern is found in the log, after the initial offset recorded when the LogMonitor was created. 
        Additional keyword args are passed directly to ``ducktape.utils.util.wait_until``
        """
        # tail from the recorded offset (tail -c is 1-based, hence +1) and grep for the pattern;
        # ssh() returns grep's exit status, 0 meaning the pattern was found.
        return wait_until(
            lambda: self.acct.ssh(
                "tail -c +%d %s | grep '%s'" % (self.offset + 1, self.log, pattern),
                allow_fail=True,
            )
            == 0,
            **kwargs,
        )


class IgnoreMissingHostKeyPolicy(MissingHostKeyPolicy):
    """Policy for ignoring missing host keys.

    Many examples show use of AutoAddPolicy, but this clutters up the known_hosts file unnecessarily.
    """

    def missing_host_key(self, client, hostname, key):
        # Accept the connection without recording the host key.
        return



================================================
FILE: ducktape/cluster/vagrant.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import json
import os
import subprocess

from ducktape.json_serializable import DucktapeJSONEncoder

from .json import JsonCluster, make_remote_account
from .remoteaccount import RemoteAccountSSHConfig


class VagrantCluster(JsonCluster):
    """
    An implementation of Cluster that uses a set of VMs created by Vagrant. Because we need hostnames
    that can be advertised, this assumes that the Vagrant VM's name is a routeable hostname on all the hosts.

    - If cluster_file is specified in the constructor's kwargs (i.e.
    passed via command line argument --cluster-file)
        - If cluster_file exists on the filesystem, read cluster info from the file
        - Otherwise, retrieve cluster info via "vagrant ssh-config" from vagrant and write cluster info to cluster_file
    - Otherwise, retrieve cluster info via "vagrant ssh-config" from vagrant
    """

    def __init__(self, *args, make_remote_account_func=make_remote_account, **kwargs) -> None:
        is_read_from_file = False
        self.ssh_exception_checks = kwargs.get("ssh_exception_checks")
        cluster_file = kwargs.get("cluster_file")
        if cluster_file is not None:
            try:
                cluster_json = json.load(open(os.path.abspath(cluster_file)))
                is_read_from_file = True
            except IOError:
                # It is OK if file is not found. Call vagrant ssh-info to read the cluster info.
                pass

        if not is_read_from_file:
            cluster_json = {"nodes": self._get_nodes_from_vagrant(make_remote_account_func)}
        super(VagrantCluster, self).__init__(
            cluster_json,
            *args,
            make_remote_account_func=make_remote_account_func,
            **kwargs,
        )

        # If cluster file is specified but the cluster info is not read from it, write the cluster info into the file
        if not is_read_from_file and cluster_file is not None:
            nodes = [
                {
                    "ssh_config": node_account.ssh_config,
                    "externally_routable_ip": node_account.externally_routable_ip,
                }
                for node_account in self._available_accounts
            ]
            cluster_json["nodes"] = nodes
            with open(cluster_file, "w+") as fd:
                json.dump(
                    cluster_json,
                    fd,
                    cls=DucktapeJSONEncoder,
                    indent=2,
                    separators=(",", ": "),
                    sort_keys=True,
                )

        # Release any ssh clients used in querying the nodes for metadata
        for node_account in self._available_accounts:
            node_account.close()

    def _get_nodes_from_vagrant(self, make_remote_account_func):
        """Query "vagrant ssh-config" and build the node dicts expected by JsonCluster."""
        ssh_config_info, error = self._vagrant_ssh_config()

        nodes = []
        # Entries in the ssh-config output are separated by blank lines.
        node_info_arr = ssh_config_info.split("\n\n")
        node_info_arr = [ninfo.strip() for ninfo in node_info_arr if ninfo.strip()]

        for ninfo in node_info_arr:
            ssh_config = RemoteAccountSSHConfig.from_string(ninfo)

            account = None
            try:
                # A temporary account is created only to discover the node's routable IP.
                account = make_remote_account_func(ssh_config, ssh_exception_checks=self.ssh_exception_checks)
                externally_routable_ip = account.fetch_externally_routable_ip()
            finally:
                if account:
                    account.close()
                    del account
            nodes.append(
                {
                    "ssh_config": ssh_config.to_json(),
                    "externally_routable_ip": externally_routable_ip,
                }
            )

        return nodes

    def _vagrant_ssh_config(self):
        """Run "vagrant ssh-config" and return its (stdout, stderr) as text."""
        ssh_config_info, error = subprocess.Popen(
            "vagrant ssh-config",
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            close_fds=True,
            # Force to text mode in py2/3 compatible way
            universal_newlines=True,
        ).communicate()
        return ssh_config_info, error



================================================
FILE: ducktape/cluster/windows_remoteaccount.py
================================================
# Copyright 2014 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import logging
import os

import boto3
import winrm
from botocore.exceptions import ClientError
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA

from ducktape.cluster.consts import WINDOWS
from ducktape.cluster.remoteaccount import RemoteAccount, RemoteCommandError


class WindowsRemoteAccount(RemoteAccount):
    """
    Windows remote accounts are currently only supported in EC2. See ``_setup_winrm()`` for how the WinRM
    password is fetched, which is currently specific to AWS.

    The Windows AMI needs to also have an SSH server running to support SSH commands, SCP, and rsync.
""" WINRM_USERNAME = "Administrator" def __init__(self, *args, **kwargs): super(WindowsRemoteAccount, self).__init__(*args, **kwargs) self.os = WINDOWS self._winrm_client = None @property def winrm_client(self): # TODO: currently this only works in AWS EC2 provisioned by Vagrant. Add support for other environments. # check if winrm has already been setup. If yes, return immediately. if self._winrm_client: return self._winrm_client # first get the instance ID of this machine from Vagrant's metadata. ec2_instance_id_path = os.path.join(os.getcwd(), ".vagrant", "machines", self.ssh_config.host, "aws", "id") instance_id_file = None try: instance_id_file = open(ec2_instance_id_path, "r") ec2_instance_id = instance_id_file.read().strip() if not ec2_instance_id or ec2_instance_id == "": raise Exception except Exception: raise Exception("Could not extract EC2 instance ID from local file: %s" % ec2_instance_id_path) finally: if instance_id_file: instance_id_file.close() self._log(logging.INFO, "Found EC2 instance id: %s" % ec2_instance_id) # then get the encrypted password. client = boto3.client("ec2") try: response = client.get_password_data(InstanceId=ec2_instance_id) except ClientError as ce: if "InvalidInstanceID.NotFound" in str(ce): raise Exception( "The instance id '%s' couldn't be found. Is the correct AWS region configured?" % ec2_instance_id ) else: raise ce self._log( logging.INFO, "Fetched encrypted winrm password and will decrypt with private key: %s" % self.ssh_config.identityfile, ) # then decrypt the password using the private key. 
key_file = None try: key_file = open(self.ssh_config.identityfile, "r") key = key_file.read() rsa_key = RSA.importKey(key) cipher = PKCS1_v1_5.new(rsa_key) winrm_password = cipher.decrypt(base64.b64decode(response["PasswordData"]), None) self._winrm_client = winrm.Session( self.ssh_config.hostname, auth=(WindowsRemoteAccount.WINRM_USERNAME, winrm_password), ) finally: if key_file: key_file.close() return self._winrm_client def fetch_externally_routable_ip(self, is_aws=None): # EC2 windows machines aren't given an externally routable IP. Use the hostname instead. return self.ssh_config.hostname def run_winrm_command(self, cmd, allow_fail=False): self._log(logging.DEBUG, "Running winrm command: %s" % cmd) result = self.winrm_client.run_cmd(cmd) if not allow_fail and result.status_code != 0: raise RemoteCommandError(self, cmd, result.status_code, result.std_err) return result.status_code ================================================ FILE: ducktape/command_line/__init__.py ================================================ ================================================ FILE: ducktape/command_line/defaults.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os


class ConsoleDefaults(object):
    """Default paths, formats, and constants used by the ducktape command-line tool."""

    # Directory for project-specific ducktape configs and runtime data
    DUCKTAPE_DIR = ".ducktape"

    # Store various bookkeeping data here
    METADATA_DIR = os.path.join(DUCKTAPE_DIR, "metadata")

    # Default path, relative to current project directory, to the project's ducktape config file
    PROJECT_CONFIG_FILE = os.path.join(DUCKTAPE_DIR, "config")

    # Default path to the user-specific config file
    USER_CONFIG_FILE = os.path.join("~", DUCKTAPE_DIR, "config")

    # Default cluster implementation
    CLUSTER_TYPE = "ducktape.cluster.vagrant.VagrantCluster"

    # Default path, relative to current project directory, to the cluster file
    CLUSTER_FILE = os.path.join(DUCKTAPE_DIR, "cluster.json")

    # Track the last-used session_id here
    SESSION_ID_FILE = os.path.join(METADATA_DIR, "session_id")

    # Folders with test reports, logs, etc all are created in this directory
    RESULTS_ROOT_DIRECTORY = "./results"

    # logging.Formatter format strings for the session-level and per-test log files
    SESSION_LOG_FORMATTER = "[%(levelname)s:%(asctime)s]: %(message)s"
    TEST_LOG_FORMATTER = "[%(levelname)-5s - %(asctime)s - %(module)s - %(funcName)s - lineno:%(lineno)s]: %(message)s"

    # Log this to indicate a test is misbehaving to help end user find which test is at fault
    BAD_TEST_MESSAGE = "BAD_TEST"

    # Ducktape will try to pick a random open port in the range [TEST_DRIVER_PORT_MIN, TEST_DRIVER_PORT_MAX]
    # this range is *inclusive*
    TEST_DRIVER_MIN_PORT = 5556
    TEST_DRIVER_MAX_PORT = 5656



================================================
FILE: ducktape/command_line/main.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import importlib import json import os import random import sys import traceback from ducktape.command_line.defaults import ConsoleDefaults from ducktape.command_line.parse_args import parse_args from ducktape.tests.loader import LoaderException, TestLoader from ducktape.tests.loggermaker import close_logger from ducktape.tests.reporter import ( FailedTestSymbolReporter, HTMLSummaryReporter, JSONReporter, JUnitReporter, SimpleFileSummaryReporter, SimpleStdoutSummaryReporter, ) from ducktape.tests.runner import TestRunner from ducktape.tests.session import ( SessionContext, SessionLoggerMaker, generate_results_dir, generate_session_id, ) from ducktape.utils import persistence from ducktape.utils.local_filesystem_utils import mkdir_p from ducktape.utils.util import load_function def get_user_defined_globals(globals_str): """Parse user-defined globals into an immutable dict using globals_str :param globals_str Either a file, in which case, attempt to open the file and parse the contents as JSON, or a JSON string representing a JSON object. The parsed JSON must represent a collection of key-value pairs, i.e. a python dict. :return dict containing user-defined global variables """ if globals_str is None: return persistence.make_dict() from_file = False if os.path.isfile(globals_str): # The string appears to be a file, so try loading JSON from file # This may raise an IOError if the file can't be read or a ValueError if the contents of the file # cannot be parsed. 
user_globals = json.loads(open(globals_str, "r").read()) from_file = True else: try: # try parsing directly as json if it doesn't seem to be a file user_globals = json.loads(globals_str) except ValueError as ve: message = str(ve) message += "\nglobals parameter %s is neither valid JSON nor a valid path to a JSON file." % globals_str raise ValueError(message) # Now check that the parsed JSON is a dictionary if not isinstance(user_globals, dict): if from_file: message = "The JSON contained in file %s must parse to a dict. " % globals_str else: message = "JSON string referred to by globals parameter must parse to a dict. " message += "I.e. the contents of the JSON must be an object, not an array or primitive. " message += "Instead found %s, which parsed to %s" % ( str(user_globals), type(user_globals), ) raise ValueError(message) # create the immutable dict return persistence.make_dict(**user_globals) def setup_results_directory(new_results_dir): """Make directory in which results will be stored""" if os.path.exists(new_results_dir): raise Exception("A file or directory at %s already exists. Exiting without overwriting." % new_results_dir) mkdir_p(new_results_dir) def update_latest_symlink(results_root, new_results_dir): """Create or update symlink "latest" which points to the new test results directory""" latest_test_dir = os.path.join(results_root, "latest") if os.path.islink(latest_test_dir): os.unlink(latest_test_dir) os.symlink(new_results_dir, latest_test_dir) def main(): """Ducktape entry point. 
This contains top level logic for ducktape command-line program which does the following: Discover tests Initialize cluster for distributed services Run tests Report a summary of all results """ args_dict = parse_args(sys.argv[1:]) injected_args = None if args_dict["parameters"]: try: injected_args = json.loads(args_dict["parameters"]) except ValueError as e: print("parameters are not valid json: " + str(e)) sys.exit(1) args_dict["globals"] = get_user_defined_globals(args_dict.get("globals")) # Make .ducktape directory where metadata such as the last used session_id is stored if not os.path.isdir(ConsoleDefaults.METADATA_DIR): os.makedirs(ConsoleDefaults.METADATA_DIR) # Generate a shared 'global' identifier for this test run and create the directory # in which all test results will be stored session_id = generate_session_id(ConsoleDefaults.SESSION_ID_FILE) results_dir = generate_results_dir(args_dict["results_root"], session_id) setup_results_directory(results_dir) session_context = SessionContext(session_id=session_id, results_dir=results_dir, **args_dict) session_logger = SessionLoggerMaker(session_context).logger for k, v in args_dict.items(): session_logger.debug("Configuration: %s=%s", k, v) # Discover and load tests to be run loader = TestLoader( session_context, session_logger, repeat=args_dict["repeat"], injected_args=injected_args, subset=args_dict["subset"], subsets=args_dict["subsets"], historical_report=args_dict["historical_report"], ) try: tests = loader.load(args_dict["test_path"], excluded_test_symbols=args_dict["exclude"]) except LoaderException as e: print("Failed while trying to discover tests: {}".format(e)) sys.exit(1) expected_test_count = len(tests) session_logger.info(f"Discovered {expected_test_count} tests to run") if args_dict["collect_only"]: print("Collected %d tests:" % expected_test_count) for test in tests: print(" " + str(test)) sys.exit(0) if args_dict["collect_num_nodes"]: total_nodes = sum(test.expected_num_nodes for test in 
tests) print(total_nodes) sys.exit(0) if args_dict["sample"]: print("Running a sample of %d tests" % args_dict["sample"]) try: tests = random.sample(tests, args_dict["sample"]) except ValueError as e: if args_dict["sample"] > len(tests): print( "sample size %d greater than number of tests %d; running all tests" % (args_dict["sample"], len(tests)) ) else: print("invalid sample size (%s), running all tests" % e) # Initializing the cluster is slow, so do so only if # tests are sure to be run try: (cluster_mod_name, cluster_class_name) = args_dict["cluster"].rsplit(".", 1) cluster_mod = importlib.import_module(cluster_mod_name) cluster_class = getattr(cluster_mod, cluster_class_name) cluster_kwargs = {"cluster_file": args_dict["cluster_file"]} checker_function_names = args_dict["ssh_checker_function"] if checker_function_names: checkers = [load_function(func_path) for func_path in checker_function_names] if checkers: cluster_kwargs["ssh_exception_checks"] = checkers cluster = cluster_class(**cluster_kwargs) for ctx in tests: # Note that we're attaching a reference to cluster # only after test context objects have been instantiated ctx.cluster = cluster except Exception: print("Failed to load cluster: ", str(sys.exc_info()[0])) print(traceback.format_exc(limit=16)) sys.exit(1) # Run the tests deflake_num = args_dict["deflake"] if deflake_num < 1: session_logger.warning("specified number of deflake runs specified to be less than 1, running without deflake.") deflake_num = max(1, deflake_num) runner = TestRunner(cluster, session_context, session_logger, tests, deflake_num) test_results = runner.run_all_tests() # Report results reporters = [ SimpleStdoutSummaryReporter(test_results), SimpleFileSummaryReporter(test_results), HTMLSummaryReporter(test_results, expected_test_count), JSONReporter(test_results), JUnitReporter(test_results), FailedTestSymbolReporter(test_results), ] for r in reporters: r.report() update_latest_symlink(args_dict["results_root"], results_dir) if 
len(test_results) < expected_test_count: session_logger.warning( f"All tests were NOT run. Expected {expected_test_count} tests, only {len(test_results)} were run." ) close_logger(session_logger) sys.exit(1) close_logger(session_logger) if not test_results.get_aggregate_success(): # Non-zero exit if at least one test failed sys.exit(1) ================================================ FILE: ducktape/command_line/parse_args.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import print_function import argparse import itertools import os import sys from typing import Dict, List, Optional, Union from ducktape.command_line.defaults import ConsoleDefaults from ducktape.utils.util import ducktape_version def create_ducktape_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Discover and run your tests") parser.add_argument( "test_path", metavar="test_path", type=str, nargs="*", default=[os.getcwd()], help="One or more test identifiers or test suite paths to execute", ) parser.add_argument( "--exclude", type=str, nargs="*", default=None, help="one or more space-delimited strings indicating which tests to exclude", ) parser.add_argument( "--collect-only", action="store_true", help="display collected tests, but do not run.", ) parser.add_argument( "--collect-num-nodes", action="store_true", help="display total number of nodes requested by all tests, but do not run anything.", ) parser.add_argument("--debug", action="store_true", help="pipe more verbose test output to stdout.") parser.add_argument( "--config-file", action="store", default=ConsoleDefaults.USER_CONFIG_FILE, help="path to project-specific configuration file.", ) parser.add_argument( "--compress", action="store_true", help="compress remote logs before collection.", ) parser.add_argument( "--cluster", action="store", default=ConsoleDefaults.CLUSTER_TYPE, help="cluster class to use to allocate nodes for tests.", ) parser.add_argument( "--default-num-nodes", action="store", type=int, default=None, help="Global hint for cluster usage. A test without the @cluster annotation will " "default to this value for expected cluster usage.", ) parser.add_argument( "--cluster-file", action="store", default=None, help="path to a json file which provides information needed to initialize a json cluster. 
" "The file is used to read/write cached cluster info if " "cluster is ducktape.cluster.vagrant.VagrantCluster.", ) parser.add_argument( "--results-root", action="store", default=ConsoleDefaults.RESULTS_ROOT_DIRECTORY, help="path to custom root results directory. Running ducktape with this root " "specified will result in new test results being stored in a subdirectory of " "this root directory.", ) parser.add_argument("--exit-first", action="store_true", help="exit after first failure") parser.add_argument( "--no-teardown", action="store_true", help="don't kill running processes or remove log files when a test has finished running. " "This is primarily useful for test developers who want to interact with running " "services after a test has run.", ) parser.add_argument("--version", action="store_true", help="display version") parser.add_argument( "--parameters", action="store", help="inject these arguments into the specified test(s). Specify parameters as a JSON string.", ) parser.add_argument( "--globals", action="store", help="user-defined globals go here. 
" "This can be a file containing a JSON object, or a string representing a JSON object.", ) parser.add_argument( "--max-parallel", action="store", type=int, default=1, help="Upper bound on number of tests run simultaneously.", ) parser.add_argument( "--repeat", action="store", type=int, default=1, help="Use this flag to repeat all discovered tests the given number of times.", ) parser.add_argument( "--subsets", action="store", type=int, default=1, help="Number of subsets of tests to statically break the tests into to allow for parallel " "execution without coordination between test runner processes.", ) parser.add_argument( "--subset", action="store", type=int, default=0, help="Which subset of the tests to run, based on the breakdown using the parameter for --subsets", ) parser.add_argument( "--historical-report", action="store", type=str, help="URL of a JSON report file containing stats from a previous test run. If specified, " "this will be used when creating subsets of tests to divide evenly by total run time " "instead of by number of tests.", ) parser.add_argument( "--skip-nodes-allocation", action="store_true", help="Use this flag to skip allocating " "nodes for services. Can be used when running specific tests on a running platform", ) parser.add_argument( "--sample", action="store", type=int, help="The size of a random test sample to run", ) parser.add_argument( "--fail-bad-cluster-utilization", action="store_true", help="Fail a test if the test declared that it needs more nodes than it actually used. " "E.g. if the test had `@cluster(num_nodes=10)` annotation, " "but never used more than 5 nodes during its execution.", ) parser.add_argument( "--fail-greedy-tests", action="store_true", help="Fail a test if it has no @cluster annotation " "or if @cluster annotation is empty. 
" "You can still specify 0-sized cluster explicitly using either num_nodes=0 " "or cluster_spec=ClusterSpec.empty()", ) parser.add_argument( "--test-runner-timeout", action="store", type=int, default=1800000, help="Amount of time in milliseconds between test communicating between the test runner" " before a timeout error occurs. Default is 30 minutes", ) ( parser.add_argument( "--ssh-checker-function", action="store", type=str, nargs="+", help="Python module path(s) to a function that takes an exception and a remote account" " that will be called when an ssh error occurs, this can give some " "validation or better logging when an ssh error occurs. Specify any " "number of module paths after this flag to be called.", ), ) parser.add_argument( "--deflake", action="store", type=int, default=1, help="the number of times a failed test should be ran in total (including its initial run) " "to determine flakyness. When not present, deflake will not be used, " "and a test will be marked as either passed or failed. " "When enabled tests will be marked as flaky if it passes on any of the reruns", ) parser.add_argument( "--enable-jvm-logs", action="store_true", help="Enable automatic JVM log collection for Java-based services (Kafka, ZooKeeper, Connect, etc.)", ) return parser def get_user_config_file(args: List[str]) -> str: """Helper function to get specified (or default) user config file. :return Filename which is the path to the config file. """ parser = create_ducktape_parser() config_file = vars(parser.parse_args(args))["config_file"] assert config_file is not None return os.path.expanduser(config_file) def config_file_to_args_list(config_file): """Parse in contents of config file, and return a list of command-line options parseable by the ducktape parser. 
Skip whitespace lines and comments (lines prefixed by "#") """ if config_file is None: raise RuntimeError("config_file is None") # Read in configuration, but ignore empty lines and comments config_lines = [ line for line in open(config_file).readlines() if (len(line.strip()) > 0 and line.lstrip()[0] != "#") ] return list(itertools.chain(*[line.split() for line in config_lines])) def parse_non_default_args(parser: argparse.ArgumentParser, defaults: dict, args: list) -> dict: """ Parse and remove default args from a list of args, and return the dict of the parsed args. """ parsed_args = vars(parser.parse_args(args)) # remove defaults for key, value in defaults.items(): if parsed_args[key] == value: del parsed_args[key] return parsed_args def parse_args( args: List[str], ) -> Dict[str, Optional[Union[List[str], bool, str, int]]]: """Parse in command-line and config file options. Command line arguments have the highest priority, then user configs specified in ~/.ducktape/config, and finally project configs specified in /config. """ parser = create_ducktape_parser() if len(args) == 0: # Show help if there are no arguments parser.print_help() sys.exit(0) # Collect arguments from project config file, user config file, and command line # later arguments supersede earlier arguments parsed_args_list = [] # First collect all the default values defaults = vars(parser.parse_args([])) project_config_file = ConsoleDefaults.PROJECT_CONFIG_FILE # Load all non-default args from project config file. if os.path.exists(project_config_file): parsed_args_list.append(parse_non_default_args(parser, defaults, config_file_to_args_list(project_config_file))) # Load all non-default args from user config file. user_config_file = get_user_config_file(args) if os.path.exists(user_config_file): parsed_args_list.append(parse_non_default_args(parser, defaults, config_file_to_args_list(user_config_file))) # Load all non-default args from the command line. 
parsed_args_list.append(parse_non_default_args(parser, defaults, args)) # Don't need to copy, done with the defaults dict. # Start with the default args, and layer on changes. parsed_args_dict = defaults for parsed_args in parsed_args_list: parsed_args_dict.update(parsed_args) if parsed_args_dict["version"]: print(ducktape_version()) sys.exit(0) return parsed_args_dict ================================================ FILE: ducktape/errors.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. class DucktapeError(RuntimeError): pass class TimeoutError(DucktapeError): pass ================================================ FILE: ducktape/json_serializable.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from json import JSONEncoder


class DucktapeJSONEncoder(JSONEncoder):
    """JSONEncoder that delegates to an object's to_json() method when available."""

    def default(self, obj):
        if hasattr(obj, "to_json"):
            # to_json may return a dict or array or other naturally json serializable object
            # this allows serialization to work correctly on nested items
            return obj.to_json()
        else:
            # Let the base class default method raise the TypeError
            return JSONEncoder.default(self, obj)



================================================
FILE: ducktape/jvm_logging.py
================================================
# Copyright 2024 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
JVM logging support for Ducktape.

This module provides automatic JVM log collection for Java-based services
without requiring any code changes to services or tests.

Please Note: We are prepending JVM options to the SSH cmd. If any option is injected again as part of cmd,
it will override the option injected from this module.
For example, if a test or service injects its own -Xlog options, it may override the GC logging
options injected by this module. In practice, Services should work as expected.
"""

import os
import types


class JVMLogger:
    """Handles JVM logging configuration and enablement for services."""

    def __init__(self, log_dir="/mnt/jvm_logs"):
        """
        Initialize JVM logger.

        :param log_dir: Directory for JVM logs on worker nodes
        """
        self.log_dir = log_dir

    def enable_for_service(self, service):
        """
        Enable JVM logging for a service instance. Adds JVM log definitions and helper methods
        to the service. Wraps start_node so that when nodes are started, JVM logging is set up
        and JVM options are injected via SSH, and wraps clean_node to clean up JVM logs after
        node cleanup.

        :param service: Service instance to enable JVM logging for
        """
        # Store reference to JVMLogger instance for use in closures
        jvm_logger = self

        # Add JVM log definitions
        jvm_logs = {
            "jvm_gc_log": {"path": os.path.join(jvm_logger.log_dir, "gc.log"), "collect_default": True},
            "jvm_stdout_stderr": {"path": os.path.join(jvm_logger.log_dir, "jvm.log"), "collect_default": True},
            "jvm_heap_dump": {
                "path": os.path.join(jvm_logger.log_dir, "heap_dump.hprof"),
                "collect_default": False,  # Only on failure
            },
        }

        # Initialize logs dict if needed
        if not hasattr(service, "logs") or service.logs is None:
            service.logs = {}

        # Merge with existing logs
        service.logs.update(jvm_logs)

        # Add helper methods
        service.JVM_LOG_DIR = jvm_logger.log_dir
        service.jvm_options = lambda node: jvm_logger._get_jvm_options()
        service.setup_jvm_logging = lambda node: jvm_logger._setup_on_node(node)
        service.clean_jvm_logs = lambda node: jvm_logger._cleanup_on_node(node)

        # Wrap start_node to automatically setup JVM logging and wrap SSH
        original_start_node = service.start_node

        def wrapped_start_node(self, node, *args, **kwargs):
            jvm_logger._setup_on_node(node)  # Setup JVM log directory

            # Wrap all SSH methods to inject JDK_JAVA_OPTIONS, wrap once and keep active for entire service lifecycle
            if not hasattr(node.account, "original_ssh"):
                # Save the unwrapped methods so wrapped_clean_node can restore them later.
                original_ssh = node.account.ssh
                original_ssh_capture = node.account.ssh_capture
                original_ssh_output = node.account.ssh_output
                node.account.original_ssh = original_ssh
                node.account.original_ssh_capture = original_ssh_capture
                node.account.original_ssh_output = original_ssh_output

                jvm_opts = jvm_logger._get_jvm_options()
                # Use env command and append to existing JDK_JAVA_OPTIONS
                env_prefix = f'env JDK_JAVA_OPTIONS="${{JDK_JAVA_OPTIONS:-}} {jvm_opts}" '

                def wrapped_ssh(cmd, allow_fail=False):
                    return original_ssh(env_prefix + cmd, allow_fail=allow_fail)

                def wrapped_ssh_capture(cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None):
                    return original_ssh_capture(
                        env_prefix + cmd,
                        allow_fail=allow_fail,
                        callback=callback,
                        combine_stderr=combine_stderr,
                        timeout_sec=timeout_sec,
                    )

                def wrapped_ssh_output(cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
                    return original_ssh_output(
                        env_prefix + cmd, allow_fail=allow_fail, combine_stderr=combine_stderr, timeout_sec=timeout_sec
                    )

                node.account.ssh = wrapped_ssh
                node.account.ssh_capture = wrapped_ssh_capture
                node.account.ssh_output = wrapped_ssh_output

            return original_start_node(node, *args, **kwargs)

        # Bind the wrapper function to the service object
        service.start_node = types.MethodType(wrapped_start_node, service)

        # Wrap clean_node to cleanup JVM logs and restore SSH methods
        original_clean_node = service.clean_node

        def wrapped_clean_node(self, node, *args, **kwargs):
            result = original_clean_node(node, *args, **kwargs)
            jvm_logger._cleanup_on_node(node)

            # Restore original SSH methods
            if hasattr(node.account, "original_ssh"):
                node.account.ssh = node.account.original_ssh
                node.account.ssh_capture = node.account.original_ssh_capture
                node.account.ssh_output = node.account.original_ssh_output
                del node.account.original_ssh
                del node.account.original_ssh_capture
                del node.account.original_ssh_output

            return result

        # Bind the wrapper function to the service instance
        service.clean_node = types.MethodType(wrapped_clean_node, service)

    def _get_jvm_options(self):
        """Generate JVM options string for logging."""
        gc_log = os.path.join(self.log_dir, "gc.log")
        heap_dump = os.path.join(self.log_dir, "heap_dump.hprof")
        error_log = os.path.join(self.log_dir, "hs_err_pid%p.log")
        jvm_log = os.path.join(self.log_dir, "jvm.log")

        jvm_logging_opts = [
            "-Xlog:disable",  # Suppress all default JVM console logging to prevent output pollution
            f"-Xlog:gc*:file={gc_log}:time,uptime,level,tags",  # GC activity with timestamps
            "-XX:+HeapDumpOnOutOfMemoryError",  # Generate heap dump on OOM
            f"-XX:HeapDumpPath={heap_dump}",  # Heap dump file location
            f"-Xlog:safepoint=info:file={jvm_log}:time,uptime,level,tags",  # Safepoint pause events
            f"-Xlog:class+load=info:file={jvm_log}:time,uptime,level,tags",  # Class loading events
            f"-XX:ErrorFile={error_log}",  # Fatal error log location (JVM crashes)
            "-XX:NativeMemoryTracking=summary",  # Track native memory usage
            f"-Xlog:jit+compilation=info:file={jvm_log}:time,uptime,level,tags",  # JIT compilation events
        ]
        return " ".join(jvm_logging_opts)

    def _setup_on_node(self, node):
        """Create JVM log directory on worker node."""
        node.account.ssh(f"mkdir -p {self.log_dir}")
        node.account.ssh(f"chmod 755 {self.log_dir}")

    def _cleanup_on_node(self, node):
        """Clean JVM logs from worker node."""
        node.account.ssh(f"rm -rf {self.log_dir}", allow_fail=True)



================================================
FILE: ducktape/mark/__init__.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._mark import matrix # NOQA from ._mark import defaults, env, ignore, ignored, is_env, parametrize, parametrized __all__ = [ "matrix", "defaults", "env", "ignore", "ignored", "is_env", "parametrize", "parametrized", ] ================================================ FILE: ducktape/mark/_mark.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import itertools import os from ducktape.errors import DucktapeError class Mark(object): """Common base class for "marks" which may be applied to test functions/methods.""" @staticmethod def mark(fun, mark): """Attach a tag indicating that fun has been marked with the given mark Marking fun updates it with two attributes: - marks: a list of mark objects applied to the function. These may be strings or objects subclassing Mark we use a list because in some cases, it is useful to preserve ordering. 
- mark_names: a set of names of marks applied to the function """ # Update fun.marks if hasattr(fun, "marks"): fun.marks.append(mark) else: fun.__dict__["marks"] = [mark] # Update fun.mark_names if hasattr(fun, "mark_names"): fun.mark_names.add(mark.name) else: fun.__dict__["mark_names"] = {mark.name} @staticmethod def marked(f, mark): if f is None: return False if not hasattr(f, "mark_names"): return False return mark.name in f.mark_names @staticmethod def clear_marks(f): if not hasattr(f, "marks"): return del f.__dict__["marks"] del f.__dict__["mark_names"] @property def name(self): return "MARK" def apply(self, seed_context, context_list): raise NotImplementedError("Subclasses should implement apply") def __eq__(self, other): if not isinstance(self, type(other)): return False return self.name == other.name class Ignore(Mark): """Ignore a specific parametrization of test.""" def __init__(self, **kwargs): # Ignore tests with injected_args matching self.injected_args self.injected_args = kwargs @property def name(self): return "IGNORE" def apply(self, seed_context, context_list): assert len(context_list) > 0, "ignore annotation is not being applied to any test cases" for ctx in context_list: ctx.ignore = ctx.ignore or self.injected_args is None or self.injected_args == ctx.injected_args return context_list def __eq__(self, other): return super(Ignore, self).__eq__(other) and self.injected_args == other.injected_args class IgnoreAll(Ignore): """This mark signals to ignore all parametrizations of a test.""" def __init__(self): super(IgnoreAll, self).__init__() self.injected_args = None class Matrix(Mark): """Parametrize with a matrix of arguments. 
Assume each values in self.injected_args is iterable """ def __init__(self, **kwargs): self.injected_args = kwargs for k in self.injected_args: try: iter(self.injected_args[k]) except TypeError as te: raise DucktapeError("Expected all values in @matrix decorator to be iterable: " + str(te)) @property def name(self): return "MATRIX" def apply(self, seed_context, context_list): for injected_args in cartesian_product_dict(self.injected_args): injected_fun = _inject(**injected_args)(seed_context.function) context_list.insert(0, seed_context.copy(function=injected_fun, injected_args=injected_args)) return context_list def __eq__(self, other): return super(Matrix, self).__eq__(other) and self.injected_args == other.injected_args class Defaults(Mark): """Parametrize with a default matrix of arguments on existing parametrizations. Assume each values in self.injected_args is iterable """ def __init__(self, **kwargs): self.injected_args = kwargs for k in self.injected_args: try: iter(self.injected_args[k]) except TypeError as te: raise DucktapeError("Expected all values in @defaults decorator to be iterable: " + str(te)) @property def name(self): return "DEFAULTS" def apply(self, seed_context, context_list): new_context_list = [] if context_list: for ctx in context_list: for injected_args in cartesian_product_dict( {arg: self.injected_args[arg] for arg in self.injected_args if arg not in ctx.injected_args} ): injected_args.update(ctx.injected_args) injected_fun = _inject(**injected_args)(seed_context.function) new_context = seed_context.copy( function=injected_fun, injected_args=injected_args, cluster_use_metadata=ctx.cluster_use_metadata, ) new_context_list.insert(0, new_context) else: for injected_args in cartesian_product_dict(self.injected_args): injected_fun = _inject(**injected_args)(seed_context.function) new_context_list.insert( 0, seed_context.copy(function=injected_fun, injected_args=injected_args), ) return new_context_list def __eq__(self, other): return 
super(Defaults, self).__eq__(other) and self.injected_args == other.injected_args class Parametrize(Mark): """Parametrize a test function""" def __init__(self, **kwargs): self.injected_args = kwargs @property def name(self): return "PARAMETRIZE" def apply(self, seed_context, context_list): injected_fun = _inject(**self.injected_args)(seed_context.function) context_list.insert( 0, seed_context.copy(function=injected_fun, injected_args=self.injected_args), ) return context_list def __eq__(self, other): return super(Parametrize, self).__eq__(other) and self.injected_args == other.injected_args class Env(Mark): def __init__(self, **kwargs): self.injected_args = kwargs self.should_ignore = any(os.environ.get(key) != value for key, value in kwargs.items()) @property def name(self): return "ENV" def apply(self, seed_context, context_list): for ctx in context_list: ctx.ignore = ctx.ignore or self.should_ignore return context_list def __eq__(self, other): return super(Env, self).__eq__(other) and self.injected_args == other.injected_args PARAMETRIZED = Parametrize() MATRIX = Matrix() DEFAULTS = Defaults() IGNORE = Ignore() ENV = Env() def _is_parametrize_mark(m): return m.name == PARAMETRIZED.name or m.name == MATRIX.name or m.name == DEFAULTS.name def parametrized(f): """Is this function or object decorated with @parametrize or @matrix?""" return Mark.marked(f, PARAMETRIZED) or Mark.marked(f, MATRIX) or Mark.marked(f, DEFAULTS) def ignored(f): """Is this function or object decorated with @ignore?""" return Mark.marked(f, IGNORE) def is_env(f): return Mark.marked(f, ENV) def cartesian_product_dict(d): """Return the "cartesian product" of this dictionary's values. 
d is assumed to be a dictionary, where each value in the dict is a list of values Example:: { "x": [1, 2], "y": ["a", "b"] } expand this into a list of dictionaries like so: [ { "x": 1, "y": "a" }, { "x": 1, "y": "b" }, { "x": 2, "y": "a" }, { "x": 2, "y", "b" } ] """ # Establish an ordering of the keys key_list = [k for k in d.keys()] expanded = [] values_list = [d[k] for k in key_list] # list of lists for v in itertools.product(*values_list): # Iterate through the cartesian product of the lists of values # One dictionary per element in this cartesian product new_dict = {} for i in range(len(key_list)): new_dict[key_list[i]] = v[i] expanded.append(new_dict) return expanded def matrix(**kwargs): """Function decorator used to parametrize with a matrix of values. Decorating a function or method with ``@matrix`` marks it with the Matrix mark. When expanded using the ``MarkedFunctionExpander``, it yields a list of TestContext objects, one for every possible combination of arguments. Example:: @matrix(x=[1, 2], y=[-1, -2]) def g(x, y): print "x = %s, y = %s" % (x, y) for ctx in MarkedFunctionExpander(..., function=g, ...).expand(): ctx.function() # output: # x = 1, y = -1 # x = 1, y = -2 # x = 2, y = -1 # x = 2, y = -2 """ def parametrizer(f): Mark.mark(f, Matrix(**kwargs)) return f return parametrizer def defaults(**kwargs): """Function decorator used to parametrize with a default matrix of values. Decorating a function or method with ``@defaults`` marks it with the Defaults mark. When expanded using the ``MarkedFunctionExpander``, it yields a list of TestContext objects, one for every possible combination of defaults combined with ``@matrix`` and ``@parametrize``. If there are overlap between defaults and parametrization, defaults will not be applied. 
Example:: @defaults(z=[1, 2]) @matrix(x=[1], y=[1, 2]) @parametrize(x=3, y=4) @parametrize(x=3, y=4, z=999) def g(x, y, z): print "x = %s, y = %s" % (x, y) for ctx in MarkedFunctionExpander(..., function=g, ...).expand(): ctx.function() # output: # x = 1, y = 1, z = 1 # x = 1, y = 1, z = 2 # x = 1, y = 2, z = 1 # x = 1, y = 2, z = 2 # x = 3, y = 4, z = 1 # x = 3, y = 4, z = 2 # x = 3, y = 4, z = 999 """ def parametrizer(f): Mark.mark(f, Defaults(**kwargs)) return f return parametrizer def parametrize(**kwargs): """Function decorator used to parametrize its arguments. Decorating a function or method with ``@parametrize`` marks it with the Parametrize mark. Example:: @parametrize(x=1, y=2 z=-1) @parametrize(x=3, y=4, z=5) def g(x, y, z): print "x = %s, y = %s, z = %s" % (x, y, z) for ctx in MarkedFunctionExpander(..., function=g, ...).expand(): ctx.function() # output: # x = 1, y = 2, z = -1 # x = 3, y = 4, z = 5 """ def parametrizer(f): Mark.mark(f, Parametrize(**kwargs)) return f return parametrizer def ignore(*args, **kwargs): """ Test method decorator which signals to the test runner to ignore a given test. Example:: When no parameters are provided to the @ignore decorator, ignore all parametrizations of the test function @ignore # Ignore all parametrizations @parametrize(x=1, y=0) @parametrize(x=2, y=3) def the_test(...): ... Example:: If parameters are supplied to the @ignore decorator, only ignore the parametrization with matching parameter(s) @ignore(x=2, y=3) @parametrize(x=1, y=0) # This test will run as usual @parametrize(x=2, y=3) # This test will be ignored def the_test(...): ... """ if len(args) == 1 and len(kwargs) == 0: # this corresponds to the usage of the decorator with no arguments # @ignore # def test_function: # ... 
Mark.mark(args[0], IgnoreAll()) return args[0] # this corresponds to usage of @ignore with arguments def ignorer(f): Mark.mark(f, Ignore(**kwargs)) return f return ignorer def env(**kwargs): def environment(f): Mark.mark(f, Env(**kwargs)) return f return environment def _inject(*args, **kwargs): """Inject variables into the arguments of a function or method. This is almost identical to decorating with functools.partial, except we also propagate the wrapped function's __name__. """ def injector(f): assert callable(f) @functools.wraps(f) def wrapper(*w_args, **w_kwargs): return functools.partial(f, *args, **kwargs)(*w_args, **w_kwargs) wrapper.args = args wrapper.kwargs = kwargs wrapper.function = f return wrapper return injector ================================================ FILE: ducktape/mark/consts.py ================================================ CLUSTER_SPEC_KEYWORD = "cluster_spec" CLUSTER_SIZE_KEYWORD = "num_nodes" CLUSTER_NODE_TYPE_KEYWORD = "node_type" ================================================ FILE: ducktape/mark/mark_expander.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.tests.test_context import TestContext

from ._mark import Parametrize, _is_parametrize_mark, parametrized


class MarkedFunctionExpander(object):
    """This class helps expand decorated/marked functions into a list of test context objects."""

    def __init__(
        self,
        session_context=None,
        module=None,
        cls=None,
        function=None,
        file=None,
        cluster=None,
    ):
        # The seed context is the template from which every expanded context is copied.
        self.seed_context = TestContext(
            session_context=session_context,
            module=module,
            cls=cls,
            function=function,
            file=file,
            cluster=cluster,
        )

        # A parametrized function gets its contexts exclusively from its marks during expand();
        # an unparametrized function expands to exactly the seed context.
        if parametrized(function):
            self.context_list = []
        else:
            self.context_list = [self.seed_context]

    def expand(self, test_parameters=None):
        """Inspect self.function for marks, and expand into a list of test context objects
        useable by the test runner.

        :param test_parameters: optional dict of explicit parameters; when given, these replace
            the function's own parametrization marks.
        :return: list of expanded test context objects
        """
        f = self.seed_context.function
        # If the user has specified that they want to run tests with specific parameters, apply the parameters first,
        # then subsequently strip any parametrization decorators. Otherwise, everything gets applied normally.
        if test_parameters is not None:
            self.context_list = Parametrize(**test_parameters).apply(self.seed_context, self.context_list)

        for m in getattr(f, "marks", []):
            # Non-parametrizing marks (e.g. ignore/env/cluster hints) still apply even when
            # explicit test_parameters override the function's own parametrization.
            if test_parameters is None or not _is_parametrize_mark(m):
                self.context_list = m.apply(self.seed_context, self.context_list)

        return self.context_list


================================================
FILE: ducktape/mark/resource.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. import copy from typing import Callable, List from ducktape.mark._mark import Mark from ducktape.tests.test_context import TestContext class ClusterUseMetadata(Mark): """Provide a hint about how a given test will use the cluster.""" def __init__(self, **kwargs) -> None: # shallow copy self.metadata = copy.copy(kwargs) @property def name(self) -> str: return "RESOURCE_HINT_CLUSTER_USE" def apply(self, seed_context: TestContext, context_list: List[TestContext]) -> List[TestContext]: assert len(context_list) > 0, "cluster use annotation is not being applied to any test cases" for ctx in context_list: if not ctx.cluster_use_metadata: # only update if non-None and non-empty ctx.cluster_use_metadata = self.metadata return context_list def cluster(**kwargs) -> Callable: """Test method decorator used to provide hints about how the test will use the given cluster. If this decorator is not provided, the test will either claim all cluster resources or fail immediately, depending on the flags passed to ducktape. :Keywords used by ducktape: - ``num_nodes`` provide hint about how many nodes the test will consume - ``node_type`` provide hint about what type of nodes the test needs (e.g., "large", "small") - ``cluster_spec`` provide hint about how many nodes of each type the test will consume Example:: # basic usage with num_nodes @cluster(num_nodes=10) def the_test(...): ... # usage with num_nodes and node_type @cluster(num_nodes=5, node_type="large") def the_test(...): ... # basic usage with cluster_spec @cluster(cluster_spec=ClusterSpec.simple_linux(10)) def the_test(...): ... # parametrized test: # both test cases will be marked with cluster_size of 200 @cluster(num_nodes=200) @parametrize(x=1) @parametrize(x=2) def the_test(x): ... 
# test case {'x': 1} has cluster size 100, test case {'x': 2} has cluster size 200 @cluster(num_nodes=100) @parametrize(x=1) @cluster(num_nodes=200) @parametrize(x=2) def the_test(x): ... """ def cluster_use_metadata_adder(f): Mark.mark(f, ClusterUseMetadata(**kwargs)) return f return cluster_use_metadata_adder ================================================ FILE: ducktape/services/__init__.py ================================================ ================================================ FILE: ducktape/services/background_thread.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import threading import traceback from ducktape.services.service import Service class BackgroundThreadService(Service): def __init__(self, context, num_nodes=None, cluster_spec=None, *args, **kwargs): super(BackgroundThreadService, self).__init__(context, num_nodes, cluster_spec, *args, **kwargs) self.worker_threads = {} self.worker_errors = {} self.errors = "" self.lock = threading.RLock() def _protected_worker(self, idx, node): """Protected worker captures exceptions and makes them available to the main thread. This gives us the ability to propagate exceptions thrown in background threads, if desired. 
""" try: self._worker(idx, node) except BaseException: with self.lock: self.logger.info("BackgroundThreadService threw exception: ") tb = traceback.format_exc() self.logger.info(tb) self.worker_errors[threading.currentThread().name] = tb if self.errors: self.errors += "\n" self.errors += "%s: %s" % (threading.currentThread().name, tb) raise def start_node(self, node): idx = self.idx(node) if idx in self.worker_threads and self.worker_threads[idx].is_alive(): raise RuntimeError("Cannot restart node since previous thread is still alive") self.logger.info("Running %s node %d on %s", self.service_id, idx, node.account.hostname) worker = threading.Thread( name=self.service_id + "-worker-" + str(idx), target=self._protected_worker, args=(idx, node), ) worker.daemon = True worker.start() self.worker_threads[idx] = worker def wait(self, timeout_sec=600): """Wait no more than timeout_sec for all worker threads to finish. raise TimeoutException if all worker threads do not finish within timeout_sec """ super(BackgroundThreadService, self).wait(timeout_sec) self._propagate_exceptions() def stop(self): alive_workers = [worker for worker in self.worker_threads.values() if worker.is_alive()] if len(alive_workers) > 0: self.logger.debug("Called stop with at least one worker thread is still running: " + str(alive_workers)) self.logger.debug("%s" % str(self.worker_threads)) super(BackgroundThreadService, self).stop() self._propagate_exceptions() def wait_node(self, node, timeout_sec=600): idx = self.idx(node) worker_thread = self.worker_threads.get(idx) # worker thread can be absent if this node has never been started if worker_thread: worker_thread.join(timeout_sec) return not (worker_thread.is_alive()) else: self.logger.debug(f"Worker thread not found for {self.who_am_i(node)}") return True def _propagate_exceptions(self): """ Propagate exceptions thrown in background threads """ with self.lock: if len(self.worker_errors) > 0: raise Exception(self.errors) 
================================================ FILE: ducktape/services/service.py ================================================ # Copyright 2014 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import shutil import tempfile import time from typing import Any, Dict from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.command_line.defaults import ConsoleDefaults from ducktape.errors import TimeoutError from ducktape.template import TemplateRenderer class ServiceIdFactory: def generate_service_id(self, service): return "{service_name}-{service_number}-{service_id}".format( service_name=service.__class__.__name__, service_number=service._order, service_id=id(service), ) class MultiRunServiceIdFactory: def __init__(self, run_number=1): self.run_number = run_number def generate_service_id(self, service): return "{run_number}-{service_name}-{service_number}-{service_id}".format( run_number=self.run_number, service_name=service.__class__.__name__, service_number=service._order, service_id=id(service), ) service_id_factory = ServiceIdFactory() class Service(TemplateRenderer): """Service classes know how to deploy a service onto a set of nodes and then clean up after themselves. They request the necessary resources from the cluster, configure each node, and bring up/tear down the service. They also expose information about the service so that other services or test scripts can easily be configured to work with them. 
Finally, they may be able to collect and check logs/output from the service, which can be helpful in writing tests or benchmarks. Services should generally be written to support an arbitrary number of nodes, even if instances are independent of each other. They should be able to assume that there won't be resource conflicts: the cluster tests are being run on should be large enough to use one instance per service instance. """ # Provides a mechanism for locating and collecting log files produced by the service on its nodes. # logs is a dict with entries that look like log_name: {"path": log_path, "collect_default": boolean} # # For example, zookeeper service might have self.logs like this: # self.logs = { # "zk_log": {"path": "/mnt/zk.log", # "collect_default": True} # } logs: Dict[str, Dict[str, Any]] = {} def __init__(self, context, num_nodes=None, cluster_spec=None, *args, **kwargs): """ Initialize the Service. Note: only one of (num_nodes, cluster_spec) may be set. :param context: An object which has at minimum 'cluster' and 'logger' attributes. In tests, this is always a TestContext object. :param num_nodes: An integer representing the number of Linux nodes to allocate. :param cluster_spec: A ClusterSpec object representing the minimum cluster specification needed. 
""" super(Service, self).__init__(*args, **kwargs) # Keep track of significant events in the lifetime of this service self._init_time = time.time() self._start_time = -1 self._start_duration_seconds = -1 self._stop_time = -1 self._stop_duration_seconds = -1 self._clean_time = -1 self._initialized = False self.service_id_factory = service_id_factory self.cluster_spec = Service.setup_cluster_spec(num_nodes=num_nodes, cluster_spec=cluster_spec) self.context = context self.nodes = [] self.skip_nodes_allocation = kwargs.get("skip_nodes_allocation", False) if not self.skip_nodes_allocation: self.allocate_nodes() # Keep track of which nodes nodes were allocated to this service, even after nodes are freed # Note: only keep references to representations of the nodes, not the actual node objects themselves self._nodes_formerly_allocated = [str(node.account) for node in self.nodes] # Every time a service instance is created, it registers itself with its # context object. This makes it possible for external mechanisms to clean up # after the service if something goes wrong. 
# # Note: Allocate nodes *before* registering self with the service registry self.context.services.append(self) # Each service instance has its own local scratch directory on the test driver self._local_scratch_dir = None self._initialized = True @staticmethod def setup_cluster_spec(num_nodes=None, cluster_spec=None): if num_nodes is None: if cluster_spec is None: raise RuntimeError("You must set either num_nodes or cluster_spec.") else: return cluster_spec else: if cluster_spec is not None: raise RuntimeError("You must set only one of (num_nodes, cluster_spec)") return ClusterSpec.simple_linux(num_nodes) def __repr__(self): return "<%s: %s>" % ( self.who_am_i(), "num_nodes: %d, nodes: %s" % (len(self.nodes), [n.account.hostname for n in self.nodes]), ) @property def num_nodes(self): return len(self.nodes) @property def local_scratch_dir(self): """This local scratch directory is created/destroyed on the test driver before/after each test is run.""" if not self._local_scratch_dir: self._local_scratch_dir = tempfile.mkdtemp() return self._local_scratch_dir @property def service_id(self): """Human-readable identifier (almost certainly) unique within a test run.""" return self.service_id_factory.generate_service_id(self) @property def _order(self): """Index of this service instance with respect to other services of the same type registered with self.context. 
When used with a test_context, this lets the user know Example:: suppose the services registered with the same context looks like context.services == [Zookeeper, Kafka, Zookeeper, Kafka, MirrorMaker] then: context.services[0]._order == 0 # "0th" Zookeeper instance context.services[2]._order == 0 # "0th" Kafka instance context.services[1]._order == 1 # "1st" Zookeeper instance context.services[3]._order == 1 # "1st" Kafka instance context.services[4]._order == 0 # "0th" MirrorMaker instance """ if hasattr(self.context, "services"): same_services = [id(s) for s in self.context.services if isinstance(self, type(s))] if self not in self.context.services and not self._initialized: # It's possible that _order will be invoked in the constructor *before* self has been registered with # the service registry (aka self.context.services). return len(same_services) # Note: index raises ValueError if the item is not in the list index = same_services.index(id(self)) return index else: return 0 @property def logger(self): """The logger instance for this service.""" return self.context.logger @property def cluster(self): """The cluster object from which this service instance gets its nodes.""" return self.context.cluster @property def allocated(self): """Return True iff nodes have been allocated to this service instance.""" return len(self.nodes) > 0 def who_am_i(self, node=None): """Human-readable identifier useful for log messages.""" if node is None: return self.service_id else: return "%s node %d on %s" % ( self.service_id, self.idx(node), node.account.hostname, ) def allocate_nodes(self): """Request resources from the cluster.""" if self.allocated: raise Exception("Requesting nodes for a service that has already been allocated nodes.") self.logger.debug("Requesting nodes from the cluster: %s" % self.cluster_spec) try: self.nodes = self.cluster.alloc(self.cluster_spec) except RuntimeError as e: msg = str(e) if hasattr(self.context, "services"): msg += " Currently registered 
services: " + str(self.context.services) raise RuntimeError(msg) for idx, node in enumerate(self.nodes, 1): # Remote accounts utilities should log where this service logs if node.account._logger is not None: # This log message help test-writer identify which test and/or service didn't clean up after itself node.account.logger.critical(ConsoleDefaults.BAD_TEST_MESSAGE) raise RuntimeError( "logger was not None on service start. There may be a concurrency issue, " "or some service which isn't properly cleaning up after itself. " "Service: %s, node.account: %s" % (self.__class__.__name__, str(node.account)) ) node.account.logger = self.logger self.logger.debug("Successfully allocated %d nodes to %s" % (len(self.nodes), self.who_am_i())) def start(self, **kwargs): """Start the service on all nodes.""" self.logger.info("%s: starting service" % self.who_am_i()) if self._start_time < 0: # Set self._start_time only the first time self.start is invoked self._start_time = time.time() self.logger.debug(self.who_am_i() + ": killing processes and attempting to clean up before starting") for node in self.nodes: # Added precaution - kill running processes, clean persistent files (if 'clean'=False flag passed, # skip cleaning), try/except for each step, since each of these steps may fail if there # are no processes to kill or no files to remove try: self.stop_node(node) except Exception: pass try: if kwargs.get("clean", True): self.clean_node(node) else: self.logger.debug("%s: skip cleaning node" % self.who_am_i(node)) except Exception: pass for node in self.nodes: self.logger.debug("%s: starting node" % self.who_am_i(node)) self.start_node(node, **kwargs) if self._start_duration_seconds < 0: self._start_duration_seconds = time.time() - self._start_time def start_node(self, node, **kwargs): """Start service process(es) on the given node.""" pass def wait(self, timeout_sec=600): """Wait for the service to finish. This only makes sense for tasks with a fixed amount of work to do. 
For services that generate output, it is only guaranteed to be available after this call returns. """ unfinished_nodes = [] start = time.time() end = start + timeout_sec for node in self.nodes: now = time.time() node_name = self.who_am_i(node) if end > now: self.logger.debug("%s: waiting for node", node_name) if not self.wait_node(node, end - now): unfinished_nodes.append(node_name) else: unfinished_nodes.append(node_name) if unfinished_nodes: raise TimeoutError( "Timed out waiting %s seconds for service nodes to finish. " % str(timeout_sec) + "These nodes are still alive: " + str(unfinished_nodes) ) def wait_node(self, node, timeout_sec=None): """Wait for the service on the given node to finish. Return True if the node finished shutdown, False otherwise. """ pass def stop(self, **kwargs): """Stop service processes on each node in this service. Subclasses must override stop_node. """ self._stop_time = time.time() # The last time stop is invoked self.logger.info("%s: stopping service" % self.who_am_i()) for node in self.nodes: self.logger.info("%s: stopping node" % self.who_am_i(node)) self.stop_node(node, **kwargs) self._stop_duration_seconds = time.time() - self._stop_time def stop_node(self, node, **kwargs): """Halt service process(es) on this node.""" pass def clean(self, **kwargs): """Clean up persistent state on each node - e.g. logs, config files etc. Subclasses must override clean_node. """ self._clean_time = time.time() self.logger.info("%s: cleaning service" % self.who_am_i()) for node in self.nodes: self.logger.info("%s: cleaning node" % self.who_am_i(node)) self.clean_node(node, **kwargs) def clean_node(self, node, **kwargs): """Clean up persistent state on this node - e.g. service logs, configuration files etc.""" self.logger.warn( "%s: clean_node has not been overriden. " "This may be fine if the service leaves no persistent state." % self.who_am_i() ) def free(self): """Free each node. 
This 'deallocates' the nodes so the cluster can assign them to other services.""" while self.nodes: node = self.nodes.pop() self.logger.info("%s: freeing node" % self.who_am_i(node)) node.account.logger = None self.cluster.free(node) def run(self): """Helper that executes run(), wait(), and stop() in sequence.""" self.start() self.wait() self.stop() def get_node(self, idx): """ids presented externally are indexed from 1, so we provide a helper method to avoid confusion.""" return self.nodes[idx - 1] def idx(self, node): """Return id of the given node. Return -1 if node does not belong to this service. idx identifies the node within this service instance (not globally). """ for idx, n in enumerate(self.nodes, 1): if self.get_node(idx) == node: return idx return -1 def close(self): """Release resources.""" # Remove local scratch directory if self._local_scratch_dir and os.path.exists(self._local_scratch_dir): shutil.rmtree(self._local_scratch_dir) @staticmethod def run_parallel(*args): """Helper to run a set of services in parallel. This is useful if you want multiple services of different types to run concurrently, e.g. a producer + consumer pair. """ for svc in args: svc.start() for svc in args: svc.wait() for svc in args: svc.stop() def to_json(self): return { "cls_name": self.__class__.__name__, "module_name": self.__module__, "lifecycle": { "init_time": self._init_time, "start_time": self._start_time, "start_duration_seconds": self._start_duration_seconds, "stop_time": self._stop_time, "stop_duration_seconds": self._stop_duration_seconds, "clean_time": self._clean_time, }, "service_id": self.service_id, "nodes": self._nodes_formerly_allocated, } ================================================ FILE: ducktape/services/service_registry.py ================================================ # Copyright 2014 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict

from ducktape.jvm_logging import JVMLogger


class ServiceRegistry(object):
    """Ordered registry of running services, keyed by object identity (id(service))."""

    def __init__(self, enable_jvm_logs=False) -> None:
        # Insertion order matters: stop_all tears services down in reverse registration order.
        self._services: OrderedDict = OrderedDict()
        # id(service) -> list of node account strings captured at registration time.
        self._nodes: dict = {}
        # Initialize JVM logging if enabled
        if enable_jvm_logs:
            self._jvm_logger = JVMLogger()
        else:
            self._jvm_logger = None

    def __contains__(self, item):
        return id(item) in self._services

    def __iter__(self):
        return iter(self._services.values())

    def __repr__(self):
        return str(self._services.values())

    def append(self, service):
        """Register a service and record which node accounts it currently holds."""
        # Auto-enable JVM logging for Java-based services
        if self._jvm_logger:
            self._jvm_logger.enable_for_service(service)
        self._services[id(service)] = service
        self._nodes[id(service)] = [str(n.account) for n in service.nodes]

    def to_json(self):
        return [service.to_json() for service in self._services.values()]

    def stop_all(self):
        """Stop all currently registered services in the reverse of the order in which they were added.

        Note that this does not clean up persistent state or free the nodes back to the cluster.
        """
        keyboard_interrupt = None
        for service in reversed(self._services.values()):
            try:
                service.stop()
            except BaseException as e:
                # A KeyboardInterrupt is remembered and re-raised only after all services
                # have had a chance to stop, so one Ctrl-C doesn't strand the rest.
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error stopping service %s: %s", service, e)
        if keyboard_interrupt is not None:
            raise keyboard_interrupt

    def clean_all(self):
        """Clean all services. This should only be called after services are stopped."""
        keyboard_interrupt = None
        for service in self._services.values():
            try:
                service.clean()
            except BaseException as e:
                # Defer KeyboardInterrupt until every service has been cleaned.
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error cleaning service %s: %s" % (service, e))
        if keyboard_interrupt is not None:
            raise keyboard_interrupt

    def free_all(self):
        """Release nodes back to the cluster."""
        keyboard_interrupt = None
        for service in self._services.values():
            try:
                service.free()
            except BaseException as e:
                # Defer KeyboardInterrupt until every service has been freed.
                if isinstance(e, KeyboardInterrupt):
                    keyboard_interrupt = e
                service.logger.warn("Error freeing service %s: %s" % (service, e))
        if keyboard_interrupt is not None:
            raise keyboard_interrupt
        # Only reached when no KeyboardInterrupt occurred: forget all registrations.
        self._services.clear()
        self._nodes.clear()

    def errors(self):
        """Gets a printable string containing any errors produced by the services."""
        return "\n\n".join(
            "{}: {}".format(service.who_am_i(), service.error)
            for service in self._services.values()
            if hasattr(service, "error") and service.error
        )


================================================
FILE: ducktape/template.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect import os.path from jinja2 import ChoiceLoader, Environment, FileSystemLoader, PackageLoader, Template from ducktape.utils.util import package_is_installed class TemplateRenderer(object): def _get_ctx(self): ctx = {k: getattr(self.__class__, k) for k in dir(self.__class__)} ctx.update(self.__dict__) return ctx def render_template(self, template, **kwargs): """ Render a template using the context of the current object, optionally with overrides. :param template: the template to render, a Template or a str :param kwargs: optional override parameters :return: the rendered template """ if not hasattr(template, "render"): template = Template(template) ctx = self._get_ctx() return template.render(ctx, **kwargs) @staticmethod def _package_search_path(module_name): """ :param module_name: Name of a module :return: (package, package_search_path) where package is the package containing the module, and package_search_path is a path relative to the package in which to search for templates. """ module_parts = module_name.split(".") package = module_parts[0] # Construct path relative to package under which "templates" would be found directory = "" for d in module_parts[1:-1]: directory = os.path.join(directory, d) return package, os.path.join(directory, "templates") def render(self, path, **kwargs): """ Render a template loaded from a file. template files referenced in file f should be in a sibling directory of f called "templates". :param path: path, relative to the search paths, to the template file :param kwargs: optional override parameters :return: the rendered template """ if not hasattr(self, "template_loader"): class_dir = os.path.dirname(inspect.getfile(self.__class__)) module_name = self.__class__.__module__ package, package_search_path = self._package_search_path(module_name) loaders = [] msg = "" if os.path.isdir(class_dir): # FileSystemLoader overrides PackageLoader if the path containing this directory # is a valid directory. 
FileSystemLoader throws an error from which ChoiceLoader # doesn't recover if the directory is invalid loaders.append(FileSystemLoader(os.path.join(class_dir, "templates"))) else: msg += "Will not search in %s for template files since it is not a valid directory. " % class_dir if package_is_installed(package): loaders.append(PackageLoader(package, package_search_path)) else: msg += "Will not search in package %s for template files because it cannot be imported." if len(loaders) == 0: # Expect at least one of FileSystemLoader and PackageLoader to be present raise EnvironmentError(msg) self.template_loader = ChoiceLoader(loaders) self.template_env = Environment(loader=self.template_loader, trim_blocks=True, lstrip_blocks=True) template = self.template_env.get_template(path) return self.render_template(template, **kwargs) ================================================ FILE: ducktape/templates/report/report.css ================================================ body { font-family: verdana, arial, helvetica, sans-serif; font-size: 80%; } table { font-size: 100%; } h1, h2, h3, h4, h5, h6 { color: gray; } #summary_header_row { font-weight: bold; color: white; background-color: #777; } #summary_header_row th { border: 1px solid #777; padding: 2px; } #summary_report_table { width: 80%; border-collapse: collapse; border: 1px solid #777; } #summary_report_table td { border: 1px solid #777; padding: 2px; } .pre_stack_trace { white-space: pre-wrap; /* Since CSS 2.1 */ white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ white-space: -o-pre-wrap; /* Opera 7 */ word-wrap: break-word; /* Internet Explorer 5.5+ */ } .header_row { font-weight: bold; color: white; background-color: #777; } .header_row th { border: 1px solid #777; padding: 2px; } .report_table { width: 80%; border-collapse: collapse; border: 1px solid #777; } .report_table td { border: 1px solid #777; padding: 2px; } .pass { background-color: #6c6; } .flaky { background-color: #dd2; } .fail { background-color: #c60; 
} .ignore { background-color: #555; } .testcase { margin-left: 2em; } ================================================ FILE: ducktape/templates/report/report.html ================================================
================================================ FILE: ducktape/tests/__init__.py ================================================ from .test import Test from .test_context import TestContext __all__ = ["Test", "TestContext"] ================================================ FILE: ducktape/tests/event.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import os import time class ClientEventFactory(object): """Used by test runner clients to generate events.""" READY = "READY" # reply: {test_metadata, cluster, session_context} SETTING_UP = "SETTING_UP" RUNNING = "RUNNING" TEARING_DOWN = "TEARING_DOWN" FINISHED = "FINISHED" LOG = "LOG" # Types of messages available TYPES = {READY, SETTING_UP, RUNNING, TEARING_DOWN, FINISHED, LOG} def __init__(self, test_id, test_index, source_id): self.test_id = test_id # id of event source self.test_index = test_index self.source_id = source_id self.event_id = 0 def _event(self, event_type, payload=None): """Create a message object with certain base fields, and augmented by the payload. :param event_type: type of message this is :param payload: a dict containing extra fields for the message. Key names should not conflict with keys in the base event. 
""" assert event_type in ClientEventFactory.TYPES, "Unknown event type" if payload is None: payload = {} event = { "test_id": self.test_id, "source_id": self.source_id, "test_index": self.test_index, "event_id": self.event_id, "event_type": event_type, "event_time": time.time(), } assert len(set(event.keys()).intersection(set(payload.keys()))) == 0, ( "Payload and base event should not share keys. base event: %s, payload: %s" % (str(event), str(payload)) ) event.update(payload) self.event_id += 1 return event def copy(self, event): """Copy constructor: return a copy of the original message, but with a unique message id.""" new_event = copy.copy(event) new_event["message_id"] = self.event_id self.event_id += 1 return new_event def running(self): return self._event( event_type=ClientEventFactory.RUNNING, payload={"pid": os.getpid(), "pgroup_id": os.getpgrp()}, ) def ready(self): return self._event( event_type=ClientEventFactory.READY, payload={"pid": os.getpid(), "pgroup_id": os.getpgrp()}, ) def setting_up(self): return self._event(event_type=ClientEventFactory.SETTING_UP) def finished(self, result): return self._event(event_type=ClientEventFactory.FINISHED, payload={"result": result}) def log(self, message, level): return self._event( event_type=ClientEventFactory.LOG, payload={"message": message, "log_level": level}, ) class EventResponseFactory(object): """Used by the test runner to create responses to events from client processes.""" def _event_response(self, client_event, payload=None): if payload is None: payload = {} event_response = { "ack": True, "source_id": client_event["source_id"], "event_id": client_event["event_id"], } assert len(set(event_response.keys()).intersection(set(payload.keys()))) == 0, ( "Payload and base event should not share keys. 
base event: %s, payload: %s" % (str(event_response), str(payload)) ) event_response.update(payload) return event_response def running(self, client_event): return self._event_response(client_event) def ready(self, client_event, session_context, test_context, cluster): payload = { "session_context": session_context, "test_metadata": test_context.test_metadata, "cluster": cluster, } return self._event_response(client_event, payload) def setting_up(self, client_event): return self._event_response(client_event) def finished(self, client_event): return self._event_response(client_event) def log(self, client_event): return self._event_response(client_event) ================================================ FILE: ducktape/tests/loader.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections import glob import importlib import inspect import itertools import json import os import re import sys from logging import Logger from operator import attrgetter from typing import Any, Iterable, List, Optional, Set, Tuple, Union import requests import yaml from ducktape.mark import parametrized from ducktape.mark.mark_expander import MarkedFunctionExpander from ducktape.tests.session import SessionContext from ducktape.tests.test import Test from ducktape.tests.test_context import TestContext class LoaderException(Exception): pass # A helper container class ModuleAndFile = collections.namedtuple("ModuleAndFile", ["module", "file"]) DEFAULT_TEST_FILE_PATTERN = r"(^test_.*\.py$)|(^.*_test\.py$)" DEFAULT_TEST_FUNCTION_PATTERN = "(^test.*)|(.*test$)" # Included for unit tests to be able to add support for loading local file:/// URLs. _requests_session = requests.session() class TestLoader(object): """Class used to discover and load tests.""" def __init__( self, session_context: SessionContext, logger: Logger, repeat: int = 1, injected_args: None = None, cluster: None = None, subset: int = 0, subsets: int = 1, historical_report: None = None, ) -> None: self.session_context = session_context self.cluster = cluster assert logger is not None self.logger = logger assert repeat >= 1 self.repeat = repeat if subset >= subsets: raise ValueError("The subset to execute must be in the range [0, subsets-1]") self.subset = subset self.subsets = subsets self.historical_report = historical_report self.test_file_pattern = DEFAULT_TEST_FILE_PATTERN self.test_function_pattern = DEFAULT_TEST_FUNCTION_PATTERN # A non-None value here means the loader will override the injected_args # in any discovered test, whether or not it is parametrized self.injected_args = injected_args def load(self, symbols: List[str], excluded_test_symbols: None = None) -> List[TestContext]: """ Discover tests specified by the symbols parameter (iterable of test symbols and/or test suite file 
paths). Skip any tests specified by excluded_test_symbols (iterable of test symbols). *Test symbol* is a pointer to the test or a group of tests. It is specified by the file/folder path or glob, optionally with Class.method after `::` : - `test-dir/` - loads all tests under `test-dir` but does NOT load test suites found under `test-dir` - `test-dir/prefix_*.py` - loads all files with a specified prefix - `/path/to/test/file.py` - `test/file.py::TestClass` - `test/file.py::TestClass.test_method` *Test suite* is a yaml file with the following format: ``` # multiple test suites can be included: test_suite_name: # list included test symbols - path/to/test.py # optionally test suite can have included and excluded sections: # you may also specify a list of other suite's you wish to import # that will also be loaded when loading this file by using the # import tag. import: # list of yaml files whose suites will also run: - path/to/suite.yml another_test_suite: included: # list of included test symbols: - path/to/test-dir/prefix_*.py excluded: # list of excluded test symbols: - path/to/test-dir/prefix_excluded.py ``` Each file found after parsing a symbol is checked to see if it contains a test: - Discover modules that 'look like' a test. By default, this means the filename is "test_*" or "*_test.py" - Discover test classes within each test module. A test class is a subclass of Test which is a leaf (i.e. it has no subclasses). - Discover test methods within each test class. A test method is a method containing 'test' in its name :param symbols: iterable that contains test symbols and/or test suite file paths. :param excluded_test_symbols: iterable that contains test symbols only. :return list of test context objects found during discovery. Note: if self.repeat is set to n, each test_context will appear in the list n times. 
""" test_symbols = [] test_suites = [] # symbol can point to a test or a test suite for symbol in symbols: if symbol.endswith(".yml"): # if it ends with .yml, its a test suite, read included and excluded paths from the file test_suites.append(symbol) else: # otherwise add it to default suite's included list test_symbols.append(symbol) contexts_from_suites = self._load_test_suite_files(test_suites) contexts_from_symbols = self._load_test_contexts(test_symbols) all_included = contexts_from_suites.union(contexts_from_symbols) # excluded_test_symbols apply to both tests from suites and tests from symbols global_excluded = self._load_test_contexts(excluded_test_symbols) all_test_context_list: Union[List[TestContext], Set[TestContext]] = self._filter_excluded_test_contexts( all_included, global_excluded ) # make sure no test is loaded twice all_test_context_list = self._filter_by_unique_test_id(all_test_context_list) # Sort to make sure we get a consistent order for when we create subsets all_test_context_list = sorted(all_test_context_list, key=attrgetter("test_id")) if not all_test_context_list: raise LoaderException("No tests to run!") self.logger.debug("Discovered these tests: " + str(all_test_context_list)) # Select the subset of tests. if self.historical_report: # With timing info, try to pack the subsets reasonably evenly based on timing. To do so, get timing info # for each test (using avg as a fallback for missing data), sort in descending order, then start greedily # packing tests into bins based on the least full bin at the time. 
raw_results = _requests_session.get(self.historical_report).json()["results"] time_results = {r["test_id"]: r["run_time_seconds"] for r in raw_results} avg_result_time = sum(time_results.values()) / len(time_results) time_results = {tc.test_id: time_results.get(tc.test_id, avg_result_time) for tc in all_test_context_list} all_test_context_list = sorted( all_test_context_list, key=lambda x: time_results[x.test_id], reverse=True, ) subsets = [[] for _ in range(self.subsets)] subsets_accumulated_time = [0] * self.subsets for tc in all_test_context_list: min_subset_idx = min( range(len(subsets_accumulated_time)), key=lambda i: subsets_accumulated_time[i], ) subsets[min_subset_idx].append(tc.test_id) subsets_accumulated_time[min_subset_idx] += time_results[tc.test_id] subset_test_context_list = subsets[self.subset] else: # Without timing info, select every nth test instead of blocks of n to avoid groups of tests that are # parametrizations of the same test being grouped together since that can lead to a single, parameterized, # long-running test causing a very imbalanced workload across different subsets. Imbalance is still # possible, but much less likely using this heuristic. 
subset_test_context_list = list(itertools.islice(all_test_context_list, self.subset, None, self.subsets)) self.logger.debug("Selected this subset of tests: " + str(subset_test_context_list)) return subset_test_context_list * self.repeat def discover( self, directory: str, module_name: str, cls_name: str, method_name: str, injected_args: None = None, ) -> List[TestContext]: """Discover and unpack parametrized tests tied to the given module/class/method :return list of test_context objects """ self.logger.debug( "Discovering tests at {} - {} - {} - {} - {}".format( directory, module_name, cls_name, method_name, injected_args ) ) # Check validity of path path = os.path.join(directory, module_name) if not os.path.exists(path): raise LoaderException("Path {} does not exist".format(path)) # Recursively search path for test modules module_and_file = self._import_module(path) if module_and_file: # Find all tests in discovered modules and filter out any that don't match the discovery symbol test_context_list = self._expand_module(module_and_file) if len(cls_name) > 0: test_context_list = [t for t in test_context_list if t.cls_name == cls_name] if len(method_name) > 0: test_context_list = [t for t in test_context_list if t.function_name == method_name] if injected_args is not None: if isinstance(injected_args, List): def condition(t): return t.injected_args in injected_args else: def condition(t): return t.injected_args == injected_args test_context_list = filter(condition, test_context_list) listed = list(test_context_list) if not listed: self.logger.warn( "No tests loaded for {} - {} - {} - {} - {}".format( directory, module_name, cls_name, method_name, injected_args ) ) return listed else: return [] def _parse_discovery_symbol(self, discovery_symbol: str, base_dir: None = None) -> Tuple[str, str, str, None]: """Parse a single 'discovery symbol' :param discovery_symbol: a symbol used to target test(s). 
            Looks like: <path>[::<class name>[.<method name>]]

        :return tuple of form (path, cls_name, method_name, injected_args)
        :raise LoaderException if it can't be parsed

        Examples:
            "path/to/directory" -> ("path/to/directory", "", "", None)
            "path/to/test_file.py" -> ("path/to/test_file.py", "", "", None)
            "path/to/test_file.py::ClassName.method" -> ("path/to/test_file.py", "ClassName", "method", None)
        """

        def divide_by_symbol(ds, symbol):
            # Split ds on the first occurrence of symbol; if absent, the second part is "".
            if symbol not in ds:
                return ds, ""
            return ds.split(symbol, maxsplit=1)

        self.logger.debug("Trying to parse discovery symbol {}".format(discovery_symbol))
        if base_dir:
            discovery_symbol = os.path.join(base_dir, discovery_symbol)
        if discovery_symbol.find("::") >= 0:
            path, cls_name = divide_by_symbol(discovery_symbol, "::")
            # If the part after :: contains a dot, use it to split into class + method
            cls_name, method_name = divide_by_symbol(cls_name, ".")
            method_name, injected_args_str = divide_by_symbol(method_name, "@")
            if injected_args_str:
                if self.injected_args:
                    raise LoaderException("Cannot use both global and per-method test parameters")
                try:
                    injected_args = json.loads(injected_args_str)
                except Exception as e:
                    raise LoaderException("Invalid discovery symbol: cannot parse params: " + injected_args_str) from e
            else:
                injected_args = None
        else:
            # No "::" present in symbol
            path, cls_name, method_name, injected_args = discovery_symbol, "", "", None

        return path, cls_name, method_name, injected_args

    def _import_module(self, file_path: str) -> Optional[ModuleAndFile]:
        """Attempt to import a python module from the file path. Assume file_path is an absolute path
        ending in '.py'

        Return the imported module..

        :param file_path: file to import module from.
        :return ModuleAndFile object that contains the successfully imported module and
            the file from which it was imported
        """
        self.logger.debug("Trying to import module at path {}".format(file_path))
        if file_path[-3:] != ".py" or not os.path.isabs(file_path):
            raise Exception("Expected absolute path ending in '.py' but got " + file_path)

        # Try all possible module imports for given file
        # Strip off '.py' before splitting
        path_pieces = [piece for piece in file_path[:-3].split("/") if len(piece) > 0]
        while len(path_pieces) > 0:
            module_name = ".".join(path_pieces)
            # Try to import the current file as a module
            self.logger.debug("Trying to import module {}".format(module_name))
            try:
                module_and_file = ModuleAndFile(module=importlib.import_module(module_name), file=file_path)
                self.logger.debug("Successfully imported " + module_name)
                return module_and_file
            except Exception as e:
                # Because of the way we are searching for
                # valid modules in this loop, we expect some of the
                # module names we construct to fail to import.
                #
                # Therefore we check if the failure "looks normal", and log
                # expected failures only at debug level.
                #
                # Unexpected errors are aggressively logged, e.g. if the module
                # is valid but itself triggers an ImportError (e.g. typo in an
                # import line), or a SyntaxError.
                expected_error = False
                if isinstance(e, ImportError):
                    match = re.search(r"No module named '?([^\s\']+)'?", str(e))
                    if match is not None:
                        missing_module = match.groups()[0]
                        if missing_module in module_name:
                            expected_error = True
                        else:
                            # The error is still an expected error if missing_module is a suffix of module_name.
                            # This is because the error message may contain only a suffix
                            # of the original module_name if leftmost chunk of module_name is a legitimate
                            # module name, but the rightmost part doesn't exist.
                            #
                            # Check this by seeing if it is a "piecewise suffix" of module_name - i.e. if the parts
                            # delimited by dots match. This is a little bit stricter than just checking for a suffix
                            #
                            # E.g. "fancy.cool_module" is a piecewise suffix of "my.fancy.cool_module",
                            # but "module" is not a piecewise suffix of "my.fancy.cool_module"
                            missing_module_pieces = missing_module.split(".")
                            expected_error = missing_module_pieces == path_pieces[-len(missing_module_pieces):]
                if expected_error:
                    self.logger.debug(
                        "Failed to import %s. This is likely an artifact of the "
                        "ducktape module loading process: %s: %s",
                        module_name,
                        e.__class__.__name__,
                        e,
                    )
                else:
                    self.logger.error(
                        "Failed to import %s, which may indicate a broken test that cannot be loaded: %s: %s",
                        module_name,
                        e.__class__.__name__,
                        e,
                    )
            finally:
                # Drop the leftmost path piece and retry with a shorter candidate module name.
                path_pieces = path_pieces[1:]

        self.logger.debug("Unable to import %s" % file_path)
        return None

    def _expand_module(self, module_and_file: ModuleAndFile) -> List[TestContext]:
        """Return a list of TestContext objects, one object for every 'testable unit' in module"""
        test_context_list = []
        module = module_and_file.module
        file_name = module_and_file.file
        module_objects = module.__dict__.values()
        test_classes = [c for c in module_objects if self._is_test_class(c)]

        for cls in test_classes:
            test_context_list.extend(
                self._expand_class(
                    TestContext(
                        session_context=self.session_context,
                        cluster=self.cluster,
                        module=module.__name__,
                        cls=cls,
                        file=file_name,
                    )
                )
            )

        return test_context_list

    def _expand_class(self, t_ctx: TestContext) -> List[TestContext]:
        """Return a list of TestContext objects, one object for each method in t_ctx.cls"""
        test_methods = []
        for f_name in dir(t_ctx.cls):
            f = getattr(t_ctx.cls, f_name)
            if self._is_test_function(f):
                test_methods.append(f)

        test_context_list = []
        for f in test_methods:
            t = t_ctx.copy(function=f)
            test_context_list.extend(self._expand_function(t))
        return test_context_list

    def _expand_function(self, t_ctx: TestContext) -> List[TestContext]:
        # Expand @parametrize / @matrix marks into one context per parameter combination.
        expander = MarkedFunctionExpander(
            t_ctx.session_context,
            t_ctx.module,
            t_ctx.cls,
            t_ctx.function,
            t_ctx.file,
            t_ctx.cluster,
        )
        return expander.expand(self.injected_args)

    def _find_test_files(self, path_or_glob):
        """
        Return a list of files at the specified path (or glob) that look like test files.

        - Globs are not recursive, so ** is not supported.
        - However, if the glob matches a folder (or is not a glob but simply a folder path),
          we will load all tests in that folder and recursively search the sub folders.

        :param path_or_glob: path to a test file, folder with test files or a glob that expands
            to folders and files
        :return: list of absolute paths to test files
        """
        test_files = []
        self.logger.debug("Looking for test files in {}".format(path_or_glob))
        # glob is safe to be called on non-glob path - it would just return that same path wrapped in a list
        expanded_glob = glob.glob(path_or_glob)
        self.logger.debug("Expanded {} into {}".format(path_or_glob, expanded_glob))

        def maybe_add_test_file(f):
            if self._is_test_file(f):
                test_files.append(f)
            else:
                self.logger.debug("Skipping {} because it isn't a test file".format(f))

        for path in expanded_glob:
            if not os.path.exists(path):
                raise LoaderException("Path {} does not exist".format(path))
            self.logger.debug("Checking {}".format(path))
            if os.path.isfile(path):
                maybe_add_test_file(path)
            elif os.path.isdir(path):
                for pwd, dirs, files in os.walk(path):
                    if "__init__.py" not in files:
                        # Not a package - ignore this directory
                        continue
                    for f in files:
                        file_path = os.path.abspath(os.path.join(pwd, f))
                        maybe_add_test_file(file_path)
            else:
                raise LoaderException("Got a path that we don't understand: " + path)

        return test_files

    def _is_test_file(self, file_name):
        """By default, a test file looks like test_*.py or *_test.py"""
        return re.match(self.test_file_pattern, os.path.basename(file_name)) is not None

    def _is_test_class(self, obj: Any) -> bool:
        """An object is a test class if it's a leafy subclass of Test."""
        return inspect.isclass(obj) and issubclass(obj, Test) and len(obj.__subclasses__()) == 0

    def _is_test_function(self, function: Any) -> bool:
        """A test function looks like a test and is callable (or expandable)."""
        if function is None:
            return False
        if not parametrized(function) and not callable(function):
            return False
        return re.match(self.test_function_pattern, function.__name__) is not None

    def _load_test_suite_files(self, test_suite_files: List[Any]) -> Set[Any]:
        """Load every suite definition from the given yaml files (and their imports),
        returning the union of discovered test contexts."""
        suites = list()
        suites.extend(self._read_test_suite_from_file(test_suite_files))
        all_contexts = set()
        for suite in suites:
            all_contexts.update(self._load_test_suite(**suite))
        return all_contexts

    def _load_file(self, suite_file_path):
        """Load and validate one test suite yaml file, returning its parsed dict content.

        :raise LoaderException: if the path is missing, not a file, empty, malformed,
            or contains an empty suite.
        """
        if not os.path.exists(suite_file_path):
            raise LoaderException(f"Path {suite_file_path} does not exist")
        if not os.path.isfile(suite_file_path):
            raise LoaderException(f"{suite_file_path} is not a file, so it cannot be a test suite")
        with open(suite_file_path) as fp:
            try:
                file_content = yaml.load(fp, Loader=yaml.FullLoader)
            except Exception as e:
                raise LoaderException("Failed to load test suite from file: " + suite_file_path, e)
            if not file_content:
                raise LoaderException("Test suite file is empty: " + suite_file_path)
            if not isinstance(file_content, dict):
                raise LoaderException("Malformed test suite file: " + suite_file_path)
            for suite_name, suite_content in file_content.items():
                if not suite_content:
                    raise LoaderException("Empty test suite " + suite_name + " in " + suite_file_path)
            return file_content

    def _load_suites(self, file_path, file_content):
        """Normalize a parsed suite file into a list of suite dicts
        ({name, included, excluded, base_dir})."""
        suites = []
        for suite_name, suite_content in file_content.items():
            if not suite_content:
                raise LoaderException(f"Empty test suite {suite_name} in {file_path}")
            # if test suite is just a list of paths, those are included paths
            # otherwise, expect separate sections for included and excluded
            if isinstance(suite_content, list):
                included_paths = suite_content
                excluded_paths = None
            elif isinstance(suite_content, dict):
                included_paths = suite_content.get("included")
                excluded_paths = suite_content.get("excluded")
            else:
                raise LoaderException(f"Malformed test suite {suite_name} in {file_path}")
            suites.append(
                {
                    "name": suite_name,
                    "included": included_paths,
                    "excluded": excluded_paths,
                    "base_dir": os.path.dirname(file_path),
                }
            )
        return suites

    def _read_test_suite_from_file(self, root_suite_file_paths: List[Any]) -> List[Any]:
        """Resolve 'import' directives transitively across suite files, then return all suites.

        Already-seen files are skipped, so cyclic imports terminate.
        """
        root_suite_file_paths = [os.path.abspath(file_path) for file_path in root_suite_file_paths]
        files = {file: self._load_file(file) for file in root_suite_file_paths}
        stack = root_suite_file_paths
        # load all files
        while len(stack) != 0:
            curr = stack.pop()
            loaded = files[curr]
            if "import" in loaded:
                if isinstance(loaded["import"], str):
                    loaded["import"] = [loaded["import"]]
                directory = os.path.dirname(curr)
                # apply path of current file to the files inside
                abs_file_iter = (os.path.abspath(os.path.join(directory, file)) for file in loaded.get("import", []))
                imported = [file for file in abs_file_iter if file not in files]
                for file in imported:
                    files[file] = self._load_file(file)
                stack.extend(imported)
                del files[curr]["import"]
        # load all suites from all loaded files
        suites = []
        for file_name, file_context in files.items():
            suites.extend(self._load_suites(file_name, file_context))
        return suites

    def _load_test_suite(self, **kwargs):
        """Load one suite dict (as produced by _load_suites) into a set of test contexts."""
        name = kwargs["name"]
        included = kwargs["included"]
        excluded = kwargs.get("excluded")
        base_dir = kwargs.get("base_dir")

        excluded_contexts = self._load_test_contexts(excluded, base_dir=base_dir)
        included_contexts = self._load_test_contexts(included, base_dir=base_dir)
        self.logger.debug("Including tests: " + str(included_contexts))
        self.logger.debug("Excluding tests: " + str(excluded_contexts))
        # filter out any excluded test from the included tests set
        all_test_context_list = self._filter_excluded_test_contexts(included_contexts, excluded_contexts)
        if not all_test_context_list:
            raise LoaderException("No tests found in " + name)
        return all_test_context_list

    def _load_test_contexts(
        self, test_discovery_symbols: Optional[List[str]], base_dir: Optional[str] = None
    ) -> Set[TestContext]:
        """
        Load all test_context objects found in test_discovery_symbols. Each test discovery symbol
        is a dir or file path, optionally with a ::Class or ::Class.method specified.

        :param test_discovery_symbols: list of test symbols to look into
        :return: List of test_context objects discovered by checking test_discovery_symbols
            (may be empty if none were discovered)
        """
        if not test_discovery_symbols:
            return set()
        if not isinstance(test_discovery_symbols, list):
            raise LoaderException("Expected test_discovery_symbols to be a list.")
        all_test_context_list = set()
        for symbol in test_discovery_symbols:
            (
                path_or_glob,
                cls_name,
                method,
                injected_args,
            ) = self._parse_discovery_symbol(symbol, base_dir)
            self.logger.debug(
                "Parsed symbol into {} - {} - {} - {}".format(path_or_glob, cls_name, method, injected_args)
            )
            path_or_glob = os.path.abspath(path_or_glob)

            # TODO: consider adding a check to ensure glob or dir is not used together with cls_name and method
            test_files = []
            if os.path.isfile(path_or_glob):
                # if it is a single file, just add it directly - https://github.com/confluentinc/ducktape/issues/284
                test_files = [path_or_glob]
            else:
                # otherwise, when dealing with a dir or a glob, apply pattern matching rules
                test_files = self._find_test_files(path_or_glob)
            self._add_top_level_dirs_to_sys_path(test_files)

            for test_file in test_files:
                directory = os.path.dirname(test_file)
                module_name = os.path.basename(test_file)

                test_context_list_for_file = self.discover(
                    directory,
                    module_name,
                    cls_name,
                    method,
                    injected_args=injected_args,
                )
                all_test_context_list.update(test_context_list_for_file)
                if len(test_context_list_for_file) == 0:
                    self.logger.warn("Didn't find any tests in %s " % test_file)
        return all_test_context_list

    def _filter_by_unique_test_id(self, contexts: Iterable[TestContext]) -> List[TestContext]:
        """Drop duplicate contexts, keeping the first occurrence of each test_id."""
        contexts_dict = dict()
        for context in contexts:
            if context.test_id not in contexts_dict:
                contexts_dict[context.test_id] = context
        return list(contexts_dict.values())

    def _filter_excluded_test_contexts(
        self, included_contexts: Iterable[TestContext], excluded_contexts: Set[Any]
    ) -> Set[TestContext]:
        """Return included contexts whose test_id is not present in the excluded set."""
        excluded_test_ids = {ctx.test_id for ctx in excluded_contexts}
        return {ctx for ctx in included_contexts if ctx.test_id not in excluded_test_ids}

    def _add_top_level_dirs_to_sys_path(self, test_files: List[str]) -> None:
        """For each test file, walk up out of its package (past __init__.py markers) and put the
        first non-package ancestor directory on sys.path so the file is importable."""
        seen_dirs = set()
        for path in test_files:
            dir = os.path.dirname(path)
            while os.path.exists(os.path.join(dir, "__init__.py")):
                dir = os.path.dirname(dir)
            if dir not in seen_dirs:
                sys.path.append(dir)
                seen_dirs.add(dir)


================================================
FILE: ducktape/tests/loggermaker.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging


class LoggerMaker(object):
    """This class helps ensure programmatically configured loggers are configured only once."""

    def __init__(self, logger_name: str) -> None:
        self.logger_name = logger_name

    @property
    def logger(self) -> logging.Logger:
        """Read-only logger attribute."""
        if not hasattr(self, "_logger"):
            self._logger = logging.getLogger(self.logger_name)
            if not self.configured:
                self.configure_logger()
        return self._logger

    @property
    def configured(self) -> bool:
        """Return True iff the logger has been configured.

        Since logging objects are global in the sense that logging.getLogger(self.logger_name)
        yields the same object, we assume it is already configured if it already has at least 1 handler.
""" return len(logging.getLogger(self.logger_name).handlers) > 0 def configure_logger(self): raise NotImplementedError("configure_logger property must be implemented by a subclass") def close_logger(logger): """Filehandles etc are not closed automatically, so close them here""" if logger is not None: handlers = logger.handlers[:] for handler in handlers: handler.close() logger.removeHandler(handler) ================================================ FILE: ducktape/tests/reporter.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import json import os import shutil import sys import xml.etree.ElementTree as ET from pathlib import Path import yaml from ducktape.json_serializable import DucktapeJSONEncoder from ducktape.tests.status import FAIL, FLAKY, IGNORE, PASS from ducktape.utils.terminal_size import get_terminal_size from ducktape.utils.util import ducktape_version DEFAULT_SEPARATOR_WIDTH = 100 def format_time(t): """Return human-readable interval of time. Assumes t is in units of seconds. 
""" minutes = int(t / 60) seconds = t % 60 r = "" if minutes > 0: r += "%d minute%s " % (minutes, "" if minutes == 1 else "s") r += "%.3f seconds" % seconds return r class SingleResultReporter(object): """Helper class for creating a view of results from a single test.""" def __init__(self, result): self.result = result self.width = get_terminal_size()[0] def result_string(self): """Stringify single result""" result_lines = [ "test_id: %s" % self.result.test_id, "status: %s" % str(self.result.test_status).upper(), "run time: %s" % format_time(self.result.run_time_seconds), ] if self.result.test_status == FAIL: # Add summary if the test failed result_lines.append("\n") result_lines.append(" " + self.result.summary) if self.result.data is not None: result_lines.append(json.dumps(self.result.data)) return "\n".join(result_lines) def report_string(self): """Get the whole report string.""" return "\n".join(["=" * self.width, self.result_string()]) class SingleResultFileReporter(SingleResultReporter): def report(self): self.width = DEFAULT_SEPARATOR_WIDTH report_file = os.path.join(self.result.results_dir, "report.txt") with open(report_file, "w") as fp: fp.write(self.report_string()) # write collected data if self.result.data is not None: data_file = os.path.join(self.result.results_dir, "data.json") with open(data_file, "w") as fp: fp.write(json.dumps(self.result.data)) class SummaryReporter(object): def __init__(self, results): self.results = results self.width = get_terminal_size()[0] def report(self): raise NotImplementedError("method report must be implemented by subclasses of SummaryReporter") class SimpleSummaryReporter(SummaryReporter): def header_string(self): """Header lines of the report""" header_lines = [ "=" * self.width, "SESSION REPORT (ALL TESTS)", "ducktape version: %s" % ducktape_version(), "session_id: %s" % self.results.session_context.session_id, "run time: %s" % format_time(self.results.run_time_seconds), "tests run: %d" % len(self.results), 
"passed: %d" % self.results.num_passed, "flaky: %d" % self.results.num_flaky, "failed: %d" % self.results.num_failed, "ignored: %d" % self.results.num_ignored, "=" * self.width, ] return "\n".join(header_lines) def report_string(self): """Get the whole report string.""" report_lines = [self.header_string()] report_lines.extend( [SingleResultReporter(result).result_string() + "\n" + "-" * self.width for result in self.results] ) return "\n".join(report_lines) class SimpleFileSummaryReporter(SimpleSummaryReporter): def report(self): self.width = DEFAULT_SEPARATOR_WIDTH report_file = os.path.join(self.results.session_context.results_dir, "report.txt") with open(report_file, "w") as fp: fp.write(self.report_string()) class SimpleStdoutSummaryReporter(SimpleSummaryReporter): def report(self): print(self.report_string()) class JSONReporter(object): def __init__(self, results): self.results = results def report(self): report_file = os.path.abspath(os.path.join(self.results.session_context.results_dir, "report.json")) with open(report_file, "w") as f: f.write( json.dumps( self.results, cls=DucktapeJSONEncoder, sort_keys=True, indent=2, separators=(",", ": "), ) ) class JUnitReporter(object): def __init__(self, results): self.results = results def report(self): report_file = os.path.abspath(os.path.join(self.results.session_context.results_dir, "report.xml")) testsuites = {} # First bucket by module_name and argregate counts for result in self.results: module_name = result.module_name testsuites.setdefault(module_name, {}) # Set default values testsuite = testsuites[module_name] testsuite.setdefault("tests", 0) testsuite.setdefault("skipped", 0) testsuite.setdefault("failures", 0) testsuite.setdefault("errors", 0) testsuite.setdefault("testcases", []).append(result) # Always increment total number of tests testsuite["tests"] += 1 if result.test_status == FAIL: testsuite["failures"] += 1 elif result.test_status == IGNORE: testsuite["skipped"] += 1 total = 
self.results.num_failed + self.results.num_ignored + self.results.num_passed + self.results.num_flaky # Now start building XML document root = ET.Element( "testsuites", attrib=dict( name="ducktape", time=str(self.results.run_time_seconds), tests=str(total), disabled="0", errors="0", failures=str(self.results.num_failed), ), ) for module_name, testsuite in testsuites.items(): xml_testsuite = ET.SubElement( root, "testsuite", attrib=dict( name=module_name, tests=str(testsuite["tests"]), disabled="0", errors="0", failures=str(testsuite["failures"]), skipped=str(testsuite["skipped"]), ), ) for test in testsuite["testcases"]: # Since we're already aware of module_name and cls_name, strip that prefix off full_name = "{module_name}.{cls_name}.".format(module_name=module_name, cls_name=test.cls_name) if test.test_id.startswith(full_name): name = test.test_id[len(full_name) :] else: name = test.test_id xml_testcase = ET.SubElement( xml_testsuite, "testcase", attrib=dict( name=name, classname=test.cls_name, time=str(test.run_time_seconds), status=str(test.test_status), assertions="", ), ) if test.test_status == FAIL: xml_failure = ET.SubElement( xml_testcase, "failure", attrib=dict(message=test.summary.splitlines()[0]), ) xml_failure.text = test.summary elif test.test_status == IGNORE: ET.SubElement(xml_testcase, "skipped") with open(report_file, "w") as f: content = ET.tostring(root) if isinstance(content, bytes): content = content.decode("utf-8") f.write(content) class HTMLSummaryReporter(SummaryReporter): def __init__(self, results, expected_test_count): super().__init__(results) self.expected_test_count = expected_test_count def format_test_name(self, result): lines = [ "Module: " + result.module_name, "Class: " + result.cls_name, "Method: " + result.function_name, f"Nodes (used/allocated): {result.nodes_used}/{result.nodes_allocated}", ] if result.injected_args is not None: lines.append("Arguments:") lines.append( json.dumps( result.injected_args, sort_keys=True, 
indent=2, separators=(",", ": "), ) ) return "\n".join(lines) def format_result(self, result): test_result = str(result.test_status).lower() result_json = { "test_name": self.format_test_name(result), "test_result": test_result, "description": result.description, "run_time": format_time(result.run_time_seconds), "data": "" if result.data is None else json.dumps(result.data, sort_keys=True, indent=2, separators=(",", ": ")), "summary": result.summary, "test_log": self.test_results_dir(result), } return result_json def test_results_dir(self, result): """Return *relative path* to test results directory. Path is relative to the base results_dir. Relative path behaves better if the results directory is copied, moved etc. """ base_dir = os.path.abspath(result.session_context.results_dir) base_dir = os.path.join(base_dir, "") # Ensure trailing directory indicator test_results_dir = os.path.abspath(result.results_dir) return test_results_dir[len(base_dir) :] # truncate the "absolute" portion def format_report(self): if sys.version_info >= (3, 9): import importlib.resources as importlib_resources template = importlib_resources.files("ducktape").joinpath("templates/report/report.html").read_text("utf-8") else: import pkg_resources template = pkg_resources.resource_string(__name__, "../templates/report/report.html").decode("utf-8") num_tests_run = len(self.results) num_passes = 0 failed_result_string = [] passed_result_string = [] ignored_result_string = [] flaky_result_string = [] for result in self.results: json_string = json.dumps(self.format_result(result)) if result.test_status == PASS: num_passes += 1 passed_result_string.append(json_string) passed_result_string.append(",") elif result.test_status == FAIL: failed_result_string.append(json_string) failed_result_string.append(",") elif result.test_status == IGNORE: ignored_result_string.append(json_string) ignored_result_string.append(",") elif result.test_status == FLAKY: flaky_result_string.append(json_string) 
flaky_result_string.append(",") else: raise Exception("Unknown test status in report: {}".format(result.test_status.to_json())) args = { "ducktape_version": ducktape_version(), "expected_test_count": self.expected_test_count, "num_tests_run": num_tests_run, "num_passes": self.results.num_passed, "num_flaky": self.results.num_flaky, "num_failures": self.results.num_failed, "num_ignored": self.results.num_ignored, "run_time": format_time(self.results.run_time_seconds), "session": self.results.session_context.session_id, "passed_tests": "".join(passed_result_string), "flaky_tests": "".join(flaky_result_string), "failed_tests": "".join(failed_result_string), "ignored_tests": "".join(ignored_result_string), "test_status_names": ",".join(f"'{status}'" for status in (PASS, FAIL, IGNORE, FLAKY)), } html = template % args report_html = os.path.join(self.results.session_context.results_dir, "report.html") with open(report_html, "w") as fp: fp.write(html) fp.close() report_css = os.path.join(self.results.session_context.results_dir, "report.css") if sys.version_info >= (3, 9): import importlib.resources as importlib_resources with importlib_resources.as_file( importlib_resources.files("ducktape") / "templates/report/report.css" ) as report_css_origin: shutil.copy2(report_css_origin, report_css) else: import pkg_resources report_css_origin = pkg_resources.resource_filename(__name__, "../templates/report/report.css") shutil.copy2(report_css_origin, report_css) def report(self): self.format_report() class FailedTestSymbolReporter(SummaryReporter): def __init__(self, results): super().__init__(results) self.working_dir = Path().absolute() self.separator = "=" * self.width def to_symbol(self, result): p = Path(result.file_name).relative_to(self.working_dir) line = f"{p}::{result.cls_name}.{result.function_name}" if result.injected_args: injected_args_str = json.dumps(result.injected_args, separators=(",", ":")) line += f"@{injected_args_str}" return line def dump_test_suite(self, 
lines): print(self.separator) print("FAILED TEST SUITE") suite = {self.results.session_context.session_id: lines} file_path = Path(self.results.session_context.results_dir) / "rerun-failed.yml" with file_path.open("w") as fp: print(f"Test suite to rerun failed tests: {file_path}") yaml.dump(suite, stream=fp, indent=4) def print_test_symbols_string(self, lines): print(self.separator) print("FAILED TEST SYMBOLS") print("Pass the test symbols below to your ducktape run") # quote the symbol because json parameters will be processed by shell otherwise, making it not copy-pasteable print(" ".join([f"'{line}'" for line in lines])) def report(self): symbols = [self.to_symbol(result) for result in self.results if result.test_status == FAIL] if not symbols: return self.dump_test_suite(symbols) self.print_test_symbols_string(symbols) ================================================ FILE: ducktape/tests/result.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import json
import os
import time
from typing import List

from ducktape.cluster.cluster import Cluster
from ducktape.cluster.vagrant import VagrantCluster
from ducktape.json_serializable import DucktapeJSONEncoder
from ducktape.tests.reporter import SingleResultFileReporter
from ducktape.tests.session import SessionContext
from ducktape.tests.status import FAIL, FLAKY, IGNORE, PASS
from ducktape.tests.test_context import TestContext
from ducktape.utils.local_filesystem_utils import mkdir_p
from ducktape.utils.util import ducktape_version


class TestResult(object):
    """Wrapper class for a single result returned by a single test."""

    def __init__(
        self,
        test_context,
        test_index,
        session_context,
        test_status=PASS,
        summary="",
        data=None,
        start_time=-1,
        stop_time=-1,
    ):
        """
        @param test_context  standard test context object
        @param test_index    index of this run of the test (used to name the results directory)
        @param session_context  context for the enclosing test session
        @param test_status   did the test pass or fail, etc?
        @param summary       summary information
        @param data          data returned by the test, e.g. throughput
        @param start_time    epoch seconds when the test started (-1 if unknown)
        @param stop_time     epoch seconds when the test stopped (-1 if still running/unknown)
        """
        self.nodes_allocated = len(test_context.cluster)
        self.nodes_used = test_context.cluster.max_used_nodes
        # Collect node_type from cluster metadata if available
        self.node_type = test_context.cluster_use_metadata.get("node_type")
        if hasattr(test_context, "services"):
            self.services = test_context.services.to_json()
        else:
            self.services = {}
        self.test_id = test_context.test_id
        self.module_name = test_context.module_name
        self.cls_name = test_context.cls_name
        self.function_name = test_context.function_name
        self.injected_args = test_context.injected_args
        self.description = test_context.description
        self.results_dir = TestContext.results_dir(test_context, test_index)
        self.test_index = test_index
        self.session_context = session_context
        self.test_status = test_status
        self.summary = summary
        self.data = data
        self.file_name = test_context.file
        self.base_results_dir = session_context.results_dir
        # normalize both dirs with a trailing separator so the prefix-strip below is exact
        if not self.results_dir.endswith(os.path.sep):
            self.results_dir += os.path.sep
        if not self.base_results_dir.endswith(os.path.sep):
            self.base_results_dir += os.path.sep
        assert self.results_dir.startswith(self.base_results_dir)
        self.relative_results_dir = self.results_dir[len(self.base_results_dir):]
        # For tracking run time
        self.start_time = start_time
        self.stop_time = stop_time

    def __repr__(self):
        return "<%s - test_status:%s, data:%s>" % (
            self.__class__.__name__,
            self.test_status,
            str(self.data),
        )

    @property
    def run_time_seconds(self):
        """Elapsed run time in seconds; -1 if the test never started, time-so-far if still running."""
        if self.start_time < 0:
            return -1
        if self.stop_time < 0:
            return time.time() - self.start_time
        return self.stop_time - self.start_time

    def report(self):
        """Write this result's report.json and report.txt into its results directory."""
        if not os.path.exists(self.results_dir):
            mkdir_p(self.results_dir)
        self.dump_json()
        test_reporter = SingleResultFileReporter(self)
        test_reporter.report()

    def dump_json(self):
        """Dump this object as json to the given location.

        By default, dump into self.results_dir/report.json"""
        with open(os.path.join(self.results_dir, "report.json"), "w") as fd:
            json.dump(self, fd, cls=DucktapeJSONEncoder, sort_keys=True, indent=2)

    def to_json(self):
        result = {
            "test_id": self.test_id,
            "module_name": self.module_name,
            "cls_name": self.cls_name,
            "function_name": self.function_name,
            "injected_args": self.injected_args,
            "description": self.description,
            "results_dir": self.results_dir,
            "relative_results_dir": self.relative_results_dir,
            "base_results_dir": self.base_results_dir,
            "test_status": self.test_status,
            "summary": self.summary,
            "data": self.data,
            "start_time": self.start_time,
            "stop_time": self.stop_time,
            "run_time_seconds": self.run_time_seconds,
            "nodes_allocated": self.nodes_allocated,
            "nodes_used": self.nodes_used,
            "services": self.services,
        }
        # Only include node_type if it was specified in the @cluster decorator
        if self.node_type is not None:
            result["node_type"] = self.node_type
        return result


class TestResults(object):
    """Class used to aggregate individual TestResult objects from many tests."""

    def __init__(self, session_context: SessionContext, cluster: Cluster, client_status: dict) -> None:
        """
        :param session_context: context for the test session
        :param cluster: the cluster the session ran against (any Cluster implementation)
        :param client_status: mutable mapping of per-client status dicts, shared with the runner

        :type session_context: ducktape.tests.session.SessionContext
        """
        self._results: List[TestResult] = []
        self.session_context: SessionContext = session_context
        self.cluster: Cluster = cluster
        # For tracking total run time
        self.start_time: int = -1
        self.stop_time: int = -1
        self.client_status = client_status

    def append(self, obj: TestResult):
        return self._results.append(obj)

    def __len__(self):
        return len(self._results)

    def __iter__(self):
        return iter(self._results)

    @property
    def num_passed(self):
        return len([r for r in self._results if r.test_status == PASS])

    @property
    def num_failed(self):
        return len([r for r in self._results if r.test_status == FAIL])

    @property
    def num_ignored(self):
        return len([r for r in self._results if r.test_status == IGNORE])

    @property
    def num_flaky(self):
        return len([r for r in self._results if r.test_status == FLAKY])

    @property
    def run_time_seconds(self):
        """Total session run time; -1 if never started.

        NOTE(review): side effect — if stop_time is unset, reading this property freezes it to now.
        """
        if self.start_time < 0:
            return -1
        if self.stop_time < 0:
            self.stop_time = time.time()
        return self.stop_time - self.start_time

    def get_aggregate_success(self):
        """Check cumulative success of all tests run so far

        :rtype: bool
        """
        for result in self._results:
            if result.test_status == FAIL:
                return False
        return True

    def _stats(self, num_list):
        # mean/min/max summary; all None for an empty list
        if len(num_list) == 0:
            return {"mean": None, "min": None, "max": None}
        return {
            "mean": sum(num_list) / float(len(num_list)),
            "min": min(num_list),
            "max": max(num_list),
        }

    def to_json(self):
        if self.run_time_seconds == 0 or len(self.cluster) == 0:
            # If things go horribly wrong, the test run may be effectively instantaneous
            # Let's handle this case gracefully, and avoid divide-by-zero
            cluster_utilization = 0
            parallelism = 0
        else:
            # fraction of node-seconds actually used out of (cluster size * wall-clock time)
            cluster_utilization = (
                (1.0 / len(self.cluster))
                * (1.0 / self.run_time_seconds)
                * sum([r.nodes_used * r.run_time_seconds for r in self])
            )
            parallelism = sum([r.run_time_seconds for r in self._results]) / self.run_time_seconds
        result = {
            "ducktape_version": ducktape_version(),
            "session_context": self.session_context,
            "run_time_seconds": self.run_time_seconds,
            "start_time": self.start_time,
            "stop_time": self.stop_time,
            "run_time_statistics": self._stats([r.run_time_seconds for r in self]),
            "cluster_nodes_used": self._stats([r.nodes_used for r in self]),
            "cluster_nodes_allocated": self._stats([r.nodes_allocated for r in self]),
            "cluster_utilization": cluster_utilization,
            "cluster_num_nodes": len(self.cluster),
            "num_passed": self.num_passed,
            "num_failed": self.num_failed,
            "num_ignored": self.num_ignored,
            "parallelism": parallelism,
            "client_status": {str(key): value for key, value in self.client_status.items()},
            "results": [r for r in self._results],
        }
        # keep num_flaky out of the report entirely when there were no flaky tests
        if self.num_flaky:
            result["num_flaky"] = self.num_flaky
        return result



================================================
FILE: ducktape/tests/runner.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from collections import namedtuple, defaultdict
import copy
import logging
import multiprocessing
import os
import signal
import time
import traceback
from typing import Dict, List

import zmq

from ducktape.cluster.finite_subcluster import FiniteSubcluster
from ducktape.command_line.defaults import ConsoleDefaults
from ducktape.cluster.vagrant import VagrantCluster
from ducktape.cluster.node_container import InsufficientResourcesError
from ducktape.tests.scheduler import TestScheduler
from ducktape.tests.result import FAIL, TestResult
from ducktape.tests.reporter import (
    SimpleFileSummaryReporter,
    HTMLSummaryReporter,
    JSONReporter,
    JUnitReporter,
)
from ducktape.utils import persistence
from ducktape.errors import TimeoutError
from ducktape.tests.event import ClientEventFactory, EventResponseFactory
from ducktape.tests.result import TestResults
from ducktape.tests.runner_client import run_client
from ducktape.tests.serde import SerDe
from ducktape.tests.session import SessionContext
from ducktape.tests.test_context import TestContext
from ducktape.utils.terminal_size import get_terminal_size

# how long (seconds) the runner waits for a client process to exit before force-killing it
DEFAULT_MP_JOIN_TIMEOUT = 30
DEFAULT_TIMEOUT_EXCEPTION_JOIN_TIMEOUT = 120


class Receiver(object):
    """Server side of the runner<->client channel: a ZeroMQ REP socket bound to a random port."""

    def __init__(self, min_port: int, max_port: int) -> None:
        assert min_port <= max_port, "Expected min_port <= max_port, but instead: min_port: %s, max_port %s" % (
            min_port,
            max_port,
        )
        # set by start() once the socket is bound
        self.port = None
        self.min_port = min_port
        self.max_port = max_port
        self.serde = SerDe()
        self.zmq_context = zmq.Context()
        self.socket = self.zmq_context.socket(zmq.REP)

    def start(self):
        """Bind to a random port in the range [self.min_port, self.max_port], inclusive"""
        # note: bind_to_random_port may retry the same port multiple times
        self.port = self.socket.bind_to_random_port(
            addr="tcp://*",
            min_port=self.min_port,
            max_port=self.max_port + 1,
            max_tries=2 * (self.max_port + 1 - self.min_port),
        )

    def recv(self, timeout=1800000):
        """Receive and deserialize one message; raise TimeoutError after `timeout` milliseconds.

        :param timeout: receive timeout in milliseconds; None means the 30-minute default
        """
        if timeout is None:
            # use default value of 1800000 or 30 minutes
            timeout = 1800000
        self.socket.RCVTIMEO = timeout
        try:
            message = self.socket.recv()
        except zmq.Again:
            # zmq.Again signals the receive timed out with no message
            raise TimeoutError("runner client unresponsive")
        return self.serde.deserialize(message)

    def send(self, event):
        self.socket.send(self.serde.serialize(event))

    def close(self):
        # LINGER=0: drop any unsent messages instead of blocking on close
        self.socket.setsockopt(zmq.LINGER, 0)
        self.socket.close()


# A test run is identified by (test_id, test_index): the same test may run multiple times (deflaking)
TestKey = namedtuple("TestKey", ["test_id", "test_index"])


class TestRunner(object):
    """Drives a test session: schedules tests, forks client processes, and collects their results."""

    # When set to True, the test runner will finish running/cleaning the current test, but it will not run any more
    stop_testing = False

    def __init__(
        self,
        cluster: VagrantCluster,
        session_context: SessionContext,
        session_logger: logging.Logger,
        tests: List[TestContext],
        deflake_num: int,
        min_port: int = ConsoleDefaults.TEST_DRIVER_MIN_PORT,
        max_port: int = ConsoleDefaults.TEST_DRIVER_MAX_PORT,
        finish_join_timeout: int = DEFAULT_MP_JOIN_TIMEOUT,
        timeout_exception_join_timeout: int = DEFAULT_TIMEOUT_EXCEPTION_JOIN_TIMEOUT,
    ) -> None:
        # NOTE(review): the cluster annotation says VagrantCluster, but only generic Cluster
        # behavior appears to be used here — confirm and consider widening the hint.
        # Set handler for SIGTERM (aka kill -15)
        # Note: it doesn't work to set a handler for SIGINT (Ctrl-C) in this parent process because the
        # handler is inherited by all forked child processes, and it prevents the default python behavior
        # of translating SIGINT into a KeyboardInterrupt exception
        signal.signal(signal.SIGTERM, self._propagate_sigterm)
        # session_logger, message logger,
        self.session_logger = session_logger
        self.cluster = cluster
        self.event_response = EventResponseFactory()
        self.hostname = "localhost"
        self.receiver = Receiver(min_port, max_port)
        # number of times a failing test is retried before being reported FAIL (vs FLAKY)
        self.deflake_num = deflake_num
        self.session_context = session_context
        self.max_parallel = session_context.max_parallel
        # per-TestKey status dicts, shared with TestResults for the final report
        self.client_report = defaultdict(dict)
        self.results = TestResults(self.session_context, self.cluster, client_status=self.client_report)
        self.exit_first = self.session_context.exit_first
        # recorded so the SIGTERM handler can tell parent from forked children
        self.main_process_pid = os.getpid()
        self.scheduler = TestScheduler(tests, self.cluster)
        self.test_counter = 1
        self.total_tests = len(self.scheduler)
        # This immutable dict tracks test_id -> test_context
        self._test_context = persistence.make_dict(**{t.test_id: t for t in tests})
        self._test_cluster: Dict[TestKey, FiniteSubcluster] = {}  # Track subcluster assigned to a particular TestKey
        self._client_procs: Dict[TestKey, multiprocessing.Process] = {}  # track client processes running tests
        self.active_tests: Dict[TestKey, bool] = {}
        self.finished_tests: Dict[TestKey, dict] = {}
        self.test_schedule_log: List[TestKey] = []
        self.finish_join_timeout: int = finish_join_timeout
        self.timeout_exception_join_timeout: int = timeout_exception_join_timeout

    def _terminate_process(self, process: multiprocessing.Process):
        # use os.kill rather than multiprocessing.terminate for more control
        # This is called after SIGTERM was sent and process didn't exit, so escalate to SIGKILL
        assert process.pid != os.getpid(), "Signal handler should not reach this point in a client subprocess."
        assert process.pid is not None, "Process has no pid, cannot terminate."
        if process.is_alive():
            self._log(logging.WARNING, f"Process {process.name} did not respond to SIGTERM, escalating to SIGKILL")
            os.kill(process.pid, signal.SIGKILL)

    def _join_test_process(self, process_key, timeout: int = DEFAULT_MP_JOIN_TIMEOUT):
        # waits for process to complete, if it doesn't terminate it
        process: multiprocessing.Process = self._client_procs[process_key]
        start = time.time()
        # poll rather than join() so we can record FINISHED status as soon as the process exits
        while time.time() - start <= timeout:
            if not process.is_alive():
                self.client_report[process_key]["status"] = "FINISHED"
                break
            time.sleep(0.1)
        else:
            # Note: This can lead to some tmp files being uncleaned, otherwise nothing else should be executed by the
            # client after this point.
            self._log(
                logging.ERROR, f"after waiting {timeout}s, process {process.name} failed to complete. Terminating..."
            )
            self._terminate_process(process)
            self.client_report[process_key]["status"] = "TERMINATED"
        process.join()
        self.client_report[process_key]["exitcode"] = process.exitcode
        self.client_report[process_key]["runner_end_time"] = time.time()
        assert not process.is_alive()
        del self._client_procs[process_key]

    def _propagate_sigterm(self, signum, frame):
        """Handle SIGTERM and SIGINT by propagating SIGTERM to all client processes.

        Note that multiprocessing processes are in the same process group as the main process, so Ctrl-C
        will result in SIGINT being propagated to all client processes automatically.
        This may result in multiple SIGTERM signals getting sent to client processes in quick succession.

        However, it is possible that the main process (and not the process group) receives a SIGINT or SIGTERM
        directly. Propagating SIGTERM to client processes is necessary in this case.
        """
        if os.getpid() != self.main_process_pid:
            # since we're using the multiprocessing module to create client processes,
            # this signal handler is also attached client processes, so we only want to propagate TERM signals
            # if this process *is* the main runner server process
            return
        self.stop_testing = True
        for p in self._client_procs.values():
            self._terminate_process(p)

    def who_am_i(self):
        """Human-readable name helpful for logging."""
        return self.__class__.__name__

    @property
    def _ready_to_trigger_more_tests(self):
        """Should we pull another test from the scheduler?"""
        return (
            not self.stop_testing
            and len(self.active_tests) < self.max_parallel
            and self.scheduler.peek() is not None
        )

    @property
    def _expect_client_requests(self):
        # as long as any test is active, its client may still send events
        return len(self.active_tests) > 0

    def _report_unschedulable(self, unschedulable, err_msg=None):
        """Record a FAIL result for every test that can never be scheduled on this cluster.

        :param unschedulable: test contexts whose resource needs exceed the whole cluster
        :param err_msg: optional override for the per-test failure message
        """
        if not unschedulable:
            return
        self._log(
            logging.ERROR,
            f"There are {len(unschedulable)} tests which cannot be run due to insufficient cluster resources",
        )
        for tc in unschedulable:
            if err_msg:
                msg = err_msg
            else:
                msg = (
                    f"Test {tc.test_id} requires more resources than are available in the whole cluster. "
                    f"{self.cluster.all().nodes.attempt_remove_spec(tc.expected_cluster_spec)}"
                )
            self._log(logging.ERROR, msg)
            # synthesize a zero-duration FAIL result so the test still appears in reports
            result = TestResult(
                tc,
                self.test_counter,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=time.time(),
                stop_time=time.time(),
            )
            self.results.append(result)
            result.report()
            self.test_counter += 1

    def _check_unschedulable(self):
        # fail-fast for tests that no amount of waiting could ever schedule
        self._report_unschedulable(self.scheduler.filter_unschedulable_tests())

    def _report_remaining_as_failed(self, reason):
        """Mark all remaining tests in the scheduler as failed with the given reason."""
        remaining_tests = self.scheduler.drain_remaining_tests()
        if not remaining_tests:
            return
        self._log(
            logging.ERROR,
            f"Marking {len(remaining_tests)} remaining tests as failed: {reason}",
        )
        for tc in remaining_tests:
            msg = f"Test not run: {reason}"
            self._log(logging.ERROR, f"{tc.test_id}: {msg}")
            result = TestResult(
                tc,
                self.test_counter,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=time.time(),
                stop_time=time.time(),
            )
            self.results.append(result)
            result.report()
            self.test_counter += 1

    def _report_active_as_failed(self, reason):
        """Mark all currently active/running tests as failed with the given reason."""
        active_test_keys = list(self.active_tests.keys())
        if not active_test_keys:
            return
        self._log(
            logging.ERROR,
            f"Marking {len(active_test_keys)} active tests as failed: {reason}",
        )
        stop_time = time.time()
        for test_key in active_test_keys:
            if hasattr(self, "_test_cluster") and test_key in self._test_cluster:
                subcluster = self._test_cluster[test_key]
                # Return nodes to the cluster and remove tracking entry
                self.cluster.free(subcluster.nodes)
                del self._test_cluster[test_key]
            tc = self._test_context[test_key.test_id]
            msg = f"Test timed out: {reason}"
            self._log(logging.ERROR, f"{tc.test_id}: {msg}")
            # fall back to stop_time if the client never reported a start time
            start_time = self.client_report[test_key].get("runner_start_time", stop_time)
            result = TestResult(
                tc,
                test_key.test_index,
                self.session_context,
                test_status=FAIL,
                summary=msg,
                start_time=start_time,
stop_time=stop_time, ) self.results.append(result) result.report() # Clear active tests since we've reported them all as failed self.active_tests.clear() def run_all_tests(self): self.receiver.start() self.results.start_time = time.time() # Report tests which cannot be run self._check_unschedulable() # Run the tests! self._log( logging.INFO, "starting test run with session id %s..." % self.session_context.session_id, ) self._log(logging.INFO, "running %d tests..." % len(self.scheduler)) while self._ready_to_trigger_more_tests or self._expect_client_requests: try: while self._ready_to_trigger_more_tests: next_test_context = self.scheduler.peek() try: self._preallocate_subcluster(next_test_context) except InsufficientResourcesError: # We were not able to allocate the subcluster for this test, # this means not enough nodes passed health check. # Don't mark this test as failed just yet, some other test might finish running and # free up healthy nodes. # However, if some nodes failed, cluster size changed too, so we need to check if # there are any tests that can no longer be scheduled. self._log( logging.INFO, f"Couldn't schedule test context {next_test_context} but we'll keep trying", exc_info=True, ) self._check_unschedulable() else: # only remove the test from the scheduler once we've successfully allocated a subcluster for it self.scheduler.remove(next_test_context) self._run_single_test(next_test_context) if self._expect_client_requests: try: event = self.receiver.recv(timeout=self.session_context.test_runner_timeout) self._handle(event) except TimeoutError as e: # Handle timeout gracefully - clean up, mark tests as failed, and return results err_str = "Timeout error while receiving message: %s: %s" % ( str(type(e)), str(e), ) err_str += "\n" + traceback.format_exc(limit=16) self._log(logging.ERROR, err_str) # Send SIGTERM to all client processes immediately to allow graceful cleanup # (copy logs, run teardown, etc.) 
for process_key in list(self._client_procs.keys()): proc = self._client_procs[process_key] if proc.is_alive(): self._log(logging.INFO, f"Sending SIGTERM to process {proc.name} for graceful shutdown") os.kill(proc.pid, signal.SIGTERM) # Wait for processes to shutdown gracefully (in parallel), escalate to SIGKILL if needed for process_key in list(self._client_procs.keys()): self._join_test_process(process_key, self.timeout_exception_join_timeout) self._client_procs = {} # Mark active tests as failed with the exception message self._report_active_as_failed(str(e)) # Mark all remaining tests as failed with the exception message self._report_remaining_as_failed(str(e)) self.receiver.close() return self.results except Exception as e: # For all other exceptions, clean up and re-raise err_str = "Exception receiving message: %s: %s" % ( str(type(e)), str(e), ) err_str += "\n" + traceback.format_exc(limit=16) self._log(logging.ERROR, err_str) for proc in list(self._client_procs): self._join_test_process(proc, self.finish_join_timeout) self._client_procs = {} raise except KeyboardInterrupt: # If SIGINT is received, stop triggering new tests, and let the currently running tests finish self._log( logging.INFO, "Received KeyboardInterrupt. Now waiting for currently running tests to finish...", ) self.stop_testing = True # All clients should be cleaned up in their finish block if self._client_procs: self._log(logging.WARNING, f"Some clients failed to clean up, waiting 10min to join: {self._client_procs}") for proc in list(self._client_procs): self._join_test_process(proc, self.finish_join_timeout) self.receiver.close() return self.results def _run_single_test(self, test_context): """Start a test runner client in a subprocess""" current_test_counter = self.test_counter self.test_counter += 1 self._log( logging.INFO, "Triggering test %d of %d..." 
% (current_test_counter, self.total_tests), ) # Test is considered "active" as soon as we start it up in a subprocess test_key = TestKey(test_context.test_id, current_test_counter) self.active_tests[test_key] = True self.test_schedule_log.append(test_key) proc = multiprocessing.Process( target=run_client, args=[ self.hostname, self.receiver.port, test_context.test_id, current_test_counter, TestContext.logger_name(test_context, current_test_counter), TestContext.results_dir(test_context, current_test_counter), self.session_context.debug, self.session_context.fail_bad_cluster_utilization, self.deflake_num, ], ) self._client_procs[test_key] = proc proc.start() self.client_report[test_key]["status"] = "RUNNING" self.client_report[test_key]["pid"] = proc.pid self.client_report[test_key]["name"] = proc.name self.client_report[test_key]["runner_start_time"] = time.time() def _preallocate_subcluster(self, test_context): """Preallocate the subcluster which will be used to run the test. Side effect: store association between the test_id and the preallocated subcluster. :param test_context :return None """ allocated = self.cluster.alloc(test_context.expected_cluster_spec) if len(self.cluster.available()) == 0 and self.max_parallel > 1 and not self._test_cluster: self._log( logging.WARNING, "Test %s is using entire cluster. It's possible this test has no associated cluster metadata." 
% test_context.test_id, ) self._test_cluster[TestKey(test_context.test_id, self.test_counter)] = FiniteSubcluster(allocated) def _handle(self, event): self._log(logging.DEBUG, str(event)) if event["event_type"] == ClientEventFactory.READY: self._handle_ready(event) elif event["event_type"] in [ ClientEventFactory.RUNNING, ClientEventFactory.SETTING_UP, ClientEventFactory.TEARING_DOWN, ]: self._handle_lifecycle(event) elif event["event_type"] == ClientEventFactory.FINISHED: self._handle_finished(event) elif event["event_type"] == ClientEventFactory.LOG: self._handle_log(event) else: raise RuntimeError("Received event with unknown event type: " + str(event)) def _handle_ready(self, event): test_key = TestKey(event["test_id"], event["test_index"]) test_context = self._test_context[event["test_id"]] subcluster = self._test_cluster[test_key] self.receiver.send(self.event_response.ready(event, self.session_context, test_context, subcluster)) def _handle_log(self, event): self.receiver.send(self.event_response.log(event)) self._log(event["log_level"], event["message"]) def _handle_finished(self, event): test_key = TestKey(event["test_id"], event["test_index"]) self.receiver.send(self.event_response.finished(event)) # Idempotency: check if this is a ZMQ retry (test already finished) if test_key in self.finished_tests: self._log(logging.DEBUG, f"Received duplicate FINISHED for {test_key} (ZMQ retry), ignoring") return # Normal case: test is currently active if test_key not in self.active_tests: # This should never happen - indicates a logic bug raise RuntimeError( f"Received FINISHED for test {test_key} which is not in active_tests. " f"This indicates a bug in the test runner state management." 
) result = event["result"] if result.test_status == FAIL and self.exit_first: self.stop_testing = True # Transition this test from running to finished del self.active_tests[test_key] self.finished_tests[test_key] = event self.results.append(result) # Free nodes used by the test subcluster = self._test_cluster[test_key] self.cluster.free(subcluster.nodes) del self._test_cluster[test_key] # Join on the finished test process self._join_test_process(test_key, timeout=self.finish_join_timeout) # Report partial result summaries - it is helpful to have partial test reports available if the # ducktape process is killed with a SIGKILL partway through test_results = copy.copy(self.results) # shallow copy reporters = [ SimpleFileSummaryReporter(test_results), HTMLSummaryReporter(test_results, self.total_tests), JSONReporter(test_results), JUnitReporter(test_results), ] for r in reporters: r.report() if self._should_print_separator: terminal_width, y = get_terminal_size() self._log(logging.INFO, "~" * int(2 * terminal_width / 3)) @property def _should_print_separator(self): """The separator is the twiddle that goes in between tests on stdout. This only makes sense to print if tests are run sequentially (aka max_parallel = 1) since output from tests is interleaved otherwise. Also, we don't want to print the separator after the last test output has been received, so we check that there's more test output expected. """ return self.session_context.max_parallel == 1 and ( self._expect_client_requests or self._ready_to_trigger_more_tests ) def _handle_lifecycle(self, event): self.receiver.send(self.event_response._event_response(event)) def _log(self, log_level, msg, *args, **kwargs): """Log to the service log of the current test.""" self.session_logger.log(log_level, msg, *args, **kwargs) ================================================ FILE: ducktape/tests/runner_client.py ================================================ # Copyright 2016 Confluent Inc. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import signal
import threading
import time
import traceback
from collections import defaultdict
from typing import List, Mapping, Optional, Tuple

import zmq
import psutil

from ducktape.services.service import MultiRunServiceIdFactory, service_id_factory
from ducktape.services.service_registry import ServiceRegistry
from ducktape.tests.event import ClientEventFactory
from ducktape.tests.loader import TestLoader
from ducktape.tests.result import FAIL, IGNORE, PASS, TestResult
from ducktape.tests.serde import SerDe
from ducktape.tests.status import FLAKY, TestStatus
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext, test_logger
from ducktape.utils.local_filesystem_utils import mkdir_p


def run_client(*args, **kwargs):
    """Subprocess entry point: construct a RunnerClient, handshake with the driver, run the test."""
    client = RunnerClient(*args, **kwargs)
    client.ready()
    client.run()


class Sender(object):
    """Reliable REQ-side ZMQ sender: retries with exponential backoff and socket reinit on timeout."""

    REQUEST_TIMEOUT_MS = 3000
    NUM_RETRIES = 5
    BACKOFF_MULTIPLIER = 2.0  # Exponential backoff multiplier

    serde: SerDe
    message_supplier: ClientEventFactory
    server_endpoint: str
    zmq_context: zmq.Context
    socket: Optional[zmq.Socket]
    poller: zmq.Poller

    def __init__(
        self,
        server_host: str,
        server_port: str,
        message_supplier: ClientEventFactory,
        logger: logging.Logger,
    ):
        self.serde = SerDe()
        self.server_endpoint = "tcp://%s:%s" % (str(server_host), str(server_port))
        self.zmq_context = zmq.Context()
        self.socket = None
        self.poller = zmq.Poller()
        self.message_supplier = message_supplier
        self.logger = logger
        self._init_socket()

    def _init_socket(self):
        # A REQ socket must be recreated after a missed reply; see send() retry path.
        self.socket = self.zmq_context.socket(zmq.REQ)
        self.socket.connect(self.server_endpoint)
        self.poller.register(self.socket, zmq.POLLIN)

    def send(self, event, blocking=True):
        """Send the event and wait for the driver's reply, retrying on timeout.

        :param event: the event payload to serialize and send
        :param blocking: NOTE(review): currently unused -- confirm intent before removing
        :return: the deserialized reply
        :raises RuntimeError: when all retries are exhausted without a reply
        """
        retries_left = Sender.NUM_RETRIES
        current_timeout_ms = Sender.REQUEST_TIMEOUT_MS
        while retries_left > 0:
            serialized_event = self.serde.serialize(event)
            self.socket.send(serialized_event)
            retries_left -= 1
            waiting_for_reply = True
            while waiting_for_reply:
                sockets = dict(self.poller.poll(current_timeout_ms))
                if sockets.get(self.socket) == zmq.POLLIN:
                    reply = self.socket.recv()
                    if reply:
                        return self.serde.deserialize(reply)
                    else:
                        # send another request...
                        break
                else:
                    # Timed out: discard the stuck REQ socket and create a fresh one
                    self.close()
                    self._init_socket()
                    waiting_for_reply = False
            # Apply exponential backoff for next retry
            current_timeout_ms = int(current_timeout_ms * Sender.BACKOFF_MULTIPLIER)
            # Ensure each message we attempt to send has a unique id
            # This copy constructor gives us a duplicate with a new message id
            event = self.message_supplier.copy(event)
        raise RuntimeError("Unable to receive response from driver")

    def close(self):
        # LINGER 0: drop any unsent messages instead of blocking on close
        self.socket.setsockopt(zmq.LINGER, 0)
        self.socket.close()
        self.poller.unregister(self.socket)


class RunnerClient(object):
    """Run a single test"""

    serde: SerDe
    logger: logging.Logger
    runner_port: int
    message: ClientEventFactory
    sender: Sender
    test_id: str
    test_index: int
    id: str
    test: Optional[Test]
    test_context: Optional[TestContext]
    all_services: Optional[ServiceRegistry]
    # configs
    fail_bad_cluster_utilization: bool
    deflake_num: int

    def __init__(
        self,
        server_hostname: str,
        server_port: int,
        test_id: str,
        test_index: int,
        logger_name: str,
        log_dir: str,
        debug: bool,
        fail_bad_cluster_utilization: bool,
        deflake_num: int,
    ):
        signal.signal(signal.SIGTERM, self._sigterm_handler)  # register a SIGTERM handler
        self.serde = SerDe()
        self.logger = test_logger(logger_name, log_dir, debug)
        self.runner_port = server_port
        self.fail_bad_cluster_utilization = fail_bad_cluster_utilization
        self.test_id = test_id
        self.test_index = test_index
        self.id = "test-runner-%d-%d" % (os.getpid(), id(self))
        self.message = ClientEventFactory(self.test_id, self.test_index, self.id)
        self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger)
        self.deflake_num = deflake_num

        # Wait to instantiate the test object until running the test
        self.test = None
        self.test_context = None
        self.all_services = None
        self.runner_shutting_down = False

    @property
    def deflake_enabled(self) -> bool:
        # deflake_num is the total number of attempts; >1 means retries are allowed
        return self.deflake_num > 1

    def ready(self):
        """Handshake with the driver: announce READY and receive session/test/cluster state."""
        ready_reply = self.sender.send(self.message.ready())
        self.session_context = ready_reply["session_context"]
        self.test_metadata = ready_reply["test_metadata"]
        self.cluster = ready_reply["cluster"]

    def send(self, event):
        # Don't attempt to send messages if the runner is shutting down
        # The driver is no longer listening and will timeout anyway
        if self.runner_shutting_down:
            return None
        return self.sender.send(event)

    def _kill_all_child_processes(self, send_signal=signal.SIGTERM):
        current_process = psutil.Process()
        # if this client has any children - kill them (for instances background service)
        children = current_process.children(recursive=True)
        for child in children:
            self.logger.warning(f"process {repr(child)} did not terminate on its own, killing with {send_signal}")
            child.send_signal(send_signal)

    def _sigterm_handler(self, signum, frame):
        """Translate SIGTERM to SIGINT on this process

        python will treat SIGINT as a Keyboard exception. Exception handling does the rest.
        """
        self.runner_shutting_down = True
        self.logger.warning("Received SIGTERM, sending SIGINT to self and all child processes")
        self._kill_all_child_processes(signal.SIGINT)
        os.kill(os.getpid(), signal.SIGINT)  # This will send SIGINT to the current process

    def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args):
        """Re-discover the single test this client was asked to run and return its context."""
        loader = TestLoader(
            self.session_context,
            self.logger,
            injected_args=injected_args,
            cluster=self.cluster,
        )

        # TODO: deal with this in a more graceful fashion.
        # In the unlikely event that discover either raises the exception or fails to find exactly one test
        # we should probably continue trying other tests rather than killing this process
        loaded_context_list = loader.discover(directory, file_name, cls_name, method_name)
        assert len(loaded_context_list) == 1
        test_context = loaded_context_list[0]
        test_context.cluster = self.cluster
        return test_context

    def run(self):
        """Run the test (with deflake retries if enabled), then report the result to the driver."""
        self.log(logging.INFO, "Loading test %s" % str(self.test_metadata))
        self.test_context = self._collect_test_context(**self.test_metadata)
        self.test_context.test_index = self.test_index

        self.send(self.message.running())
        if self.test_context.ignore:
            # Skip running this test, but keep track of the fact that we ignored it
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status=IGNORE,
                start_time=time.time(),
                stop_time=time.time(),
            )
            result.report()
            # Tell the server we are finished
            self.send(self.message.finished(result=result))
            return

        start_time = -1
        stop_time = -1
        test_status = FAIL
        data = None
        self.all_services = ServiceRegistry(enable_jvm_logs=self.session_context.enable_jvm_logs)
        summaries = []
        num_runs = 0
        try:
            # Retry while failing, up to deflake_num attempts total
            while test_status == FAIL and num_runs < self.deflake_num:
                num_runs += 1
                self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num))
                start_time = time.time()
                test_status, run_summary, data = self._do_run(num_runs)
                if run_summary:
                    summaries.append(run_summary)

                # dump threads after the test is complete;
                # if any thread is not terminated correctly by the test we'll see it here
                self.dump_threads(f"Threads after {self.test_id} finished")

                # if run passed, and not on the first run, the test is flaky
                if test_status == PASS and num_runs > 1:
                    test_status = FLAKY

                msg = str(test_status.to_json())
                if run_summary:
                    msg += ": {}".format("\n".join(run_summary))
                self.log(logging.INFO, msg)
        finally:
            stop_time = time.time()
            summary = self.process_run_summaries(summaries, test_status)
            test_status, summary = self._check_cluster_utilization(test_status, summary)
            # convert summary from list to string
            summary = "\n".join(summary)
            if num_runs > 1:
                # for reporting purposes report all services
                self.test_context.services = self.all_services
            # for flaky tests, we report the start and end time of the successful run, and not the whole run period
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status,
                summary,
                data,
                start_time,
                stop_time,
            )
            self.log(logging.INFO, "Data: %s" % str(result.data))
            result.report()

            # Tell the server we are finished
            self._do_safely(
                lambda: self.send(self.message.finished(result=result)),
                f"Problem sending FINISHED message for {self.test_metadata}:\n",
            )
            self._kill_all_child_processes()

            # Release test_context resources only after creating the result and finishing logging activity
            # The Sender object uses the same logger, so we postpone closing until after the finished message is sent
            self.test_context.close()
            self.all_services = None
            self.test_context = None
            self.test = None

    def process_run_summaries(self, run_summaries: List[List[str]], test_status: TestStatus) -> List[str]:
        """
        Converts individual run summaries (there may be multiple if deflake is enabled)
        into a single run summary
        """
        # no summary case, return test passed
        if not run_summaries:
            return ["Test Passed"]
        # single run, can just return the summary
        if not self.deflake_enabled:
            return run_summaries[0]

        failure_summaries: Mapping[Tuple[str, ...], List[int]] = defaultdict(list)
        # populate run summaries grouping run numbers by stack trace
        for run_num, summary in enumerate(run_summaries):
            # convert to tuple to be serializable (+1 for human readability 1 based indexing)
            failure_summaries[tuple(summary)].append(run_num + 1)

        final_summary = []
        # handle run summaries for each deflake run:
        sub_summaries = []
        for individual_summary, runs in failure_summaries.items():
            sub_summary = []
            runs_str = ", ".join(str(r) for r in runs)
            run_msg = f"run{'s' if len(runs) > 1 else ''} {runs_str} summary:"
            sub_summary.append(run_msg)
            sub_summary.extend(individual_summary)
            sub_summaries.append(sub_summary)
        if test_status == FLAKY:
            sub_summaries.append([f"run {len(run_summaries)}: PASSED"])

        # combine summaries, with a '~~~~~' divider
        for sub_summary in sub_summaries[:-1]:
            final_summary.extend(sub_summary)
            break_line = "~" * max(len(line) for line in final_summary) if final_summary else ""
            final_summary.append(break_line)
        # the pass case could have no summaries, so need to validate that a subsummary exists
        if sub_summaries:
            final_summary.extend(sub_summaries[-1])
        return final_summary

    def _do_run(self, num_runs):
        """Execute one attempt of the test: setup, run, teardown, log collection, node freeing.

        :param num_runs: 1-based attempt number (used to namespace service ids when deflaking)
        :return: (test_status, summary lines, data returned by the test function)
        """
        test_status = FAIL
        summary = []
        data = None
        sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_enabled else service_id_factory
        try:
            # Results from this test, as well as logs will be dumped here
            mkdir_p(TestContext.results_dir(self.test_context, self.test_index))

            # Instantiate test
            self.test = self.test_context.cls(self.test_context)
            # Run the test unit
            self.setup_test()
            data = self.run_test()
            test_status = PASS
        except BaseException as e:
            # mark the test as failed before doing anything else
            test_status = FAIL
            err_trace = self._exc_msg(e)
            summary.extend(err_trace.split("\n"))
        finally:
            for service in self.test_context.services:
                service.service_id_factory = sid_factory
                self.all_services.append(service)
            self.teardown_test(
                teardown_services=not self.session_context.no_teardown,
                test_status=test_status,
            )
            if hasattr(self.test_context, "services"):
                service_errors = self.test_context.services.errors()
                if service_errors:
                    summary.extend(["", "", service_errors])
            # free nodes
            if self.test:
                self.log(logging.DEBUG, "Freeing nodes...")
                self._do_safely(self.test.free_nodes, "Error freeing nodes:")
        return test_status, summary, data

    def _check_cluster_utilization(self, result, summary):
        """Checks if the number of nodes used by a test is less than the number of nodes requested
        by the test. If this is the case and we wish to fail on bad cluster utilization, the result
        value is failed.

        Will also print a warning if the test passes and the node utilization doesn't match.
        """
        max_used = self.cluster.max_used()
        total = len(self.cluster.all())
        if max_used < total:
            message = "Test requested %d nodes, used only %d" % (total, max_used)
            if self.fail_bad_cluster_utilization:
                # only check node utilization on test pass
                if result == PASS or result == FLAKY:
                    self.log(logging.INFO, "FAIL: " + message)
                    result = FAIL
                summary.append(message)
            else:
                self.log(logging.WARN, message)
        return result, summary

    def setup_test(self):
        """start services etc"""
        self.log(logging.INFO, "Setting up...")
        self.test.setup()

    def run_test(self):
        """Run the test!

        We expect test_context.function to be a function or unbound method which takes an
        instantiated test object as its argument.
        """
        self.log(logging.INFO, "Running...")
        return self.test_context.function(self.test)

    def _exc_msg(self, e):
        # repr + truncated traceback, used to build run summaries
        return repr(e) + "\n" + traceback.format_exc(limit=16)

    def _do_safely(self, action, err_msg):
        """Run action(), logging (not raising) any exception with the given message prefix."""
        try:
            action()
        except BaseException as e:
            self.log(logging.WARN, err_msg + " " + self._exc_msg(e))

    def teardown_test(self, teardown_services=True, test_status=None):
        """teardown method which stops services, gathers log data,
        removes persistent state, and releases cluster nodes.

        Catch all exceptions so that every step in the teardown process is tried,
        but signal that the test runner should stop if a keyboard interrupt is caught.
        """
        self.log(logging.INFO, "Tearing down...")
        if not self.test:
            self.log(logging.WARN, "%s failed to instantiate" % self.test_id)
            self.test_context.close()
            return
        services = self.test_context.services

        if teardown_services:
            self._do_safely(self.test.teardown, "Error running teardown method:")
            # stop services
            self._do_safely(services.stop_all, "Error stopping services:")

        # always collect service logs whether or not we tear down
        # logs are typically removed during "clean" phase, so collect logs before cleaning
        self.log(logging.DEBUG, "Copying logs from services...")
        self._do_safely(
            lambda: self.test.copy_service_logs(test_status),
            "Error copying service logs:",
        )

        # clean up stray processes and persistent state
        if teardown_services:
            self.log(logging.DEBUG, "Cleaning up services...")
            self._do_safely(services.clean_all, "Error cleaning services:")

    def log(self, log_level, msg, *args, **kwargs):
        """Log to the service log and the test log of the current test."""
        if self.test_context is None:
            msg = "%s: %s" % (self.__class__.__name__, str(msg))
            self.logger.log(log_level, msg, *args, **kwargs)
        else:
            msg = "%s: %s: %s" % (
                self.__class__.__name__,
                self.test_context.test_name,
                str(msg),
            )
            self.logger.log(log_level, msg, *args, **kwargs)
            # mirror the message to the driver so it shows up in the session log
            self.send(self.message.log(msg, level=log_level))

    def dump_threads(self, msg):
        """Log the names of all live threads in this process at DEBUG level."""
        dump = "\n".join([t.name for t in threading.enumerate()])
        self.log(logging.DEBUG, f"{msg}: {dump}")


================================================
FILE: ducktape/tests/scheduler.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List from ducktape.cluster.vagrant import VagrantCluster from ducktape.tests.test_context import TestContext class TestScheduler(object): """This class tracks tests which are scheduled to run, and provides an ordering based on the current cluster state. The ordering is "on-demand"; calling next returns the largest cluster user which fits in the currently available cluster nodes. """ def __init__(self, test_contexts: List[TestContext], cluster: VagrantCluster) -> None: self.cluster = cluster # Track tests which would never be offered up by the scheduling algorithm due to insufficient # cluster resources self._test_context_list = test_contexts.copy() self._sort_test_context_list() def __len__(self) -> int: """Number of tests currently in the scheduler""" return len(self._test_context_list) def __iter__(self): return self def filter_unschedulable_tests(self): """ Filter out tests that cannot be scheduled with the current cluster, remove them from this scheduler and return them. 
""" all = self.cluster.all() unschedulable = [] for test_context in self._test_context_list: if not all.nodes.can_remove_spec(test_context.expected_cluster_spec): unschedulable.append(test_context) for u in unschedulable: self._test_context_list.remove(u) return unschedulable def _sort_test_context_list(self) -> None: """Replace self.test_context_list with a sorted shallow copy Sort from largest cluster users to smallest """ # sort from the largest cluster users to smallest self._test_context_list = sorted(self._test_context_list, key=lambda tc: tc.expected_num_nodes, reverse=True) def peek(self): """Locate and return the next object to be scheduled, without removing it internally. :return test_context for the next test to be scheduled. If scheduler is empty, or no test can currently be scheduled, return None. """ for tc in self._test_context_list: if self.cluster.available().nodes.can_remove_spec(tc.expected_cluster_spec): return tc return None def remove(self, tc): """Remove test context object from this scheduler. Intended usage is to peek() first, then perform whatever validity checks, and if they pass, remove() it from the scheduler. """ if tc: self._test_context_list.remove(tc) def drain_remaining_tests(self): """ Get all remaining tests in the scheduler and clear the scheduler. :return List of test contexts that were still in the scheduler """ remaining = self._test_context_list.copy() self._test_context_list.clear() return remaining ================================================ FILE: ducktape/tests/serde.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pickle class SerDe(object): def serialize(self, obj): if hasattr(obj, "serialize"): obj.serialize() else: return pickle.dumps(obj) def deserialize(self, bytes_obj, obj_cls=None): if obj_cls and hasattr(obj_cls, "deserialize"): return obj_cls.deserialize(bytes_obj) else: return pickle.loads(bytes_obj) ================================================ FILE: ducktape/tests/session.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import time from ducktape.command_line.defaults import ConsoleDefaults from ducktape.tests.loggermaker import LoggerMaker class SessionContext(object): """Wrapper class for 'global' variables. A call to ducktape generates a single shared SessionContext object which helps route logging and reporting, etc. 
""" def __init__(self, **kwargs) -> None: # session_id, results_dir, cluster, globals): self.session_id = kwargs["session_id"] self.results_dir = os.path.abspath(kwargs["results_dir"]) self.debug = kwargs.get("debug", False) self.compress = kwargs.get("compress", False) self.exit_first = kwargs.get("exit_first", False) self.no_teardown = kwargs.get("no_teardown", False) self.max_parallel = kwargs.get("max_parallel", 1) self.default_expected_num_nodes = kwargs.get("default_num_nodes", None) self.fail_bad_cluster_utilization = kwargs.get("fail_bad_cluster_utilization") self.fail_greedy_tests = kwargs.get("fail_greedy_tests", False) self.test_runner_timeout = kwargs.get("test_runner_timeout") self.enable_jvm_logs = kwargs.get("enable_jvm_logs", False) self._globals = kwargs.get("globals") @property def globals(self): """None, or an immutable dictionary containing user-defined global variables.""" return self._globals def to_json(self): return self.__dict__ class SessionLoggerMaker(LoggerMaker): def __init__(self, session_context: SessionContext) -> None: super(SessionLoggerMaker, self).__init__(session_context.session_id + ".session_logger") self.log_dir = session_context.results_dir self.debug = session_context.debug def configure_logger(self) -> None: """Set up the logger to log to stdout and files. 
This creates a few files as a side-effect.""" if self.configured: return self._logger.setLevel(logging.DEBUG) fh_info = logging.FileHandler(os.path.join(self.log_dir, "session_log.info")) fh_debug = logging.FileHandler(os.path.join(self.log_dir, "session_log.debug")) fh_info.setLevel(logging.INFO) fh_debug.setLevel(logging.DEBUG) # create console handler with a higher log level ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.DEBUG if self.debug else logging.INFO) # create formatter and add it to the handlers formatter = logging.Formatter(ConsoleDefaults.SESSION_LOG_FORMATTER) fh_info.setFormatter(formatter) fh_debug.setFormatter(formatter) ch.setFormatter(formatter) # add the handlers to the logger self._logger.addHandler(fh_info) self._logger.addHandler(fh_debug) self._logger.addHandler(ch) def generate_session_id(session_id_file: str) -> str: """Generate a new session id based on the previous session id found in session_id_file :type session_id_file: str Last-used session_id is in this file :rtype str New session_id """ def get_id(day, num): return day + "--%03d" % num def split_id(an_id): day = an_id[:10] num = int(an_id[12:]) return day, num def today(): return time.strftime("%Y-%m-%d") def next_id(prev_id): if prev_id is None: prev_day = today() prev_num = 0 else: prev_day, prev_num = split_id(prev_id) if prev_day == today(): next_day = prev_day next_num = prev_num + 1 else: next_day = today() next_num = 1 return get_id(next_day, next_num) if os.path.isfile(session_id_file): with open(session_id_file, "r") as fp: session_id = next_id(fp.read()) else: session_id = next_id(None) with open(session_id_file, "w") as fp: fp.write(session_id) return session_id def generate_results_dir(results_root: str, session_id: str) -> str: """Results from a single run of ducktape are assigned a session_id and put together in this directory. 
:type session_id: str :rtype: str """ return os.path.join(os.path.abspath(results_root), session_id) ================================================ FILE: ducktape/tests/status.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. class TestStatus(object): def __init__(self, status: str) -> None: self._status = str(status).lower() def __eq__(self, other): return str(self).lower() == str(other).lower() def __str__(self): return self._status def to_json(self): return str(self).upper() PASS = TestStatus("pass") FLAKY = TestStatus("flaky") FAIL = TestStatus("fail") IGNORE = TestStatus("ignore") ================================================ FILE: ducktape/tests/test.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import shutil import tempfile from contextlib import contextmanager from ducktape.template import TemplateRenderer from ducktape.tests.status import FAIL from ducktape.utils.local_filesystem_utils import mkdir_p from .test_context import TestContext class Test(TemplateRenderer): """Base class for tests.""" def __init__(self, test_context, *args, **kwargs): """ :type test_context: ducktape.tests.test.TestContext """ super(Test, self).__init__(*args, **kwargs) self.test_context = test_context @property def cluster(self): return self.test_context.cluster @property def logger(self): return self.test_context.logger def min_cluster_spec(self): """ THIS METHOD IS DEPRECATED AND WILL BE REMOVED IN THE SUBSEQUENT RELEASES. Nothing in the ducktape framework calls it, it is only provided so that subclasses don't break. If you're overriding this method in your subclass, please remove it. """ raise NotImplementedError def min_cluster_size(self): """ THIS METHOD IS DEPRECATED AND WILL BE REMOVED IN THE SUBSEQUENT RELEASES. Nothing in the ducktape framework calls it, it is only provided so that subclasses don't break. If you're overriding this method in your subclass, please remove it. """ raise NotImplementedError def setup(self): """Override this for custom setup logic.""" # for backward compatibility self.setUp() def teardown(self): """Override this for custom teardown logic.""" # for backward compatibility self.tearDown() def setUp(self): pass def tearDown(self): pass def free_nodes(self): try: self.test_context.services.free_all() except BaseException as e: if isinstance(e, KeyboardInterrupt): raise def compress_service_logs(self, node, service, node_logs): """Compress logs on a node corresponding to the given service. :param node: The node on which to compress the given logs :param service: The service to which the node belongs :param node_logs: Paths to logs (or log directories) which will be compressed :return: a list of paths to compressed logs. 
""" compressed_logs = [] for nlog in node_logs: try: node.account.ssh(_compress_cmd(nlog)) if nlog.endswith(os.path.sep): nlog = nlog[: -len(os.path.sep)] nlog += ".tgz" compressed_logs.append(nlog) except Exception as e: self.test_context.logger.warn("Error compressing log %s: service %s: %s" % (nlog, service, str(e))) return compressed_logs def copy_service_logs(self, test_status): """ Copy logs from service nodes to the results directory. If the test passed, only the default set will be collected. If the the test failed, all logs will be collected. """ for service in self.test_context.services: if not hasattr(service, "logs") or len(service.logs) == 0: self.test_context.logger.debug( "Won't collect service logs from %s - no logs to collect." % service.service_id ) continue log_dirs = service.logs for node in service.nodes: # Gather locations of logs to collect node_logs = [] for log_name in log_dirs.keys(): if test_status == FAIL or self.should_collect_log(log_name, service): node_logs.append(log_dirs[log_name]["path"]) self.test_context.logger.debug( "Preparing to copy logs from %s: %s" % (node.account.hostname, node_logs) ) if self.test_context.session_context.compress: self.test_context.logger.debug("Compressing logs...") node_logs = self.compress_service_logs(node, service, node_logs) if len(node_logs) > 0: # Create directory into which service logs will be copied dest = os.path.join( TestContext.results_dir(self.test_context, self.test_context.test_index), service.service_id, node.account.hostname, ) if not os.path.isdir(dest): mkdir_p(dest) # Try to copy the service logs self.test_context.logger.debug("Copying logs...") try: for log in node_logs: node.account.copy_from(log, dest) except Exception as e: self.test_context.logger.warn( "Error copying log %(log_name)s from %(source)s to %(dest)s. 
\ service %(service)s: %(message)s" % { "log_name": log_name, "source": log_dirs[log_name], "dest": dest, "service": service, "message": e, } ) def mark_for_collect(self, service, log_name=None): if log_name is None: # Mark every log for collection for log_name in service.logs: self.test_context.log_collect[(log_name, service)] = True else: self.test_context.log_collect[(log_name, service)] = True def mark_no_collect(self, service, log_name=None): self.test_context.log_collect[(log_name, service)] = False def should_collect_log(self, log_name, service): key = (log_name, service) default = service.logs[log_name]["collect_default"] val = self.test_context.log_collect.get(key, default) return val def _compress_cmd(log_path): """Return bash command which compresses the given path to a tarball.""" compres_cmd = 'cd "$(dirname %s)" && ' % log_path compres_cmd += 'f="$(basename %s)" && ' % log_path compres_cmd += 'if [ -e "$f" ]; then tar czf "$f.tgz" "$f"; fi && ' compres_cmd += "rm -rf %s" % log_path return compres_cmd @contextmanager def in_dir(path): """Changes working directory to given path. On exit, restore to original working directory.""" cwd = os.getcwd() try: os.chdir(path) yield finally: os.chdir(cwd) @contextmanager def in_temp_dir(): """Creates a temporary directory as the working directory. On exit, it is removed.""" with _new_temp_dir() as tmpdir: with in_dir(tmpdir): yield tmpdir @contextmanager def _new_temp_dir(): """Create a temporary directory that is removed automatically""" tmpdir = tempfile.mkdtemp() try: yield tmpdir finally: shutil.rmtree(tmpdir) ================================================ FILE: ducktape/tests/test_context.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import logging import os import re import shutil import sys import tempfile from typing import TYPE_CHECKING, Dict, Optional, Tuple from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.command_line.defaults import ConsoleDefaults from ducktape.mark.consts import CLUSTER_NODE_TYPE_KEYWORD, CLUSTER_SIZE_KEYWORD, CLUSTER_SPEC_KEYWORD from ducktape.services.service_registry import ServiceRegistry from ducktape.tests.loggermaker import LoggerMaker, close_logger from ducktape.tests.session import SessionContext from ducktape.utils.local_filesystem_utils import mkdir_p if TYPE_CHECKING: from ducktape.services.service import Service def _escape_pathname(s): """Remove fishy characters, replace most with dots""" # Remove all whitespace completely s = re.sub(r"\s+", "", s) # Replace bad characters with dots blacklist = r"[^\.\-=_\w\d]+" s = re.sub(blacklist, ".", s) # Multiple dots -> single dot (and no leading or trailing dot) s = re.sub(r"[\.]+", ".", s) return re.sub(r"^\.|\.$", "", s) class TestLoggerMaker(LoggerMaker): def __init__(self, logger_name, log_dir, debug): super(TestLoggerMaker, self).__init__(logger_name) self.log_dir = log_dir self.debug = debug def configure_logger(self): """Set up the logger to log to stdout and files. This creates a directory and a few files as a side-effect. 
""" if self.configured: return self._logger.setLevel(logging.DEBUG) mkdir_p(self.log_dir) # Create info and debug level handlers to pipe to log files info_fh = logging.FileHandler(os.path.join(self.log_dir, "test_log.info")) debug_fh = logging.FileHandler(os.path.join(self.log_dir, "test_log.debug")) info_fh.setLevel(logging.INFO) debug_fh.setLevel(logging.DEBUG) formatter = logging.Formatter(ConsoleDefaults.TEST_LOG_FORMATTER) info_fh.setFormatter(formatter) debug_fh.setFormatter(formatter) self._logger.addHandler(info_fh) self._logger.addHandler(debug_fh) ch = logging.StreamHandler(sys.stdout) ch.setFormatter(formatter) if self.debug: # If debug flag is set, pipe debug logs to stdout ch.setLevel(logging.DEBUG) else: # default - pipe warning level logging to stdout ch.setLevel(logging.WARNING) self._logger.addHandler(ch) def test_logger(logger_name, log_dir, debug): """Helper method for getting a test logger object Note that if this method is called multiple times with the same ``logger_name``, it returns the same logger object. Note also, that for a fixed ``logger_name``, configuration occurs only the first time this function is called. 
""" return TestLoggerMaker(logger_name, log_dir, debug).logger class TestContext(object): """Wrapper class for state variables needed to properly run a single 'test unit'.""" def __init__(self, **kwargs) -> None: """ :param session_context: :param cluster: the cluster object which will be used by this test :param module: name of the module containing the test class/method :param cls: class object containing the test method :param function: the test method :param file: file containing this module :param injected_args: a dict containing keyword args which will be passed to the test method :param cluster_use_metadata: dict containing information about how this test will use cluster resources """ self.session_context: SessionContext = kwargs["session_context"] self.cluster = kwargs.get("cluster") self.module = kwargs.get("module") self.test_suite_name = kwargs.get("test_suite_name") if kwargs.get("file") is not None: self.file = os.path.abspath(kwargs["file"]) else: self.file = None self.cls = kwargs.get("cls") self.function = kwargs.get("function") self.injected_args = kwargs.get("injected_args") self.ignore = kwargs.get("ignore", False) # cluster_use_metadata is a dict containing information about how this test will use cluster resources self.cluster_use_metadata = copy.copy(kwargs.get("cluster_use_metadata", {})) enable_jvm_logs = self.session_context.enable_jvm_logs if self.session_context else False self.services = ServiceRegistry(enable_jvm_logs=enable_jvm_logs) self.test_index = None # dict for toggling service log collection on/off self.log_collect: Dict[Tuple[str, Service], bool] = {} self._logger = None self._local_scratch_dir = None def __repr__(self) -> str: return ( f"" ) def copy(self, **kwargs) -> "TestContext": """Construct a new TestContext object from another TestContext object Note that this is not a true copy, since a fresh ServiceRegistry instance will be created. 
""" ctx_copy = TestContext(**self.__dict__) ctx_copy.__dict__.update(**kwargs) return ctx_copy @property def local_scratch_dir(self): """This local scratch directory is created/destroyed on the test driver before/after each test is run.""" if not self._local_scratch_dir: self._local_scratch_dir = tempfile.mkdtemp() return self._local_scratch_dir @property def test_metadata(self): return { "directory": os.path.dirname(self.file), "file_name": os.path.basename(self.file), "cls_name": self.cls_name, "method_name": self.function_name, "injected_args": self.injected_args, } @staticmethod def logger_name(test_context, test_index): if test_index is None: return test_context.test_id else: return "%s-%s" % (test_context.test_id, str(test_index)) @staticmethod def results_dir(test_context, test_index): d = test_context.session_context.results_dir if test_context.cls is not None: d = os.path.join(d, test_context.cls.__name__) if test_context.function is not None: d = os.path.join(d, test_context.function.__name__) if test_context.injected_args is not None: d = os.path.join(d, test_context.injected_args_name) if test_index is not None: d = os.path.join(d, str(test_index)) return d @property def expected_num_nodes(self) -> int: """ How many nodes of any type we expect this test to consume when run. Note that this will be 0 for both unschedulable tests and the tests that legitimately need 0 nodes. :return: an integer number of nodes. """ return self.expected_cluster_spec.size() if self.expected_cluster_spec else 0 @property def expected_cluster_spec(self) -> Optional[ClusterSpec]: """ The cluster spec we expect this test to consume when run. :return: A ClusterSpec object or None if the test cannot be run (e.g. session context settings disallow tests with no cluster metadata attached). 
""" cluster_spec = self.cluster_use_metadata.get(CLUSTER_SPEC_KEYWORD) cluster_size = self.cluster_use_metadata.get(CLUSTER_SIZE_KEYWORD) node_type = self.cluster_use_metadata.get(CLUSTER_NODE_TYPE_KEYWORD) if cluster_spec is not None: return cluster_spec elif cluster_size is not None: return ClusterSpec.simple_linux(cluster_size, node_type) elif not self.cluster: return ClusterSpec.empty() elif self.session_context.fail_greedy_tests: return None else: return self.cluster.all() @property def globals(self): return self.session_context.globals @property def module_name(self) -> str: return "" if self.module is None else self.module @property def cls_name(self) -> str: return "" if self.cls is None else self.cls.__name__ @property def function_name(self) -> str: return "" if self.function is None else self.function.__name__ @property def description(self): """Description of the test, needed in particular for reporting. If the function has a docstring, return that, otherwise return the class docstring or "". """ if self.function.__doc__: return self.function.__doc__ elif self.cls.__doc__ is not None: return self.cls.__doc__ else: return "" @property def injected_args_name(self) -> str: if self.injected_args is None: return "" else: params = ".".join(["%s=%s" % (k, self.injected_args[k]) for k in self.injected_args]) return _escape_pathname(params) @property def test_id(self) -> str: return self.test_name @property def test_name(self) -> str: """ The fully-qualified name of the test. This is similar to test_id, but does not include the session ID. It includes the module, class, and method name. 
""" name_components = [ self.module_name, self.cls_name, self.function_name, self.injected_args_name, ] return ".".join(filter(lambda x: x is not None and len(x) > 0, name_components)) @property def logger(self): if self._logger is None: self._logger = test_logger( TestContext.logger_name(self, self.test_index), TestContext.results_dir(self, self.test_index), self.session_context.debug, ) return self._logger def close(self): """Release resources, etc.""" if hasattr(self, "services"): for service in self.services: service.close() # Remove reference to services. This is important to prevent potential memory leaks if users write services # which themselves have references to large memory-intensive objects del self.services # Remove local scratch directory if self._local_scratch_dir and os.path.exists(self._local_scratch_dir): shutil.rmtree(self._local_scratch_dir) # Release file handles held by logger if self._logger: close_logger(self._logger) ================================================ FILE: ducktape/utils/__init__.py ================================================ ================================================ FILE: ducktape/utils/http_utils.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from urllib.request import Request, build_opener class HttpMixin(object): def http_request(self, url, method, data="", headers=None, timeout=None): if url[0:7].lower() != "http://": url = "http://%s" % url if hasattr(self, "logger") and self.logger is not None: self.logger.debug("Sending http request. Url: %s, Data: %s, Headers: %s" % (url, str(data), str(headers))) req = Request(url, data, headers) req.get_method = lambda: method # The timeout parameter in urllib2.urlopen has strange behavior, and # seems to raise errors when set to a number. Using an opener works however. opener = build_opener() if timeout is None: response = opener.open(req) else: response = opener.open(req, timeout=timeout) return response ================================================ FILE: ducktape/utils/local_filesystem_utils.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import errno import os def mkdir_p(path: str) -> None: """mkdir -p functionality. 
:type path: str """ try: os.makedirs(path) except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise ================================================ FILE: ducktape/utils/persistence.py ================================================ # Copyright (c) 2009 Jason M Baker # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. """ Module contains persistent data structures from pysistence project, patched to work with python 3.x. """ def not_implemented_method(*args, **kwargs): raise NotImplementedError("Cannot set values in a PDict") class PDict(dict): """ Persistent dict. 
""" __setitem__ = not_implemented_method __delitem__ = not_implemented_method update = not_implemented_method clear = not_implemented_method pop = not_implemented_method popitem = not_implemented_method def _as_transient(self): return dict(self) def copy(self): return PDict(self) def without(self, *keys): new_dict = self._as_transient() for key in keys: del new_dict[key] return PDict(new_dict) def using(self, **kwargs): new_dict = self._as_transient() new_dict.update(kwargs) return PDict(new_dict) def __reduce__(self): # Pickling support in python 2.x and 3.x return make_dict, (self._as_transient(),) make_dict = PDict ================================================ FILE: ducktape/utils/terminal_size.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import platform import shlex import struct import subprocess """ Adapted from https://gist.github.com/jtriley/1108174 """ def get_terminal_size(): """getTerminalSize() - get width and height of console - works on linux,os x,windows,cygwin(windows) originally retrieved from: http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python """ current_os = platform.system() tuple_xy = None if current_os == "Windows": tuple_xy = _get_terminal_size_windows() if tuple_xy is None: tuple_xy = _get_terminal_size_tput() # needed for window's python in cygwin's xterm! 
if current_os in ["Linux", "Darwin"] or current_os.startswith("CYGWIN"): tuple_xy = _get_terminal_size_linux() if tuple_xy is None: tuple_xy = (80, 25) # default value return tuple_xy def _get_terminal_size_windows(): try: from ctypes import create_string_buffer, windll # stdin handle is -10 # stdout handle is -11 # stderr handle is -12 h = windll.kernel32.GetStdHandle(-12) csbi = create_string_buffer(22) res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi) if res: ( bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx, maxy, ) = struct.unpack("hhhhHhhhhhh", csbi.raw) sizex = right - left + 1 sizey = bottom - top + 1 return sizex, sizey except Exception: pass def _get_terminal_size_tput(): # get terminal width # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window try: cols = int(subprocess.check_call(shlex.split("tput cols"))) rows = int(subprocess.check_call(shlex.split("tput lines"))) return (cols, rows) except Exception: pass def _get_terminal_size_linux(): def ioctl_GWINSZ(fd): try: import fcntl import termios cr = struct.unpack("hh", fcntl.ioctl(fd, termios.TIOCGWINSZ, "1234")) return cr except Exception: pass cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) if not cr: try: fd = os.open(os.ctermid(), os.O_RDONLY) cr = ioctl_GWINSZ(fd) os.close(fd) except Exception: pass if not cr: try: cr = (os.environ["LINES"], os.environ["COLUMNS"]) except Exception: return None return int(cr[1]), int(cr[0]) ================================================ FILE: ducktape/utils/util.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import importlib import time from typing import Callable from ducktape import __version__ as __ducktape_version__ from ducktape.errors import TimeoutError def wait_until(condition, timeout_sec, backoff_sec=0.1, err_msg="", retry_on_exc=False): """Block until condition evaluates as true or timeout expires, whichever comes first. :param condition: callable that returns True if the condition is met, False otherwise :param timeout_sec: number of seconds to check the condition for before failing :param backoff_sec: number of seconds to back off between each failure to meet the condition before checking again :param err_msg: a string or callable returning a string that will be included as the exception message if the condition is not met :param retry_on_exc: if True, will retry if condition raises an exception. If condition raised exception on last iteration, that exception will be raised as a cause of TimeoutError. If False and condition raises an exception, that exception will be forwarded to the caller immediately. Defaults to False (original ducktape behavior). # TODO: [1.0.0] flip this to True :return: silently if condition becomes true within the timeout window, otherwise raise Exception with the given error message. 
""" start = time.time() stop = start + timeout_sec last_exception = None while time.time() < stop: try: if condition(): return else: # reset last_exception if last iteration didn't raise any exception, but simply returned False last_exception = None except BaseException as e: # save last raised exception for logging it later last_exception = e if not retry_on_exc: raise e finally: time.sleep(backoff_sec) # it is safe to call Exception from None - will be just treated as a normal exception raise TimeoutError(err_msg() if callable(err_msg) else err_msg) from last_exception def package_is_installed(package_name): """Return true iff package can be successfully imported.""" try: importlib.import_module(package_name) return True except Exception: return False def ducktape_version(): """Return string representation of current ducktape version.""" return __ducktape_version__ def load_function(func_module_path) -> Callable: """Loads and returns a function from a module path seperated by '.'s""" module, function_name = func_module_path.rsplit(".", 1) try: func = getattr(importlib.import_module(module), function_name) if not callable(func): raise Exception("Function {} from module {} is not callable".format(function_name, module)) return func except AttributeError: raise Exception( "Function could not be loaded from the module path {}, verify that it is '.' 
seperated".format( func_module_path ) ) ================================================ FILE: requirements-test.txt ================================================ pytest~=6.2.0 # 4.0 drops py27 support mock==4.0.2 memory_profiler==0.57 statistics==1.0.3.5 requests-testadapter==0.3.0 pytest-cov~=3.0 pytest-xdist~=2.5 ruff==0.4.10 ================================================ FILE: requirements.txt ================================================ jinja2~=3.1.6 boto3==1.33.13 # jinja2 pulls in MarkupSafe with a > constraint, but we need to constrain it for compatibility MarkupSafe~=2.1.5 pyparsing==3.1.4 zipp==3.20.2 pywinrm==0.4.3 requests==2.32.4 paramiko~=3.4.0 pyzmq==26.4.0 pycryptodome==3.23.0 more-itertools==5.0.0 PyYAML==6.0.2 psutil==5.7.2 ================================================ FILE: ruff.toml ================================================ extend-exclude = [ "docs", ".virtualenvs" ] line-length = 120 [lint] select = [ "E4", "E7", "E9", "F" ] ignore = [ "E111", "W292", "E226" ] ================================================ FILE: service.yml ================================================ name: ducktape lang: python lang_version: '3.13' git: enable: true sonarqube: enable: true github: enable: true repo_name: confluentinc/ducktape renovatebot: enable: true automerge: enable: false semaphore: enable: true pipeline_enable: false branches: - master - 0.13.x - 0.12.x - 0.11.x - 0.10.x - 0.9.x - 0.8.x - 0.7.x ================================================ FILE: setup.cfg ================================================ # pytest configuration (can also be defined in in tox.ini or pytest.ini file) # # To ease possible confusion, prefix ducktape unit tests with 'check' instead of 'test', since # many ducktape files, classes, and methods have 'test' somewhere in the name [tool:pytest] python_files=check_*.py python_classes=Check python_functions=check_* # don't search inside any resources directory for unit tests norecursedirs = resources 
================================================ FILE: setup.py ================================================ from setuptools import find_packages, setup from setuptools.command.test import test as TestCommand import re import sys version = "" with open("ducktape/__init__.py", "r") as fd: version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) if not version: raise RuntimeError("Cannot find version information") class PyTest(TestCommand): user_options = [("pytest-args=", "a", "Arguments to pass to py.test")] def initialize_options(self): TestCommand.initialize_options(self) self.pytest_args = [] def finalize_options(self): TestCommand.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): # import here, cause outside the eggs aren't loaded import pytest errno = pytest.main(self.pytest_args) self.run_command("flake8") sys.exit(errno) test_req = open("requirements-test.txt").read() setup( name="ducktape", version=version, description="Distributed system test tools", author="Confluent", platforms=["any"], entry_points={ "console_scripts": ["ducktape=ducktape.command_line.main:main"], }, license="apache2.0", url="http://github.com/confluentinc/ducktape", packages=find_packages(), package_data={"ducktape": ["templates/report/*"]}, python_requires=">= 3.6", install_requires=open("requirements.txt").read(), extras_require={"test": test_req}, setup_requires=["ruff==0.4.10"], cmdclass={"test": PyTest}, ) ================================================ FILE: systests/__init__.py ================================================ ================================================ FILE: systests/cluster/__init__.py ================================================ ================================================ FILE: systests/cluster/test_debug.py ================================================ """ This module contains tests that are useful for developer debugging and can contain sleep statements or test 
that intentionally fail or break things. They're separate from test_remote_account.py for that reason. """ import time from ducktape.mark import matrix, parametrize, ignore from ducktape.mark.resource import cluster from ducktape.tests.test import Test from systests.cluster.test_remote_account import GenericService class FailingTest(Test): """ The purpose of this test is to validate reporters. Some of them are intended to fail. """ def setup(self): self.service = GenericService(self.test_context, 1) @cluster(num_nodes=1) @matrix( string_param=["success-first", "fail-second", "fail-third"], int_param=[10, 20, -30], ) def matrix_test(self, string_param, int_param): assert not string_param.startswith("fail") and int_param > 0 @cluster(num_nodes=1) @parametrize(string_param="success-first", int_param=10) @parametrize(string_param="fail-second", int_param=-10) def parametrized_test(self, string_param, int_param): assert not string_param.startswith("fail") and int_param > 0 @cluster(num_nodes=1) def failing_test(self): assert False @cluster(num_nodes=1) def successful_test(self): assert True class DebugThisTest(Test): @cluster(num_nodes=1) def one_node_test_sleep_90s(self): self.service = GenericService(self.test_context, 1) self.logger.warning("one_node_test - Sleeping for 90s") time.sleep(90) assert True @cluster(num_nodes=1) def one_node_test_sleep_30s(self): self.service = GenericService(self.test_context, 1) self.logger.warning("another_one_node_test - Sleeping for 30s") time.sleep(30) assert True @cluster(num_nodes=1) def another_one_node_test_sleep_30s(self): self.service = GenericService(self.test_context, 1) self.logger.warning("yet_another_one_node_test - Sleeping for 30s") time.sleep(30) assert True @cluster(num_nodes=2) def two_node_test(self): self.service = GenericService(self.test_context, 2) assert True @cluster(num_nodes=2) def another_two_node_test(self): self.service = GenericService(self.test_context, 2) assert True @ignore @cluster(num_nodes=2) def 
a_two_node_ignored_test(self): assert False @cluster(num_nodes=2) def yet_another_two_node_test(self): self.service = GenericService(self.test_context, 2) assert True @cluster(num_nodes=3) def three_node_test(self): self.service = GenericService(self.test_context, 3) assert True @cluster(num_nodes=3) def three_node_test_sleeping_30s(self): self.service = GenericService(self.test_context, 3) self.logger.warning("Sleeping for 30s") time.sleep(30) assert True @cluster(num_nodes=3) def another_three_node_test(self): self.service = GenericService(self.test_context, 3) assert True @cluster(num_nodes=2) def bad_alloc_test(self): # @cluster annotation specifies 2 nodes, but we ask for 3, this will fail self.service = GenericService(self.test_context, 3) time.sleep(10) assert True ================================================ FILE: systests/cluster/test_no_cluster.py ================================================ from ducktape.mark.resource import cluster from ducktape.tests.test import Test class NoClusterTest(Test): """This test helps validate the behavior for no-cluster tests (ie 0 nodes)""" @cluster(num_nodes=0) def test_zero_nodes(self): self.logger.warn("Testing") assert True ================================================ FILE: systests/cluster/test_remote_account.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.cluster.consts import WINDOWS, LINUX from ducktape.cluster.node_spec import NodeSpec from ducktape.services.service import Service from ducktape.tests.test import Test from ducktape.errors import TimeoutError from ducktape.mark.resource import cluster import os import pytest import random import shutil from six import iteritems import tempfile from threading import Thread import time import logging from ducktape.utils.util import wait_until def generate_tempdir_name(): """Use this ad-hoc function instead of the tempfile module since we're creating and removing this directory with ssh commands. """ return "/tmp/" + "t" + str(int(time.time())) class RemoteAccountTestService(Service): """Simple service that allocates one node for performing tests of RemoteAccount functionality""" def __init__(self, context): super(RemoteAccountTestService, self).__init__(context, num_nodes=1) self.temp_dir = generate_tempdir_name() self.logs = { "my_log": {"path": self.log_file, "collect_default": True}, "non_existent_log": { "path": os.path.join(self.temp_dir, "absent.log"), "collect_default": True, }, } @property def log_file(self): return os.path.join(self.temp_dir, "test.log") def start_node(self, node): node.account.ssh("mkdir -p " + self.temp_dir) node.account.ssh("touch " + self.log_file) def stop_node(self, node): pass def clean_node(self, node): node.account.ssh("rm -rf " + self.temp_dir) def write_to_log(self, msg): self.nodes[0].account.ssh("echo -e -n " + repr(msg) + " >> " + self.log_file) class GenericService(Service): """Service which doesn't do anything - just a group of nodes, each of which has a scratch directory.""" def __init__(self, context, num_nodes): super(GenericService, self).__init__(context, num_nodes) self.worker_scratch_dir = "scratch" for node in self.nodes: node.account.mkdirs(self.worker_scratch_dir) def stop_node(self, node): # noop pass def clean_node(self, node): 
node.account.remove(self.worker_scratch_dir, allow_fail=True) class UnderUtilizedTest(Test): def setup(self): self.service = GenericService(self.test_context, 1) @cluster(num_nodes=3) def under_utilized_test(self): # setup() creates a service instance, which calls alloc() for one node assert self.test_context.cluster.max_used() == 1 assert len(self.test_context.cluster.used()) == 1 self.another_service = GenericService(self.test_context, 1) assert len(self.test_context.cluster.used()) == 2 assert self.test_context.cluster.max_used() == 2 self.service.stop() self.service.free() assert len(self.test_context.cluster.used()) == 1 assert self.test_context.cluster.max_used() == 2 class FileSystemTest(Test): """ Note that in an attempt to isolate the file system methods, validation should be done with ssh/shell commands. """ def setup(self): self.service = GenericService(self.test_context, 1) self.node = self.service.nodes[0] self.scratch_dir = self.service.worker_scratch_dir @cluster(num_nodes=1) def create_file_test(self): expected_contents = "hello world" fname = "myfile.txt" fpath = "%s/%s" % (self.scratch_dir, fname) self.node.account.create_file(fpath, expected_contents) # validate existence and contents self.node.account.ssh("test -f %s" % fpath) contents = "\n".join([line for line in self.node.account.ssh_capture("cat %s" % fpath)]) assert contents == expected_contents # TODO also check absolute path @cluster(num_nodes=1) def mkdir_test(self): dirname = "%s/mydir" % self.scratch_dir self.node.account.mkdir(dirname) # TODO - important!! check mode self.node.account.ssh("test -d %s" % dirname, allow_fail=False) # mkdir should not succeed if the base directories do not already exist dirname = "%s/a/b/c/d" % self.scratch_dir with pytest.raises(IOError): self.node.account.mkdir(dirname) # TODO also check absolute path @cluster(num_nodes=1) def mkdirs_nested_test(self): dirname = "%s/a/b/c/d" % self.scratch_dir # TODO important!! 
check mode self.node.account.mkdirs(dirname) self.node.account.ssh("test -d %s" % dirname, allow_fail=False) # TODO also check absolute path @cluster(num_nodes=1) def open_test(self): """Try opening, writing, reading a file.""" fname = "%s/myfile.txt" % self.scratch_dir expected_contents = b"hello world\nhooray!" with self.node.account.open(fname, "w") as f: f.write(expected_contents) with self.node.account.open(fname, "r") as f: contents = f.read() assert contents == expected_contents # Now try opening in append mode append = b"hithere" expected_contents = expected_contents + append with self.node.account.open(fname, "a") as f: f.write(append) with self.node.account.open(fname, "r") as f: contents = f.read() assert contents == expected_contents @cluster(num_nodes=1) def exists_file_test(self): """ Create various kinds of files and symlinks, verifying that exists works as expected. Note that because """ # create file, test existence with relative and absolute path self.node.account.ssh("touch %s/hi" % self.scratch_dir) assert self.node.account.exists("%s/hi" % self.scratch_dir) # TODO abspath # create symlink, test existence with relative and absolute path self.node.account.ssh("ln -s %s/hi %s/hi-link" % (self.scratch_dir, self.scratch_dir)) assert self.node.account.exists("%s/hi-link" % self.scratch_dir) # TODO abspath def exists_dir_test(self): # check bad path doesn't exist assert not self.node.account.exists("a/b/c/d") # create dir, test existence with relative and absolute path dpath = "%s/mydir" % self.scratch_dir self.node.account.ssh("mkdir %s" % dpath) assert self.node.account.exists(dpath) # TODO abspath # create symlink, test existence with relative and absolute path self.node.account.ssh("ln -s %s %s/mydir-link" % (dpath, self.scratch_dir)) assert self.node.account.exists("%s/mydir-link" % self.scratch_dir) # # TODO abspath def remove_test(self): """Test functionality of remove method""" # remove a non-empty directory dpath = "%s/mydir" % 
self.scratch_dir self.node.account.ssh("mkdir %s" % dpath) self.node.account.ssh("touch %s/hi.txt" % dpath) self.node.account.ssh("test -d %s" % dpath) self.node.account.remove(dpath) self.node.account.ssh("test ! -d %s" % dpath) # remove a file fpath = "%s/hello.txt" % self.scratch_dir self.node.account.ssh("echo 'hello world' > %s" % fpath) self.node.account.remove(fpath) # remove non-existent path with pytest.raises(RuntimeError): self.node.account.remove("a/b/c/d") # remove non-existent path with allow_fail = True should be ok self.node.account.remove("a/b/c/d", allow_fail=True) # Representation of a somewhat arbitrary directory structure for testing copy functionality # A key which has a string as its value represents a file # A key which has a dict as its value represents a subdirectory DIR_STRUCTURE = { "d00": { "another_file": b"1\n2\n3\n4\ncats and dogs", "d10": {"fasdf": b"lasdf;asfd\nahoppoqnbasnb"}, "d11": {"f65": b"afasdfsafdsadf"}, }, "a_file": b"hello world!", } def make_dir_structure(base_dir, dir_structure, node=None): """Make a file tree starting at base_dir with structure specified by dir_structure. 
if node is None, make the structure locally, else make it on the given node """ for k, v in iteritems(dir_structure): if isinstance(v, dict): # it's a subdirectory subdir_name = k subdir_path = os.path.join(base_dir, subdir_name) subdir_structure = v if node: node.account.mkdir(subdir_path) else: os.mkdir(subdir_path) make_dir_structure(subdir_path, subdir_structure, node) else: # it's a file file_name = k file_path = os.path.join(base_dir, file_name) file_contents = v if node: with node.account.open(file_path, "wb") as f: f.write(file_contents) else: with open(file_path, "wb") as f: f.write(file_contents) def verify_dir_structure(base_dir, dir_structure, node=None): """Verify locally or on the given node whether the file subtree at base_dir matches dir_structure.""" for k, v in iteritems(dir_structure): if isinstance(v, dict): # it's a subdirectory subdir_name = k subdir_path = os.path.join(base_dir, subdir_name) subdir_structure = v if node: assert node.account.isdir(subdir_path) else: assert os.path.isdir(subdir_path) verify_dir_structure(subdir_path, subdir_structure, node) else: # it's a file file_name = k file_path = os.path.join(base_dir, file_name) expected_file_contents = v if node: with node.account.open(file_path, "r") as f: contents = f.read() else: with open(file_path, "rb") as f: contents = f.read() assert expected_file_contents == contents, contents class CopyToAndFroTest(Test): """These tests check copy_to, and copy_from functionality.""" def setup(self): self.service = GenericService(self.test_context, 1) self.node = self.service.nodes[0] self.remote_scratch_dir = self.service.worker_scratch_dir self.local_temp_dir = tempfile.mkdtemp() self.logger.info("local_temp_dir: %s" % self.local_temp_dir) self.logger.info("node: %s" % str(self.node.account)) @cluster(num_nodes=1) def test_copy_to_dir_with_rename(self): # make dir structure locally make_dir_structure(self.local_temp_dir, DIR_STRUCTURE) dest = os.path.join(self.remote_scratch_dir, "renamed") 
self.node.account.copy_to(self.local_temp_dir, dest) # now validate the directory structure on the remote machine verify_dir_structure(dest, DIR_STRUCTURE, node=self.node) @cluster(num_nodes=1) def test_copy_to_dir_as_subtree(self): # copy directory "into" a directory; this should preserve the original directoryname make_dir_structure(self.local_temp_dir, DIR_STRUCTURE) self.node.account.copy_to(self.local_temp_dir, self.remote_scratch_dir) local_temp_dir_name = self.local_temp_dir if local_temp_dir_name.endswith(os.path.sep): local_temp_dir_name = local_temp_dir_name[: -len(os.path.sep)] verify_dir_structure(os.path.join(self.remote_scratch_dir, local_temp_dir_name), DIR_STRUCTURE) @cluster(num_nodes=1) def test_copy_from_dir_with_rename(self): # make dir structure remotely make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.node) dest = os.path.join(self.local_temp_dir, "renamed") self.node.account.copy_from(self.remote_scratch_dir, dest) # now validate the directory structure locally verify_dir_structure(dest, DIR_STRUCTURE) @cluster(num_nodes=1) def test_copy_from_dir_as_subtree(self): # copy directory "into" a directory; this should preserve the original directoryname make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.node) self.node.account.copy_from(self.remote_scratch_dir, self.local_temp_dir) verify_dir_structure(os.path.join(self.local_temp_dir, "scratch"), DIR_STRUCTURE) def teardown(self): # allow_fail in case scratch dir was not successfully created if os.path.exists(self.local_temp_dir): shutil.rmtree(self.local_temp_dir) class CopyDirectTest(Test): def setup(self): self.service = GenericService(self.test_context, 2) self.src_node, self.dest_node = self.service.nodes self.remote_scratch_dir = self.service.worker_scratch_dir self.logger.info("src_node: %s" % str(self.src_node.account)) self.logger.info("dest_node: %s" % str(self.dest_node.account)) @cluster(num_nodes=2) def test_copy_file(self): """Verify that a file 
can be correctly copied directly between nodes. This should work with or without the recursive flag. """ file_path = os.path.join(self.remote_scratch_dir, "myfile.txt") expected_contents = b"123" self.src_node.account.create_file(file_path, expected_contents) self.src_node.account.copy_between(file_path, file_path, self.dest_node) assert self.dest_node.account.isfile(file_path) with self.dest_node.account.open(file_path, "r") as f: contents = f.read() assert expected_contents == contents @cluster(num_nodes=2) def test_copy_directory(self): """Verify that a directory can be correctly copied directly between nodes.""" make_dir_structure(self.remote_scratch_dir, DIR_STRUCTURE, node=self.src_node) self.src_node.account.copy_between(self.remote_scratch_dir, self.remote_scratch_dir, self.dest_node) verify_dir_structure( os.path.join(self.remote_scratch_dir, "scratch"), DIR_STRUCTURE, node=self.dest_node, ) class TestClusterSpec(Test): @cluster(cluster_spec=ClusterSpec.simple_linux(2)) def test_create_two_node_service(self): self.service = GenericService(self.test_context, 2) for node in self.service.nodes: node.account.ssh("echo hi") @cluster( cluster_spec=ClusterSpec.from_nodes( [ NodeSpec(operating_system=WINDOWS), NodeSpec(operating_system=LINUX), NodeSpec(), # this one is also linux ] ) ) def three_nodes_test(self): self.service = GenericService(self.test_context, 3) for node in self.service.nodes: node.account.ssh("echo hi") class RemoteAccountTest(Test): def __init__(self, test_context): super(RemoteAccountTest, self).__init__(test_context) self.account_service = RemoteAccountTestService(test_context) def setup(self): self.account_service.start() @cluster(num_nodes=1) def test_flaky(self): choices = [ Exception("FLAKE 1"), Exception("FLAKE 1"), Exception("FLAKE 2"), Exception("FLAKE 2"), Exception("FLAKE 3"), None, ] exp = random.choice(choices) if exp: raise exp @cluster(num_nodes=1) def test_ssh_capture_combine_stderr(self): """Test that ssh_capture correctly 
captures stderr and stdout from remote process.""" node = self.account_service.nodes[0] # swap stdout and stderr in the echo process cmd = "for i in $(seq 1 5); do echo $i 3>&1 1>&2 2>&3; done" ssh_output = node.account.ssh_capture(cmd, combine_stderr=True) bad_ssh_output = node.account.ssh_capture(cmd, combine_stderr=False) # Same command, but don't capture stderr lines = [int(line.strip()) for line in ssh_output] assert lines == [i for i in range(1, 6)] bad_lines = [int(line.strip()) for line in bad_ssh_output] assert bad_lines == [] @cluster(num_nodes=1) def test_ssh_output_combine_stderr(self): """Test that ssh_output correctly captures stderr and stdout from remote process.""" node = self.account_service.nodes[0] # swap stdout and stderr in the echo process cmd = "for i in $(seq 1 5); do echo $i 3>&1 1>&2 2>&3; done" ssh_output = node.account.ssh_output(cmd, combine_stderr=True) bad_ssh_output = node.account.ssh_output(cmd, combine_stderr=False) # Same command, but don't capture stderr assert ssh_output == b"\n".join([str(i).encode("utf-8") for i in range(1, 6)]) + b"\n", ssh_output assert bad_ssh_output == b"", bad_ssh_output @cluster(num_nodes=1) def test_ssh_capture(self): """Test that ssh_capture correctly captures output from ssh subprocess.""" node = self.account_service.nodes[0] cmd = "for i in $(seq 1 5); do echo $i; done" ssh_output = node.account.ssh_capture(cmd, combine_stderr=False) lines = [int(line.strip()) for line in ssh_output] assert lines == [i for i in range(1, 6)] @cluster(num_nodes=1) def test_ssh_output(self): """Test that ssh_output correctly captures output from ssh subprocess.""" node = self.account_service.nodes[0] cmd = "for i in $(seq 1 5); do echo $i; done" ssh_output = node.account.ssh_output(cmd, combine_stderr=False) assert ssh_output == b"\n".join([str(i).encode("utf-8") for i in range(1, 6)]) + b"\n", ssh_output @cluster(num_nodes=1) def test_monitor_log(self): """Tests log monitoring by writing to a log in the background 
thread""" node = self.account_service.nodes[0] # Make sure we start the log with some data, including the value we're going to grep for self.account_service.write_to_log("foo\nbar\nbaz") # Background thread that simulates a process writing to the log self.wrote_log_line = False def background_logging_thread(): # This needs to be large enough that we can verify we've actually # waited some time for the data to be written, but not too long that # the test takes a long time time.sleep(3) self.wrote_log_line = True self.account_service.write_to_log("foo\nbar\nbaz") with node.account.monitor_log(self.account_service.log_file) as monitor: logging_thread = Thread(target=background_logging_thread) logging_thread.start() monitor.wait_until("foo", timeout_sec=10, err_msg="Never saw expected log") assert self.wrote_log_line logging_thread.join(5.0) if logging_thread.is_alive(): raise Exception("Timed out waiting for background thread.") @cluster(num_nodes=1) def test_monitor_log_exception(self): """Tests log monitoring correctly throws an exception when the regex was not found""" node = self.account_service.nodes[0] # Make sure we start the log with some data, including the value we're going to grep for self.account_service.write_to_log("foo\nbar\nbaz") timeout = 3 try: with node.account.monitor_log(self.account_service.log_file) as monitor: start = time.time() monitor.wait_until("foo", timeout_sec=timeout, err_msg="Never saw expected log") assert False, "Log monitoring should have timed out and thrown an exception" except TimeoutError: # expected end = time.time() assert end - start > timeout, "Should have waited full timeout period while monitoring the log" @cluster(num_nodes=1) def test_kill_process(self): """Tests that kill_process correctly works""" grep_str = '"nc -l -p 5000"' def get_pids(): pid_cmd = f"ps ax | grep -i {grep_str} | grep -v grep | awk '{{print $1}}'" return list(node.account.ssh_capture(pid_cmd, callback=int)) node = self.account_service.nodes[0] # Run 
TCP service using netcat node.account.ssh_capture("nohup nc -l -p 5000 > /dev/null 2>&1 &") wait_until( lambda: len(get_pids()) > 0, timeout_sec=10, err_msg="Failed to start process within %d sec" % 10, ) # Kill service. node.account.kill_process(grep_str) wait_until( lambda: len(get_pids()) == 0, timeout_sec=10, err_msg="Failed to kill process within %d sec" % 10, ) class TestIterWrapper(Test): def setup(self): self.line_num = 6 self.eps = 0.01 self.service = GenericService(self.test_context, num_nodes=1) self.node = self.service.nodes[0] self.temp_file = "ducktape-test-" + str(random.randint(0, 100000)) contents = "" for i in range(self.line_num): contents += "%d\n" % i self.node.account.create_file(self.temp_file, contents) def test_iter_wrapper(self): """Test has_next functionality on the returned iterable item.""" output = self.node.account.ssh_capture("cat " + self.temp_file) for i in range(self.line_num): assert output.has_next() # with timeout in case of hang assert output.next().strip() == str(i) start = time.time() assert output.has_next() is False stop = time.time() assert stop - start < self.eps, "has_next() should return immediately" def test_iter_wrapper_timeout(self): """Test has_next with timeout""" output = self.node.account.ssh_capture("tail -F " + self.temp_file) # allow command to be executed before we check output with timeout_sec = 0 time.sleep(0.5) for i in range(self.line_num): assert output.has_next(timeout_sec=0) assert output.next().strip() == str(i) timeout = 0.25 start = time.time() # This check will last for the duration of the timeout because the the remote tail -F process # remains running, and the output stream is not closed. 
assert output.has_next(timeout_sec=timeout) is False stop = time.time() assert (stop - start >= timeout) and (stop - start) < timeout + self.eps, ( "has_next() should return right after %s second" % str(timeout) ) def teardown(self): # tail -F call above will leave stray processes, so clean up cmd = "for p in $(ps ax | grep -v grep | grep \"%s\" | awk '{print $1}'); do kill $p; done" % self.temp_file self.node.account.ssh(cmd, allow_fail=True) self.node.account.ssh("rm -f " + self.temp_file, allow_fail=True) class RemoteAccountCompressedTest(Test): def __init__(self, test_context): super(RemoteAccountCompressedTest, self).__init__(test_context) self.account_service = RemoteAccountTestService(test_context) self.test_context.session_context.compress = True self.tar_msg = False self.tar_error = False def setup(self): self.account_service.start() @cluster(num_nodes=1) def test_log_compression_with_non_existent_files(self): """Test that log compression with tar works even when a specific log file has not been generated (e.g. heap dump) """ self.test_context.logger.addFilter(CompressionErrorFilter(self)) self.copy_service_logs(None) if not self.tar_msg: raise Exception("Never saw attempt to compress log") if self.tar_error: raise Exception("Failure when compressing logs") class CompressionErrorFilter(logging.Filter): def __init__(self, test): super(CompressionErrorFilter, self).__init__() self.test = test def filter(self, record): if "tar czf" in record.msg: self.test.tar_msg = True if "Error" in record.msg: self.test.tar_error = True return True ================================================ FILE: systests/cluster/test_runner_operations.py ================================================ # Copyright 2022 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.services.service import Service
from ducktape.tests.test import Test
from ducktape.mark.resource import cluster

import time


class SimpleEchoService(Service):
    """One-node service that appends an incrementing counter to /tmp/log on each echo() call."""

    logs = {
        "my_log": {"path": "/tmp/log", "collect_default": True},
    }

    def __init__(self, context):
        super(SimpleEchoService, self).__init__(context, num_nodes=1)
        # Number of echo() calls performed so far; also the value written next.
        self.count = 0

    def echo(self):
        # Append the current counter to the remote log file, then advance it.
        self.nodes[0].account.ssh("echo {} >> /tmp/log".format(self.count))
        self.count += 1


class SimpleRunnerTest(Test):
    """Small tests of varying duration used to exercise runner-level flags (e.g. timeouts)."""

    def setup(self):
        self.service = SimpleEchoService(self.test_context)

    @cluster(num_nodes=1)
    def timeout_test(self):
        """
        a simple longer running test to test special run flags against.
        """
        self.service.start()
        # ~500 iterations at 0.1s each: roughly a minute of wall-clock time.
        while self.service.count < 500:
            self.service.echo()
            time.sleep(0.1)

    @cluster(num_nodes=1)
    def quick1_test(self):
        """
        a simple quick test to test basic execution.
        """
        self.service.start()
        while self.service.count < 20:
            self.service.echo()
            time.sleep(0.2)

    @cluster(num_nodes=1)
    def quick2_test(self):
        """
        a simple quick test to test basic execution.
        """
        self.service.start()
        while self.service.count < 20:
            self.service.echo()
            time.sleep(0.2)
================================================ FILE: tests/__init__.py ================================================
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext

__all__ = ["Test", "TestContext"]
================================================ FILE: tests/cluster/__init__.py ================================================
================================================ FILE: tests/cluster/check_cluster.py ================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections from ducktape.cluster.cluster_node import ClusterNode from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.cluster.node_spec import NodeSpec from ducktape.cluster.consts import LINUX, WINDOWS from tests.ducktape_mock import FakeCluster FakeRemoteAccount = collections.namedtuple("FakeRemoteAccount", ["operating_system"]) class CheckCluster(object): def setup_method(self, _): self.cluster = FakeCluster(0) self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=LINUX))) self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=LINUX))) self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS))) self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS))) self.cluster._available_nodes.add_node(ClusterNode(FakeRemoteAccount(operating_system=WINDOWS))) def spec(self, linux_nodes, windows_nodes): nodes = [] for i in range(linux_nodes): nodes.append(NodeSpec(LINUX)) for i in range(windows_nodes): nodes.append(NodeSpec(WINDOWS)) return ClusterSpec(nodes) def check_enough_capacity(self): assert self.cluster.available().nodes.can_remove_spec(self.spec(2, 2)) assert self.cluster.available().nodes.can_remove_spec(self.spec(2, 3)) def check_not_enough_capacity(self): assert not self.cluster.available().nodes.can_remove_spec(self.spec(5, 2)) assert not self.cluster.available().nodes.can_remove_spec(self.spec(5, 5)) assert not self.cluster.available().nodes.can_remove_spec(self.spec(3, 3)) ================================================ FILE: tests/cluster/check_cluster_spec.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.cluster.node_spec import NodeSpec class CheckClusterSpec(object): def check_cluster_spec_sizes(self): simple_linux_2 = ClusterSpec.simple_linux(2) assert 2 == len(simple_linux_2) assert 0 == len(ClusterSpec.empty()) def check_to_string(self): empty = ClusterSpec.empty() assert "[]" == str(empty) simple_linux_5 = ClusterSpec.simple_linux(5) assert '[{"num_nodes": 5, "os": "linux"}]' == str(simple_linux_5) def check_simple_linux_with_node_type(self): """Test simple_linux with node_type parameter.""" spec = ClusterSpec.simple_linux(3, node_type="large") assert len(spec) == 3 for node_spec in spec: assert node_spec.operating_system == "linux" assert node_spec.node_type == "large" def check_simple_linux_without_node_type(self): """Test simple_linux without node_type (backward compatibility).""" spec = ClusterSpec.simple_linux(2) assert len(spec) == 2 for node_spec in spec: assert node_spec.operating_system == "linux" assert node_spec.node_type is None def check_grouped_by_os_and_type_empty(self): """Test grouped_by_os_and_type on empty ClusterSpec via NodeContainer.""" spec = ClusterSpec.empty() grouped = spec.nodes.grouped_by_os_and_type() assert grouped == {} def check_grouped_by_os_and_type_single_type(self): """Test grouped_by_os_and_type with single node type via NodeContainer.""" spec = ClusterSpec.simple_linux(3, node_type="small") grouped = spec.nodes.grouped_by_os_and_type() assert grouped == {("linux", "small"): 3} def check_grouped_by_os_and_type_mixed(self): """Test grouped_by_os_and_type 
with mixed node types via NodeContainer.""" spec = ClusterSpec( nodes=[ NodeSpec("linux", node_type="small"), NodeSpec("linux", node_type="small"), NodeSpec("linux", node_type="large"), NodeSpec("linux", node_type=None), NodeSpec("windows", node_type="medium"), ] ) grouped = spec.nodes.grouped_by_os_and_type() assert grouped == { ("linux", "small"): 2, ("linux", "large"): 1, ("linux", None): 1, ("windows", "medium"): 1, } ================================================ FILE: tests/cluster/check_finite_subcluster.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.cluster.consts import LINUX from ducktape.cluster.finite_subcluster import FiniteSubcluster from ducktape.cluster.node_container import ( InsufficientResourcesError, NodeNotPresentError, ) from ducktape.services.service import Service import pickle import pytest class MockFiniteSubclusterNode: @property def operating_system(self): return LINUX class CheckFiniteSubcluster(object): single_node_cluster_json = {"nodes": [{"hostname": "localhost"}]} def check_cluster_size(self): cluster = FiniteSubcluster([]) assert len(cluster) == 0 n = 10 cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)]) assert len(cluster) == n def check_pickleable(self): cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(10)]) pickle.dumps(cluster) def check_allocate_free(self): n = 10 cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)]) assert len(cluster) == n assert cluster.num_available_nodes() == n nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=1)) assert len(nodes) == 1 assert len(cluster) == n assert cluster.num_available_nodes() == n - 1 nodes2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2)) assert len(nodes2) == 2 assert len(cluster) == n assert cluster.num_available_nodes() == n - 3 cluster.free(nodes) assert cluster.num_available_nodes() == n - 2 cluster.free(nodes2) assert cluster.num_available_nodes() == n def check_alloc_too_many(self): n = 10 cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)]) with pytest.raises(InsufficientResourcesError): cluster.alloc(Service.setup_cluster_spec(num_nodes=(n + 1))) def check_free_too_many(self): n = 10 cluster = FiniteSubcluster([MockFiniteSubclusterNode() for _ in range(n)]) nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=n)) with pytest.raises(NodeNotPresentError): nodes.append(MockFiniteSubclusterNode()) cluster.free(nodes) ================================================ FILE: tests/cluster/check_json.py 
================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.cluster.json import JsonCluster from ducktape.cluster.node_container import InsufficientResourcesError from ducktape.services.service import Service import pickle import pytest from tests.runner.fake_remote_account import create_fake_remote_account def create_json_cluster(*args, **kwargs): return JsonCluster(*args, make_remote_account_func=create_fake_remote_account, **kwargs) class CheckJsonCluster(object): single_node_cluster_json = {"nodes": [{"ssh_config": {"host": "localhost"}}]} def check_invalid_json(self): # Missing list of nodes with pytest.raises(ValueError): create_json_cluster({}) # Missing hostname, which is required with pytest.raises(ValueError): create_json_cluster({"nodes": [{}]}) @staticmethod def cluster_hostnames(nodes): return set([node.account.hostname for node in nodes]) def check_cluster_size(self): cluster = create_json_cluster({"nodes": []}) assert len(cluster) == 0 n = 10 cluster = create_json_cluster({"nodes": [{"ssh_config": {"hostname": "localhost%d" % x}} for x in range(n)]}) assert len(cluster) == n def check_pickleable(self): cluster = create_json_cluster( { "nodes": [ {"ssh_config": {"host": "localhost1"}}, {"ssh_config": {"host": "localhost2"}}, {"ssh_config": {"host": "localhost3"}}, ] } ) pickle.dumps(cluster) def check_allocate_free(self): cluster = create_json_cluster( { "nodes": [ {"ssh_config": 
{"host": "localhost1"}}, {"ssh_config": {"host": "localhost2"}}, {"ssh_config": {"host": "localhost3"}}, ] } ) assert len(cluster) == 3 assert cluster.num_available_nodes() == 3 nodes = cluster.alloc(Service.setup_cluster_spec(num_nodes=1)) nodes_hostnames = self.cluster_hostnames(nodes) assert len(cluster) == 3 assert cluster.num_available_nodes() == 2 nodes2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2)) nodes2_hostnames = self.cluster_hostnames(nodes2) assert len(cluster) == 3 assert cluster.num_available_nodes() == 0 assert nodes_hostnames.isdisjoint(nodes2_hostnames) cluster.free(nodes) assert cluster.num_available_nodes() == 1 cluster.free(nodes2) assert cluster.num_available_nodes() == 3 def check_parsing(self): """Checks that RemoteAccounts are generated correctly from input JSON""" node = create_json_cluster({"nodes": [{"ssh_config": {"host": "hostname"}}]}).alloc( Service.setup_cluster_spec(num_nodes=1) )[0] assert node.account.hostname == "hostname" assert node.account.user is None ssh_config = { "host": "hostname", "user": "user", "hostname": "localhost", "port": 22, } node = create_json_cluster( {"nodes": [{"hostname": "hostname", "user": "user", "ssh_config": ssh_config}]} ).alloc(Service.setup_cluster_spec(num_nodes=1))[0] assert node.account.hostname == "hostname" assert node.account.user == "user" # check ssh configs assert node.account.ssh_config.host == "hostname" assert node.account.ssh_config.user == "user" assert node.account.ssh_config.hostname == "localhost" assert node.account.ssh_config.port == 22 def check_exhausts_supply(self): cluster = create_json_cluster(self.single_node_cluster_json) with pytest.raises(InsufficientResourcesError): cluster.alloc(Service.setup_cluster_spec(num_nodes=2)) def check_node_names(self): cluster = create_json_cluster( { "nodes": [ {"ssh_config": {"host": "localhost1"}}, {"ssh_config": {"host": "localhost2"}}, {"ssh_config": {"host": "localhost3"}}, ] } ) hosts = set(["localhost1", "localhost2", 
"localhost3"]) nodes = cluster.alloc(cluster.available()) assert hosts == set(node.name for node in nodes) ================================================ FILE: tests/cluster/check_localhost.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.cluster.localhost import LocalhostCluster from ducktape.services.service import Service import pickle class CheckLocalhostCluster(object): def setup_method(self, _): self.cluster = LocalhostCluster() def check_size(self): len(self.cluster) >= 2**31 - 1 def check_pickleable(self): cluster = LocalhostCluster() pickle.dumps(cluster) def check_request_free(self): available = self.cluster.num_available_nodes() initial_size = len(self.cluster) # Should be able to allocate arbitrarily many nodes nodes = self.cluster.alloc(Service.setup_cluster_spec(num_nodes=100)) assert len(nodes) == 100 for i, node in enumerate(nodes): assert node.account.hostname == "localhost%d" % i assert node.account.ssh_hostname == "localhost" assert node.account.ssh_config.hostname == "localhost" assert node.account.ssh_config.port == 22 assert node.account.user is None assert self.cluster.num_available_nodes() == (available - 100) assert len(self.cluster) == initial_size # This shouldn't change self.cluster.free(nodes) assert self.cluster.num_available_nodes() == available ================================================ FILE: tests/cluster/check_node_container.py 
================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.cluster.cluster_node import ClusterNode from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.cluster.consts import WINDOWS, LINUX from ducktape.cluster.node_spec import NodeSpec from ducktape.cluster.node_container import ( NodeContainer, NodeNotPresentError, InsufficientResourcesError, InsufficientHealthyNodesError, ) import pytest from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig from tests.ducktape_mock import MockAccount from tests.runner.fake_remote_account import FakeRemoteAccount, FakeWindowsRemoteAccount def fake_account(host, is_available=True, node_type=None): return FakeRemoteAccount( ssh_config=RemoteAccountSSHConfig(host=host), is_available=is_available, node_type=node_type ) def fake_win_account(host, is_available=True, node_type=None): return FakeWindowsRemoteAccount( ssh_config=RemoteAccountSSHConfig(host=host), is_available=is_available, node_type=node_type ) def count_nodes_by_os(container, target_os): """Helper to count nodes by OS from node_groups.""" count = 0 for (os, _), nodes in container.node_groups.items(): if os == target_os: count += len(nodes) return count class CheckNodeContainer(object): def check_sizes(self): empty = NodeContainer() assert 0 == empty.size() assert 0 == len(empty) nodes = [ClusterNode(MockAccount())] container = NodeContainer(nodes) assert 1 == container.size() 
assert 1 == len(container) def check_add_and_remove(self): nodes = [ ClusterNode(MockAccount()), ClusterNode(MockAccount()), ClusterNode(MockAccount()), ClusterNode(MockAccount()), ClusterNode(MockAccount()), ] container = NodeContainer([]) assert 0 == len(container) container.add_node(nodes[0]) container.add_node(nodes[1]) container.add_node(nodes[2]) container2 = container.clone() i = 0 for node in container: assert nodes[i] == node i += 1 assert 3 == len(container) container.remove_node(nodes[0]) with pytest.raises(NodeNotPresentError): container.remove_node(nodes[0]) assert 2 == len(container) assert 3 == len(container2) def check_remove_single_node_spec(self): """Check remove_spec() method - verify a simple happy path of removing a single node""" accounts = [ fake_account("host1"), fake_account("host2"), fake_win_account("w1"), fake_win_account("w2"), ] container = NodeContainer(accounts) one_linux_node_spec = ClusterSpec(nodes=[NodeSpec(LINUX)]) one_windows_node_spec = ClusterSpec(nodes=[NodeSpec(WINDOWS)]) def _remove_single_node(one_node_spec, os): assert container.can_remove_spec(one_node_spec) good_nodes, bad_nodes = container.remove_spec(one_node_spec) assert good_nodes and len(good_nodes) == 1 assert good_nodes[0].os == os assert not bad_nodes _remove_single_node(one_windows_node_spec, WINDOWS) assert count_nodes_by_os(container, LINUX) == 2 assert count_nodes_by_os(container, WINDOWS) == 1 _remove_single_node(one_windows_node_spec, WINDOWS) assert count_nodes_by_os(container, LINUX) == 2 assert count_nodes_by_os(container, WINDOWS) == 0 assert not container.can_remove_spec(one_windows_node_spec) with pytest.raises(InsufficientResourcesError): container.remove_spec(one_windows_node_spec) _remove_single_node(one_linux_node_spec, LINUX) assert count_nodes_by_os(container, LINUX) == 1 assert count_nodes_by_os(container, WINDOWS) == 0 _remove_single_node(one_linux_node_spec, LINUX) assert count_nodes_by_os(container, LINUX) == 0 assert 
count_nodes_by_os(container, WINDOWS) == 0 assert not container.can_remove_spec(one_linux_node_spec) with pytest.raises(InsufficientResourcesError): container.remove_spec(one_linux_node_spec) @pytest.mark.parametrize( "cluster_spec", [ pytest.param( ClusterSpec(nodes=[NodeSpec(LINUX), NodeSpec(WINDOWS), NodeSpec(WINDOWS)]), id="not enough windows nodes", ), pytest.param( ClusterSpec(nodes=[NodeSpec(LINUX), NodeSpec(LINUX), NodeSpec(WINDOWS)]), id="not enough linux nodes", ), pytest.param( ClusterSpec( nodes=[ NodeSpec(LINUX), NodeSpec(LINUX), NodeSpec(WINDOWS), NodeSpec(WINDOWS), ] ), id="not enough nodes", ), ], ) def check_not_enough_nodes_to_remove(self, cluster_spec): """ Check what happens if there aren't enough resources in this container to match a given spec. Various parametrizations check the behavior for when there are enough nodes for one OS but not another, or for both. """ accounts = [fake_account("host1"), fake_win_account("w1")] container = NodeContainer(accounts) original_container = container.clone() assert not container.can_remove_spec(cluster_spec) assert len(container.attempt_remove_spec(cluster_spec)) > 0 with pytest.raises(InsufficientResourcesError): container.remove_spec(cluster_spec) # check that container was not modified assert container.node_groups == original_container.node_groups @pytest.mark.parametrize( "accounts", [ pytest.param( [ fake_account("host1"), fake_account("host2"), fake_win_account("w1"), fake_win_account("w2", is_available=False), ], id="windows not available", ), pytest.param( [ fake_account("host1"), fake_account("host2", is_available=False), fake_win_account("w1"), fake_win_account("w2"), ], id="linux not available", ), pytest.param( [ fake_account("host1"), fake_account("host2", is_available=False), fake_win_account("w1"), fake_win_account("w2", is_available=False), ], id="neither is available", ), ], ) def check_not_enough_healthy_nodes(self, accounts): """ When there's not enough healthy nodes in any of the OS-s, 
we obviously don't want to remove anything. Even when there's enough healthy nodes for one of the OS-s, but not enough in another one, we don't want to remove any nodes at all. Various sets of params check if there aren't enough healthy nodes for one OS but not the other, or both. """ container = NodeContainer(accounts) original_container = container.clone() expected_bad_nodes = [acc for acc in accounts if not acc.is_available] spec = ClusterSpec( nodes=[ NodeSpec(LINUX), NodeSpec(LINUX), NodeSpec(WINDOWS), NodeSpec(WINDOWS), ] ) assert container.can_remove_spec(spec) with pytest.raises(InsufficientHealthyNodesError) as exc_info: container.remove_spec(spec) assert exc_info.value.bad_nodes == expected_bad_nodes # check that no nodes were actually allocated, but unhealthy ones were removed from the cluster original_container.remove_nodes(expected_bad_nodes) assert container.node_groups == original_container.node_groups @pytest.mark.parametrize( "accounts", [ pytest.param( [ fake_account("host1"), fake_account("host2"), fake_account("host3"), fake_win_account("w1", is_available=False), fake_win_account("w2"), fake_win_account("w2"), ], id="windows not available", ), pytest.param( [ fake_account("host1", is_available=False), fake_account("host2"), fake_account("host3"), fake_win_account("w1"), fake_win_account("w2"), fake_win_account("w2"), ], id="linux not available", ), pytest.param( [ fake_account("host1", is_available=False), fake_account("host2"), fake_account("host3"), fake_win_account("w1", is_available=False), fake_win_account("w2"), fake_win_account("w2"), ], id="neither is available", ), pytest.param( [ fake_account("host1"), fake_account("host2"), fake_account("host3"), fake_win_account("w1"), fake_win_account("w2"), fake_win_account("w2"), ], id="all are available", ), ], ) def check_enough_healthy_but_some_bad_nodes_too(self, accounts): """ Check that we can successfully allocate all necessary nodes - even if some nodes don't pass health checks, we still 
have enough nodes to match the provided cluster spec. This test assumes that if we do encounter unhealthy node, we encounter it before we can finish allocating healthy ones, otherwise it would just mean testing the happy path (which we do in one of the params). """ container = NodeContainer(accounts) original_container = container.clone() expected_bad_nodes = [acc for acc in accounts if not acc.is_available] spec = ClusterSpec( nodes=[ NodeSpec(LINUX), NodeSpec(LINUX), NodeSpec(WINDOWS), NodeSpec(WINDOWS), ] ) assert container.can_remove_spec(spec) good_nodes, bad_nodes = container.remove_spec(spec) # alloc should succeed # check that we did catch a bad node if any assert bad_nodes == expected_bad_nodes # check that container has exactly the right number of nodes left - # we removed len(spec) healthy nodes, plus len(expected_bad_nodes) of unhealthy nodes. assert len(container) == len(original_container) - len(spec) - len(expected_bad_nodes) # check that we got 2 windows nodes and two linux nodes in response, # don't care which ones in particular assert len(good_nodes) == 4 actual_linux = [node for node in good_nodes if node.os == LINUX] assert len(actual_linux) == 2 actual_win = [node for node in good_nodes if node.os == WINDOWS] assert len(actual_win) == 2 def check_empty_cluster_spec(self): accounts = [fake_account("host1"), fake_account("host2"), fake_account("host3")] container = NodeContainer(accounts) spec = ClusterSpec.empty() assert not container.attempt_remove_spec(spec) assert container.can_remove_spec(spec) good, bad = container.remove_spec(spec) assert not good assert not bad def check_none_cluster_spec(self): accounts = [fake_account("host1"), fake_account("host2"), fake_account("host3")] container = NodeContainer(accounts) spec = None assert container.attempt_remove_spec(spec) assert not container.can_remove_spec(spec) with pytest.raises(InsufficientResourcesError): container.remove_spec(spec) # ==================== node_type tests 
==================== def check_node_groups_by_type(self): """Check that nodes are grouped by (os, node_type) in node_groups.""" accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), fake_account("host4"), # no node_type (None) fake_win_account("w1", node_type="small"), ] container = NodeContainer(accounts) # Check node_groups has correct keys assert (LINUX, "small") in container.node_groups assert (LINUX, "large") in container.node_groups assert (LINUX, None) in container.node_groups assert (WINDOWS, "small") in container.node_groups # Check counts assert len(container.node_groups[(LINUX, "small")]) == 2 assert len(container.node_groups[(LINUX, "large")]) == 1 assert len(container.node_groups[(LINUX, None)]) == 1 assert len(container.node_groups[(WINDOWS, "small")]) == 1 def check_remove_spec_with_node_type(self): """Check remove_spec with node_type specified in the cluster spec.""" accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), ] container = NodeContainer(accounts) # Request 1 small node small_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type="small")]) assert container.can_remove_spec(small_spec) good_nodes, bad_nodes = container.remove_spec(small_spec) assert len(good_nodes) == 1 assert good_nodes[0].node_type == "small" assert not bad_nodes # Should have 1 small and 1 large left assert len(container.node_groups.get((LINUX, "small"), [])) == 1 assert len(container.node_groups.get((LINUX, "large"), [])) == 1 def check_remove_spec_with_node_type_not_available(self): """Check remove_spec fails when requested node_type is not available.""" accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), ] container = NodeContainer(accounts) # Request a large node (not available) large_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type="large")]) assert not 
container.can_remove_spec(large_spec) with pytest.raises(InsufficientResourcesError): container.remove_spec(large_spec) def check_remove_spec_node_type_none_matches_any(self): """Check that node_type=None in spec matches any available node.""" accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="large"), ] container = NodeContainer(accounts) # Request a node with no specific type - should match any any_spec = ClusterSpec(nodes=[NodeSpec(LINUX, node_type=None)]) assert container.can_remove_spec(any_spec) good_nodes, _ = container.remove_spec(any_spec) assert len(good_nodes) == 1 # Should have allocated one of the available nodes assert good_nodes[0].node_type in ("small", "large") def check_remove_spec_mixed_node_types(self): """Check remove_spec with mixed node_type requirements.""" accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), fake_account("host4", node_type="large"), ] container = NodeContainer(accounts) # Request 1 small and 1 large mixed_spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="large"), ] ) assert container.can_remove_spec(mixed_spec) good_nodes, _ = container.remove_spec(mixed_spec) assert len(good_nodes) == 2 small_nodes = [n for n in good_nodes if n.node_type == "small"] large_nodes = [n for n in good_nodes if n.node_type == "large"] assert len(small_nodes) == 1 assert len(large_nodes) == 1 # ==================== Double-counting fix tests ==================== def check_mixed_typed_and_untyped_double_counting_rejected(self): """ Test that mixed typed and untyped requirements correctly fail when total capacity is insufficient (the double-counting bug fix). 
Scenario: - Available: 3 Linux nodes (2 small, 1 large) - Request: 2 linux/small + 2 linux/any - Expected: FAIL (need 4 nodes, only have 3) Before the fix, this would incorrectly pass because: - linux/small check: 2 available >= 2 needed ✓ - linux/any check: 3 available >= 2 needed ✓ But the any-type was double-counting the small nodes! """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), ] container = NodeContainer(accounts) # Request 2 small + 2 any = 4 total, but only 3 available impossible_spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), ] ) assert not container.can_remove_spec(impossible_spec) with pytest.raises(InsufficientResourcesError): container.remove_spec(impossible_spec) def check_mixed_typed_and_untyped_valid_passes(self): """ Test that mixed typed and untyped requirements correctly pass when there is sufficient capacity. Scenario: - Available: 4 Linux nodes (2 small, 2 large) - Request: 2 linux/small + 1 linux/any - Expected: PASS (need 3 nodes, have 4) """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), fake_account("host4", node_type="large"), ] container = NodeContainer(accounts) # Request 2 small + 1 any = 3 total, have 4 available valid_spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type=None), ] ) assert container.can_remove_spec(valid_spec) good_nodes, bad_nodes = container.remove_spec(valid_spec) assert len(good_nodes) == 3 assert not bad_nodes def check_specific_type_shortage_detected(self): """ Test that shortage of a specific type is detected even when total OS capacity is sufficient. 
Scenario: - Available: 3 Linux nodes (1 small, 2 large) - Request: 2 linux/small + 1 linux/any - Expected: FAIL (need 2 small, only have 1) """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="large"), fake_account("host3", node_type="large"), ] container = NodeContainer(accounts) # Request 2 small + 1 any - total capacity OK but not enough small spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type=None), ] ) assert not container.can_remove_spec(spec) error_msg = container.attempt_remove_spec(spec) assert "linux/small" in error_msg def check_holistic_os_capacity_check(self): """ Test that total OS capacity is checked before detailed type checks. Scenario: - Available: 2 Linux nodes (1 small, 1 large) - Request: 1 linux/small + 1 linux/large + 1 linux/any - Expected: FAIL (need 3 total, only have 2) """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="large"), ] container = NodeContainer(accounts) spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="large"), NodeSpec(LINUX, node_type=None), ] ) assert not container.can_remove_spec(spec) error_msg = container.attempt_remove_spec(spec) # Should report total Linux shortage assert "linux" in error_msg.lower() def check_multi_os_mixed_requirements(self): """ Test holistic validation works correctly with multiple OS types. 
Scenario: - Available: 3 Linux (2 small, 1 large), 2 Windows (1 small, 1 large) - Request: 2 linux/small + 1 linux/any + 1 windows/any - Expected: FAIL for Linux (need 3, have 3, but 2 small reserved leaves only 1 for any) """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="small"), fake_account("host3", node_type="large"), fake_win_account("w1", node_type="small"), fake_win_account("w2", node_type="large"), ] container = NodeContainer(accounts) # For Linux: 2 small + 2 any = 4 total, but only 3 available # For Windows: 1 any = fine spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), NodeSpec(WINDOWS, node_type=None), ] ) assert not container.can_remove_spec(spec) def check_only_any_type_requirements(self): """ Test that requests with only any-type requirements work correctly. """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="large"), fake_account("host3"), # no type ] container = NodeContainer(accounts) # Request 3 any-type - should work spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), ] ) assert container.can_remove_spec(spec) good_nodes, _ = container.remove_spec(spec) assert len(good_nodes) == 3 # Request 4 any-type - should fail container2 = NodeContainer(accounts) spec2 = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), NodeSpec(LINUX, node_type=None), ] ) assert not container2.can_remove_spec(spec2) def check_allocation_order_specific_before_any(self): """ Test that specific node_type requirements are allocated before any-type. This prevents any-type requests from "stealing" nodes needed by specific types. 
Scenario: - Available: 2 Linux nodes (1 small, 1 large) - Request: 1 linux/small + 1 linux/any - Expected: SUCCESS - small gets the small node, any gets the large node Without proper ordering, if linux/any is processed first, it might take the small node, leaving linux/small with no matching nodes. """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="large"), ] container = NodeContainer(accounts) # Request 1 small + 1 any - should succeed with proper ordering spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type=None), ] ) assert container.can_remove_spec(spec) good_nodes, bad_nodes = container.remove_spec(spec) assert len(good_nodes) == 2 assert not bad_nodes # Verify we got one of each type types = [n.node_type for n in good_nodes] assert "small" in types assert "large" in types def check_allocation_order_with_multiple_specific_types(self): """ Test allocation order with multiple specific types and any-type. Scenario: - Available: 3 Linux nodes (1 small, 1 medium, 1 large) - Request: 1 linux/small + 1 linux/large + 1 linux/any - Expected: SUCCESS - specific types get their nodes, any gets medium """ accounts = [ fake_account("host1", node_type="small"), fake_account("host2", node_type="medium"), fake_account("host3", node_type="large"), ] container = NodeContainer(accounts) spec = ClusterSpec( nodes=[ NodeSpec(LINUX, node_type="small"), NodeSpec(LINUX, node_type="large"), NodeSpec(LINUX, node_type=None), ] ) assert container.can_remove_spec(spec) good_nodes, _ = container.remove_spec(spec) assert len(good_nodes) == 3 # Verify we got all three types types = [n.node_type for n in good_nodes] assert "small" in types assert "medium" in types assert "large" in types ================================================ FILE: tests/cluster/check_remoteaccount.py ================================================ # Copyright 2015 Confluent Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.errors import TimeoutError from tests.ducktape_mock import MockAccount from tests.test_utils import find_available_port from ducktape.cluster.remoteaccount import RemoteAccount from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig import pytest import logging from threading import Thread from http.server import SimpleHTTPRequestHandler import socketserver import threading import time class DummyException(Exception): pass def raise_error_checker(error, remote_account): raise DummyException("dummy raise: {}\nfrom: {}".format(error, remote_account)) def raise_no_error_checker(error, remote_account): pass class SimpleServer(object): """Helper class which starts a simple server listening on localhost at the specified port""" def __init__(self): self.port = find_available_port() self.handler = SimpleHTTPRequestHandler self.httpd = socketserver.TCPServer(("", self.port), self.handler) self.close_signal = threading.Event() self.server_started = False def start(self, delay_sec=0.0): """Run the server after specified delay""" def run(): self.close_signal.wait(delay_sec) if not self.close_signal.is_set(): self.server_started = True self.httpd.serve_forever() self.background_thread = Thread(target=run) self.background_thread.start() def stop(self): self.close_signal.set() if self.server_started: self.httpd.shutdown() self.background_thread.join(timeout=0.5) if self.background_thread.is_alive(): raise 
Exception("SimpleServer failed to stop quickly") class CheckRemoteAccount(object): def setup(self): self.server = SimpleServer() self.account = MockAccount() def check_wait_for_http(self): """Check waiting without timeout""" self.server.start(delay_sec=0.0) self.account.wait_for_http_service(port=self.server.port, headers={}, timeout=10, path="/") def check_wait_for_http_timeout(self): """Check waiting with timeout""" timeout = 1 start = time.time() self.server.start(delay_sec=5) try: self.account.wait_for_http_service(port=self.server.port, headers={}, timeout=timeout, path="/") raise Exception("Should have timed out waiting for server to start") except TimeoutError: # expected behavior. Now check that we're reasonably close to the expected timeout # This is a fairly loose check since there are various internal timeouts that can affect the overall # timing actual_timeout = time.time() - start assert abs(actual_timeout - timeout) / timeout < 1 @pytest.mark.parametrize( "checkers", [ [raise_error_checker], [raise_no_error_checker, raise_error_checker], [raise_error_checker, raise_no_error_checker], ], ) def check_ssh_checker(self, checkers): self.server.start() ssh_config = RemoteAccountSSHConfig.from_string( """ Host dummy_host.com Hostname dummy_host.name.com Port 22 User dummy ConnectTimeout 1 """ ) self.account = RemoteAccount(ssh_config, ssh_exception_checks=checkers) with pytest.raises(DummyException): self.account.ssh("echo test") def teardown(self): self.server.stop() class CheckRemoteAccountEquality(object): def check_remote_account_equality(self): """Different instances of remote account initialized with the same parameters should be equal.""" ssh_config = RemoteAccountSSHConfig(host="thehost", hostname="localhost", port=22) kwargs = { "ssh_config": ssh_config, "externally_routable_ip": "345", "logger": logging.getLogger(__name__), } r1 = RemoteAccount(**kwargs) r2 = RemoteAccount(**kwargs) assert r1 == r2 ================================================ 
FILE: tests/cluster/check_vagrant.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.cluster.vagrant import VagrantCluster from ducktape.services.service import Service import json import pickle import os import random import pytest from ducktape.cluster.remoteaccount import RemoteAccountError from tests.runner.fake_remote_account import create_fake_remote_account TWO_HOSTS = """Host worker1 HostName 127.0.0.1 User vagrant Port 2222 UserKnownHostsFile /dev/null StrictHostKeyChecking no PasswordAuthentication no IdentityFile /Users/foo/ducktape.git/.vagrant/machines/worker1/virtualbox/private_key IdentitiesOnly yes LogLevel FATAL Host worker2 HostName 127.0.0.2 User vagrant Port 2200 UserKnownHostsFile /dev/null StrictHostKeyChecking no PasswordAuthentication no IdentityFile /Users/foo/ducktape.git/.vagrant/machines/worker2/virtualbox/private_key IdentitiesOnly yes LogLevel FATAL """ def make_vagrant_cluster(*args, **kwargs): return VagrantCluster(make_remote_account_func=create_fake_remote_account, *args, **kwargs) class CheckVagrantCluster(object): def setup_method(self, _): # We roll our own tempfile name instead of using python tempfile module because # in some cases, we want self.cluster_file to be the name of a file which does not yet exist self.cluster_file = "cluster_file_temporary-%d.json" % random.randint(1, 2**63 - 1) if os.path.exists(self.cluster_file): 
os.remove(self.cluster_file) def teardown_method(self, _): if os.path.exists(self.cluster_file): os.remove(self.cluster_file) def _set_monkeypatch_attr(self, monkeypatch): monkeypatch.setattr( "ducktape.cluster.vagrant.VagrantCluster._vagrant_ssh_config", lambda vc: (TWO_HOSTS, None), ) monkeypatch.setattr( "ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount.fetch_externally_routable_ip", lambda vc: "127.0.0.1", ) def check_pickleable(self, monkeypatch): self._set_monkeypatch_attr(monkeypatch) cluster = VagrantCluster() pickle.dumps(cluster) def check_one_host_parsing(self, monkeypatch): """check the behavior of VagrantCluster when cluster_file is not specified. VagrantCluster should read cluster information from _vagrant_ssh_config(). """ self._set_monkeypatch_attr(monkeypatch) cluster = make_vagrant_cluster() assert len(cluster) == 2 assert cluster.num_available_nodes() == 2 node1, node2 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2)) assert node1.account.hostname == "worker1" assert node1.account.user == "vagrant" assert node1.account.ssh_hostname == "127.0.0.1" assert node2.account.hostname == "worker2" assert node2.account.user == "vagrant" assert node2.account.ssh_hostname == "127.0.0.2" def check_cluster_file_write(self, monkeypatch): """check the behavior of VagrantCluster when cluster_file is specified but the file doesn't exist. VagrantCluster should read cluster information from _vagrant_ssh_config() and write the information to cluster_file. 
""" self._set_monkeypatch_attr(monkeypatch) assert not os.path.exists(self.cluster_file) cluster = make_vagrant_cluster(cluster_file=self.cluster_file) cluster_json_expected = {} nodes = [ { "externally_routable_ip": node_account.externally_routable_ip, "ssh_config": { "host": node_account.ssh_config.host, "hostname": node_account.ssh_config.hostname, "user": node_account.ssh_config.user, "identityfile": node_account.ssh_config.identityfile, "password": node_account.ssh_config.password, "port": node_account.ssh_config.port, "connecttimeout": None, }, } for node_account in cluster._available_accounts ] cluster_json_expected["nodes"] = nodes cluster_json_actual = json.load(open(os.path.abspath(self.cluster_file))) assert cluster_json_actual == cluster_json_expected def check_cluster_file_read(self, monkeypatch): """check the behavior of VagrantCluster when cluster_file is specified and the file exists. VagrantCluster should read cluster information from cluster_file. """ self._set_monkeypatch_attr(monkeypatch) # To verify that VagrantCluster reads cluster information from the cluster_file, the # content in the file is intentionally made different from that returned by _vagrant_ssh_config(). 
nodes_expected = [] node1_expected = { "externally_routable_ip": "127.0.0.3", "ssh_config": { "host": "worker3", "hostname": "127.0.0.3", "user": "vagrant", "port": 2222, "password": "password", "identityfile": "/path/to/identfile3", "connecttimeout": None, }, } nodes_expected.append(node1_expected) node2_expected = { "externally_routable_ip": "127.0.0.2", "ssh_config": { "host": "worker2", "hostname": "127.0.0.2", "user": "vagrant", "port": 2223, "password": None, "identityfile": "/path/to/indentfile2", "connecttimeout": 10, }, } nodes_expected.append(node2_expected) cluster_json_expected = {} cluster_json_expected["nodes"] = nodes_expected json.dump( cluster_json_expected, open(self.cluster_file, "w+"), indent=2, separators=(",", ": "), sort_keys=True, ) # Load the cluster from the json file we just created cluster = make_vagrant_cluster(cluster_file=self.cluster_file) assert len(cluster) == 2 assert cluster.num_available_nodes() == 2 node2, node3 = cluster.alloc(Service.setup_cluster_spec(num_nodes=2)) assert node3.account.hostname == "worker2" assert node3.account.user == "vagrant" assert node3.account.ssh_hostname == "127.0.0.2" assert node3.account.ssh_config.to_json() == node2_expected["ssh_config"] assert node2.account.hostname == "worker3" assert node2.account.user == "vagrant" assert node2.account.ssh_hostname == "127.0.0.3" assert node2.account.ssh_config.to_json() == node1_expected["ssh_config"] def check_no_valid_network_devices(self, monkeypatch): """ test to make sure that a remote account error is raised when no network devices are found """ monkeypatch.setattr( "ducktape.cluster.vagrant.VagrantCluster._vagrant_ssh_config", lambda vc: (TWO_HOSTS, None), ) monkeypatch.setattr( "ducktape.cluster.linux_remoteaccount.LinuxRemoteAccount.get_network_devices", lambda account: [], ) with pytest.raises(RemoteAccountError): VagrantCluster() ================================================ FILE: tests/command_line/__init__.py 
================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/command_line/check_main.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.command_line.main import get_user_defined_globals
from ducktape.command_line.main import setup_results_directory
from ducktape.command_line.main import update_latest_symlink
import json
import os
import os.path
import pickle
import pytest
import tempfile


class CheckSetupResultsDirectory(object):
    def setup_method(self, _):
        # Fresh results root per test; results_dir/latest are paths only and
        # may or may not exist depending on the scenario.
        self.results_root = tempfile.mkdtemp()
        self.results_dir = os.path.join(self.results_root, "results_directory")
        self.latest_symlink = os.path.join(self.results_root, "latest")

    def validate_directories(self):
        """Validate existence of results directory and correct symlink"""
        assert os.path.exists(self.results_dir)
        assert os.path.exists(self.latest_symlink)
        assert os.path.islink(self.latest_symlink)

        # check symlink points to correct location
        assert os.path.realpath(self.latest_symlink) == os.path.realpath(self.results_dir)

    def check_creation(self):
        """Check results and symlink from scratch"""
        assert not os.path.exists(self.results_dir)
        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()

    def check_symlink(self):
        """Check "latest" symlink behavior"""
        # if symlink already exists
        old_results = os.path.join(self.results_root, "OLD")
        os.mkdir(old_results)
        os.symlink(old_results, self.latest_symlink)
        assert os.path.islink(self.latest_symlink) and os.path.exists(self.latest_symlink)
        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()

        # Try again if symlink exists and points to nothing
        os.rmdir(self.results_dir)
        assert os.path.islink(self.latest_symlink) and not os.path.exists(self.latest_symlink)
        setup_results_directory(self.results_dir)
        update_latest_symlink(self.results_root, self.results_dir)
        self.validate_directories()


# Fixture strings for the globals-parsing tests below.
globals_json = """
{
    "x": 200
}
"""

invalid_globals_json = """
{
    can't parse this!: ?right?
}
"""

valid_json_not_dict = """
[
    {
        "x": 200,
        "y": 300
    }
]
"""


class CheckUserDefinedGlobals(object):
    """Tests for the helper method which parses in user defined globals option"""

    def check_immutable(self):
        """Expect the user defined dict object to be immutable."""
        global_dict = get_user_defined_globals(globals_json)
        # Mutating an existing key and inserting a new key must both be rejected.
        with pytest.raises(NotImplementedError):
            global_dict["x"] = -1
        with pytest.raises(NotImplementedError):
            global_dict["y"] = 3

    def check_pickleable(self):
        """Expect the user defined dict object to be pickleable"""
        globals_dict = get_user_defined_globals(globals_json)
        assert globals_dict  # Need to test non-empty dict, to ensure py3 compatibility
        assert pickle.loads(pickle.dumps(globals_dict)) == globals_dict

    def check_parseable_json_string(self):
        """Check if globals_json is parseable as JSON, we get back a dictionary view of parsed JSON."""
        globals_dict = get_user_defined_globals(globals_json)
        assert globals_dict == json.loads(globals_json)

    def check_unparseable(self):
        """If globals string is not a path to a file, and not parseable as JSON we want to raise a ValueError"""
        with pytest.raises(ValueError):
            get_user_defined_globals(invalid_globals_json)

    def check_parse_from_file(self):
        """Validate that, given a filename of a file containing valid JSON, we correctly parse the file contents."""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                fh.write(globals_json)
            global_dict = get_user_defined_globals(fname)
            assert global_dict == json.loads(globals_json)
            assert global_dict["x"] == 200
        finally:
            os.remove(fname)

    def check_bad_parse_from_file(self):
        """Validate behavior when given file containing invalid JSON"""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                # Write invalid JSON
                fh.write(invalid_globals_json)
            with pytest.raises(ValueError):
                get_user_defined_globals(fname)
        finally:
            os.remove(fname)

    def check_non_dict(self):
        """Valid JSON which does not parse as a dict should raise a ValueError"""
        # Should be able to parse this as JSON
        json.loads(valid_json_not_dict)
        with pytest.raises(ValueError):
            get_user_defined_globals(valid_json_not_dict)

    def check_non_dict_from_file(self):
        """Validate behavior when given file containing valid JSON which does not parse as a dict"""
        _, fname = tempfile.mkstemp()
        try:
            with open(fname, "w") as fh:
                # Write valid JSON which does not parse as a dict
                fh.write(valid_json_not_dict)
            # Error message should mention the offending file.
            with pytest.raises(ValueError, match=fname):
                get_user_defined_globals(fname)
        finally:
            os.remove(fname)


================================================
FILE: tests/command_line/check_parse_args.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.command_line.parse_args import parse_args
from io import StringIO
import os
import re
import shutil
import sys
import tempfile


class Capturing(object):
    """This context manager can be used to capture stdout from a function call.

    E.g.
        with Capturing() as captured:
            call_function()
        assert captured.output == expected_output
    """

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        return self

    def __exit__(self, *args):
        # Snapshot captured text, then restore the real stdout.
        self.output = self._stringio.getvalue()
        sys.stdout = self._stdout


class CheckParseArgs(object):
    def check_empty_args(self):
        """Check that parsing an empty args list results in printing a usage message, followed by sys.exit(0)"""
        try:
            with Capturing() as captured:
                parse_args([])
        except SystemExit as e:
            assert e.code == 0
        assert captured.output.find("usage") >= 0

    def check_version(self):
        """If --version is present, ducktape should print version and exit"""
        try:
            with Capturing() as captured:
                parse_args(["--version"])
        except SystemExit as e:
            assert e.code == 0
        assert re.search(r"[\d]+\.[\d]+\.[\d]+", captured.output) is not None

    def check_empty_test_path(self):
        """Check that default test_path is an array consisting of cwd."""
        args = ["--collect-only"]
        parsed = parse_args(args)
        parsed_paths = [os.path.abspath(p) for p in parsed["test_path"]]
        assert parsed_paths == [os.path.abspath(".")]

    def check_multiple_test_paths(self):
        """Check that parser properly handles multiple "test paths". It should capture a list of test paths."""
        paths = ["path1"]
        args = ["--debug"] + paths + ["--collect-only"]
        parsed = parse_args(args)
        assert parsed["test_path"] == paths

        paths = ["path%d" % i for i in range(10)]
        args = ["--cluster-file", "my-cluster-file"] + paths + ["--debug", "--exit-first"]
        parsed = parse_args(args)
        assert parsed["test_path"] == paths

    def check_multiple_exclude(self):
        excluded = ["excluded1", "excluded2"]
        args = ["--collect-only", "--exclude"] + excluded + ["--debug"]
        parsed = parse_args(args)
        assert parsed["exclude"] == excluded

    def check_config_overrides(self, monkeypatch):
        """Check that parsed arguments pick up values from config files, and that overrides match precedence."""
        tmpdir = tempfile.mkdtemp(dir="/tmp")

        # Create tmp file for global config
        project_cfg_filename = os.path.join(tmpdir, "ducktape-project.cfg")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")
        project_cfg = [
            "--cluster-file CLUSTERFILE-project",
            "--results-root RESULTSROOT-project",
            "--parameters PARAMETERS-project",
        ]
        # user_cfg options should override project_cfg
        user_cfg = ["--results-root RESULTSROOT-user", "--parameters PARAMETERS-user"]

        try:
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.PROJECT_CONFIG_FILE",
                project_cfg_filename,
            )
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.USER_CONFIG_FILE",
                user_cfg_filename,
            )
            with open(project_cfg_filename, "w") as project_f:
                project_f.write("\n".join(project_cfg))
            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))

            # command-line options should override user_cfg and project_cfg
            args_dict = parse_args(["--parameters", "PARAMETERS-commandline"])
            assert args_dict["cluster_file"] == "CLUSTERFILE-project"
            assert args_dict["results_root"] == "RESULTSROOT-user"
            assert args_dict["parameters"] == "PARAMETERS-commandline"
        finally:
            shutil.rmtree(tmpdir)

    def check_config_file_option(self):
        """Check that config file option works"""
        tmpdir = tempfile.mkdtemp(dir="/tmp")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")
        user_cfg = ["--results-root RESULTSROOT-user", "--parameters PARAMETERS-user"]
        try:
            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))
            args_dict = parse_args(["--config-file", user_cfg_filename])
            assert args_dict["results_root"] == "RESULTSROOT-user"
            assert args_dict["parameters"] == "PARAMETERS-user"
        finally:
            shutil.rmtree(tmpdir)

    def check_config_overrides_for_n_args(self, monkeypatch):
        """Check that parsed arguments pick up values from config files, and that overrides match precedence."""
        tmpdir = tempfile.mkdtemp(dir="/tmp")

        # Create tmp file for global config
        project_cfg_filename = os.path.join(tmpdir, "ducktape-project.cfg")
        user_cfg_filename = os.path.join(tmpdir, "ducktape-user.cfg")
        project_cfg = [
            "--exclude",
            "test1",
            "test2",
            "test3",
        ]
        # user_cfg options should override project_cfg
        user_cfg = [
            "test1",
            "test2",
            "test3",
        ]

        try:
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.PROJECT_CONFIG_FILE",
                project_cfg_filename,
            )
            monkeypatch.setattr(
                "ducktape.command_line.defaults.ConsoleDefaults.USER_CONFIG_FILE",
                user_cfg_filename,
            )
            with open(project_cfg_filename, "w") as project_f:
                project_f.write("\n".join(project_cfg))
            with open(user_cfg_filename, "w") as user_f:
                user_f.write("\n".join(user_cfg))

            # command-line options should override user_cfg and project_cfg
            args_dict = parse_args(["test1", "test2", "test3", "test4", "--max-parallel", "9000"])
            assert args_dict["test_path"] == ["test1", "test2", "test3", "test4"]
            assert args_dict["exclude"] == ["test1", "test2", "test3"]
            assert args_dict["max_parallel"] == 9000
        finally:
            shutil.rmtree(tmpdir)


================================================
FILE: tests/ducktape_mock.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple

from ducktape.cluster.cluster import Cluster
from ducktape.cluster.cluster_spec import ClusterSpec
from ducktape.cluster.consts import LINUX
from ducktape.cluster.node_container import NodeContainer
from ducktape.tests.session import SessionContext
from ducktape.tests.test import Test
from ducktape.tests.test_context import TestContext
from ducktape.cluster.linux_remoteaccount import LinuxRemoteAccount
from ducktape.cluster.remoteaccount import RemoteAccountSSHConfig
from unittest.mock import MagicMock
import os
import tempfile


def mock_cluster():
    """Return a MagicMock standing in for a 3-node cluster."""
    return MagicMock(
        all=lambda: [MagicMock(spec=ClusterSpec)] * 3,
        max_used=lambda: 3,
        max_used_nodes=3,
    )


class FakeClusterNode(object):
    # Minimal node stand-in: only the OS attribute is needed by NodeContainer.
    @property
    def operating_system(self):
        return LINUX


class FakeCluster(Cluster):
    """A cluster class with counters, but no actual node objects"""

    def __init__(self, num_nodes):
        super(FakeCluster, self).__init__()
        self._available_nodes = NodeContainer()
        for i in range(0, num_nodes):
            self._available_nodes.add_node(FakeClusterNode())
        self._in_use_nodes = NodeContainer()

    def do_alloc(self, cluster_spec):
        # Move matching nodes from available to in-use; nodes that don't match
        # the spec are simply not allocated.
        good_nodes, bad_nodes = self._available_nodes.remove_spec(cluster_spec)
        self._in_use_nodes.add_nodes(good_nodes)
        return good_nodes

    def free_single(self, node):
        self._in_use_nodes.remove_node(node)
        self._available_nodes.add_node(node)

    def available(self):
        return ClusterSpec.from_nodes(self._available_nodes)

    def used(self):
        return ClusterSpec.from_nodes(self._in_use_nodes)


def session_context(**kwargs):
    """Return a SessionContext object"""
    # Create a throwaway results_dir unless the caller supplied one.
    if "results_dir" not in kwargs.keys():
        tmp = tempfile.mkdtemp()
        session_dir = os.path.join(tmp, "test_dir")
        os.mkdir(session_dir)
        kwargs["results_dir"] = session_dir
    return SessionContext(session_id="test_session", **kwargs)


class TestMockTest(Test):
    # Trivial test method used as the `function` target of mocked TestContexts.
    def mock_test(self):
        pass


# NOTE(review): both defaults below are evaluated once at import time, so every
# call without arguments shares the same session context and mock cluster (and a
# temp dir is created on import). This may be intentional for these mocks —
# confirm before changing.
def test_context(session_context=session_context(), cluster=mock_cluster()):
    """Return a TestContext object"""
    return TestContext(
        session_context=session_context,
        file="tests/ducktape_mock.py",
        module=__name__,
        cls=TestMockTest,
        function=TestMockTest.mock_test,
        cluster=cluster,
    )


class MockNode(object):
    """Mock cluster node"""

    def __init__(self):
        self.account = MockAccount()


class MockAccount(LinuxRemoteAccount):
    """Mock node.account object. It's Linux because tests are run in Linux."""

    def __init__(self, **kwargs):
        ssh_config = RemoteAccountSSHConfig(host="localhost", user=None, hostname="localhost", port=22)
        super(MockAccount, self).__init__(ssh_config, externally_routable_ip="localhost", logger=None, **kwargs)


class MockSender(MagicMock):
    # Records every send() call as an (args, kwargs) tuple for later inspection.
    send_results: List[Tuple]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.send_results = []

    def send(self, *args, **kwargs):
        self.send_results.append((args, kwargs))


================================================
FILE: tests/loader/__init__.py
================================================


================================================
FILE: tests/loader/check_loader.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.tests.loader import TestLoader, LoaderException, _requests_session
import tests.ducktape_mock

import os
import os.path
import pytest
import re
import requests
import tempfile
import yaml

from mock import Mock
from requests_testadapter import Resp


class LocalFileAdapter(requests.adapters.HTTPAdapter):
    """requests adapter that serves file:// URLs from the local filesystem."""

    def build_response_from_file(self, request):
        # Strip the "file://" prefix and read the file's bytes into the response.
        file_path = request.url[7:]
        with open(file_path, "rb") as file:
            buff = bytearray(os.path.getsize(file_path))
            file.readinto(buff)
            resp = Resp(buff)
            r = self.build_response(request, resp)
            return r

    def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None):
        return self.build_response_from_file(request)


def resources_dir():
    """Absolute path to the loader test resources directory."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources")


def discover_dir():
    """Return the absolute path to the directory to use with discovery tests."""
    return os.path.join(resources_dir(), "loader_test_directory")


def sub_dir_a():
    return os.path.join(discover_dir(), "sub_dir_a")


def num_tests_in_file(fpath):
    """Count expected number of tests in the file.

    Search for NUM_TESTS = N

    return N if pattern is present else 0
    """
    with open(fpath, "r") as fd:
        match = re.search(r"^NUM_TESTS\s*=\s*(\d+)", fd.read(), re.MULTILINE)
        if not match:
            return 0
        return int(match.group(1))


def num_tests_in_dir(dpath):
    """Walk through directory subtree and count up expected number of tests that TestLoader should find."""
    assert os.path.exists(dpath)
    assert os.path.isdir(dpath)

    num_tests = 0
    for pwd, dirs, files in os.walk(dpath):
        for f in files:
            if not f.endswith(".py"):
                continue
            file_path = os.path.abspath(os.path.join(pwd, f))
            num_tests += num_tests_in_file(file_path)
    return num_tests


def invalid_test_suites():
    """Collect every .yml under invalid_test_suites/ as a pytest param."""
    dpath = os.path.join(discover_dir(), "invalid_test_suites")
    params = []
    for pwd, dirs, files in os.walk(dpath):
        for f in files:
            if not f.endswith(".yml"):
                continue
            file_path = os.path.abspath(os.path.join(pwd, f))
            params.append(pytest.param(file_path, id=os.path.basename(file_path)))
    return params


class CheckTestLoader(object):
    def setup_method(self, method):
        self.SESSION_CONTEXT = tests.ducktape_mock.session_context()
        # To simplify unit tests, add file:// support to the test loader's functionality for loading previous
        # report.json files
        _requests_session.mount("file://", LocalFileAdapter())

    @pytest.mark.parametrize("suite_file_path", invalid_test_suites())
    def check_test_loader_raises_on_invalid_test_suite(self, suite_file_path):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException):
            loader.load([suite_file_path])

    # NOTE(review): several of the params below share the id "self load in import"
    # even though they cover distinct scenarios (cyclic imports, parent reference);
    # consider giving each a distinct id.
    @pytest.mark.parametrize(
        ["expected_count", "input_symbols", "excluded_symbols"],
        [
            pytest.param(
                6,
                [
                    # import a single test suite with an import statement
                    os.path.join(discover_dir(), "test_suite_with_single_import.yml")
                ],
                None,
                id="load dependency path",
            ),
            pytest.param(
                2,
                [
                    # test that a self import doesn't cause an infinite loop
                    os.path.join(discover_dir(), "test_suite_with_self_import.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test that a cyclic import doesn't cause an infinite loop
                    os.path.join(discover_dir(), "test_suite_cyclic_b.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test that a cyclic import starting with second import
                    os.path.join(discover_dir(), "test_suite_cyclic_a.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                5,
                [
                    # test single import statement with parent reference
                    os.path.join(discover_dir(), "test_suites", "sub_dir_test_import.yml")
                ],
                None,
                id="self load in import",
            ),
            pytest.param(
                8,
                [
                    # see test suite files for number of tests in it.
                    # decorated test suite includes 2 tests;
                    # single includes 4 tests; multiple includes 3 tests;
                    # however both single and multiple test suites include one test method of test_by.py,
                    # so total -= 1
                    os.path.join(discover_dir(), "test_suite_single.yml"),
                    os.path.join(discover_dir(), "test_suite_multiple.yml"),
                    os.path.join(discover_dir(), "test_suite_decorated.yml"),
                ],
                None,
                id="load multiple test suite files",
            ),
            pytest.param(
                5,
                [
                    # see test suite file for number of tests in it
                    os.path.join(discover_dir(), "test_suites", "test_suite_glob.yml")
                ],
                None,
                id="load test suite with globs",
            ),
            pytest.param(
                2,
                [
                    # test suite that includes sub_dir_a/test_c.py (1 test total):
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    # explicitly include test_a.yml (1 test total)
                    os.path.join(discover_dir(), "test_a.py"),
                ],
                None,
                id="load both file and suite",
            ),
            pytest.param(
                1,
                [
                    # test suite that includes sub_dir_a/test_c.py (1 test total):
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    # explicitly include test_a.yml (1 test total)
                    os.path.join(discover_dir(), "test_a.py"),
                ],
                [
                    # explicitly exclude the sub_dir_a/test_c.py (included with test suite):
                    os.path.join(sub_dir_a(), "test_c.py"),
                ],
                id="global exclude overrides test suite include",
            ),
            pytest.param(
                4,
                [
                    # sub_dir_a contains 4 total tests
                    # test suite that includes sub_dir_a/*.py but excludes sub_dir_a/test_d.py:
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_with_exclude.yml"),
                    # explicitly include sub_dir_a/test_d.py to override exclusion from test suite:
                    os.path.join(sub_dir_a(), "test_d.py"),
                ],
                None,
                id="global include overrides test suite exclude",
            ),
            pytest.param(
                1,
                [
                    # load two test suites and two files that all point to the same actual test
                    # and verify that in the end only 1 test has been loaded
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c.yml"),
                    os.path.join(discover_dir(), "test_suites", "sub_dir_a_test_c_via_class.yml"),
                    os.path.join(sub_dir_a(), "test_c.py"),
                    os.path.join(sub_dir_a(), "test_c.py::TestC"),
                ],
                None,
                id="same test in test suites and test files",
            ),
        ],
    )
    def check_test_loader_with_test_suites_and_files(self, expected_count, input_symbols, excluded_symbols):
        """
        When both files and test suites are loaded, files (both included and excluded) are loaded after
        and separately from the test suites, so even if a test suite excludes file A, it will be included
        if it's passed directly. And if file A is excluded directly, even if any of the test suites includes it,
        it will still be excluded.
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(input_symbols, excluded_test_symbols=excluded_symbols)
        assert len(tests) == expected_count

    def check_test_loader_with_directory(self):
        """Check discovery on a directory."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()])
        assert len(tests) == num_tests_in_dir(discover_dir())

    @pytest.mark.parametrize(
        ["dir_", "file_name"],
        [
            pytest.param(discover_dir(), "test_a.py"),
            pytest.param(resources_dir(), "a.py"),
        ],
    )
    def check_test_loader_with_file(self, dir_, file_name):
        """Check discovery on a file."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        module_path = os.path.join(dir_, file_name)

        tests = loader.load([module_path])
        assert len(tests) == num_tests_in_file(module_path)

    def check_test_loader_with_glob(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_glob = os.path.join(discover_dir(), "*_a.py")
        # should resolve to test_a.py only
        tests = loader.load([file_glob])
        assert len(tests) == 1

    def check_test_loader_multiple_files(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_a = os.path.join(discover_dir(), "test_a.py")
        file_b = os.path.join(discover_dir(), "test_b.py")
        tests = loader.load([file_a, file_b])
        assert len(tests) == num_tests_in_file(file_a) + num_tests_in_file(file_b)

    def check_test_loader_include_dir_exclude_file(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        excluded_file_a = os.path.join(discover_dir(), "test_a.py")
        excluded_file_b = os.path.join(discover_dir(), "test_b.py")
        num_excluded = num_tests_in_file(excluded_file_a) + num_tests_in_file(excluded_file_b)
        tests = loader.load([discover_dir()], [excluded_file_a, excluded_file_b])
        assert len(tests) == num_tests_in_dir(discover_dir()) - num_excluded

    def check_test_loader_exclude_subdir(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included_dir = discover_dir()
        excluded_dir = sub_dir_a()
        tests = loader.load([included_dir], [excluded_dir])
        assert len(tests) == num_tests_in_dir(included_dir) - num_tests_in_dir(excluded_dir)

    def check_test_loader_exclude_subdir_glob(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included_dir = discover_dir()
        excluded_dir = sub_dir_a()
        excluded_dir_glob = os.path.join(sub_dir_a(), "*.py")
        tests = loader.load([included_dir], [excluded_dir_glob])
        assert len(tests) == num_tests_in_dir(included_dir) - num_tests_in_dir(excluded_dir)

    def check_test_loader_raises_when_nothing_is_included(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        file_a = os.path.join(discover_dir(), "test_a.py")
        file_b = os.path.join(discover_dir(), "test_b.py")
        with pytest.raises(LoaderException):
            loader.load([file_a, file_b], [discover_dir()])

    def check_test_loader_raises_on_include_subdir_exclude_parent_dir(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        with pytest.raises(LoaderException):
            loader.load([(sub_dir_a())], [(discover_dir())])

    def check_test_loader_with_nonexistent_file(self):
        """Check discovery on a non-existent path should throw LoaderException"""
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "file_that_does_not_exist.py")])

    def check_test_loader_include_dir_without_tests(self):
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "sub_dir_no_tests")])

    def check_test_loader_include_file_without_tests(self):
        with pytest.raises(LoaderException):
            loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
            loader.load([os.path.join(discover_dir(), "sub_dir_no_tests", "just_some_file.py")])

    def check_test_loader_allow_exclude_dir_without_tests(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()], [os.path.join(discover_dir(), "sub_dir_no_tests")])
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_allow_exclude_file_without_tests(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(
            [discover_dir()],
            [os.path.join(discover_dir(), "sub_dir_no_tests", "just_some_file.py")],
        )
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_allow_exclude_nonexistent_file(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load(
            [discover_dir()],
            [os.path.join(discover_dir(), "file_that_does_not_exist.py")],
        )
        assert len(tests) == num_tests_in_dir(discover_dir())

    def check_test_loader_with_class(self):
        """Check test discovery with discover class syntax."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([os.path.join(discover_dir(), "test_b.py::TestBB")])
        assert len(tests) == 2

        # Sanity check, test that it discovers two test class & 3 tests if it searches the whole module
        tests = loader.load([os.path.join(discover_dir(), "test_b.py")])
        assert len(tests) == 3

    def check_test_loader_include_dir_exclude_class(self):
        """Check test discovery with discover class syntax."""
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        tests = loader.load([discover_dir()], [os.path.join(discover_dir(), "test_b.py::TestBB")])
        # TestBB contains 2 test methods
        assert len(tests) == num_tests_in_dir(discover_dir()) - 2

    def check_test_loader_include_class_exclude_method(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        included = [os.path.join(discover_dir(), "test_b.py::TestBB")]
        excluded = [os.path.join(discover_dir(), "test_b.py::TestBB.test_bb_one")]
        tests = loader.load(included, excluded)
        # TestBB contains 2 test methods, but 1 is excluded
        assert len(tests) == 1

    def check_test_loader_include_dir_exclude_method(self):
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        excluded = [os.path.join(discover_dir(), "test_b.py::TestBB.test_bb_one")]
        tests = loader.load([discover_dir()], excluded)
        # excluded 1 method
only assert len(tests) == num_tests_in_dir(discover_dir()) - 1 def check_test_loader_with_matrix_params(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) included = [ os.path.join( discover_dir(), 'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}', ) ] tests = loader.load(included) # TestMatrix contains a single parametrized method. Since we only provide a single set of params, we should # end up with a single context assert len(tests) == 1 assert tests[0].injected_args == {"x": 1, "y": "test "} def check_test_loader_with_params_special_chars(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) included = [ os.path.join( discover_dir(), r"test_decorated.py::TestParametrizdeSpecial.test_special_characters_params" r'@{"version": "6.1.0", "chars": "!@#$%^&*()_+::.,/? \"{}\\"}', ) ] tests = loader.load(included) # TestMatrix contains a single parametrized method. Since we only provide a single set of params, we should # end up with a single context assert len(tests) == 1 assert tests[0].injected_args == { "version": "6.1.0", "chars": '!@#$%^&*()_+::.,/? "{}\\', } def check_test_loader_with_multiple_matrix_params(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) params = '[{"x": 1,"y": "test "}, {"x": 2,"y": "I\'m"}]' included = [ os.path.join( discover_dir(), "test_decorated.py::TestMatrix.test_thing@{}".format(params), ) ] tests = loader.load(included) # TestMatrix contains a single parametrized method. 
# We provide two sets of params, so we should end up with two contexts assert len(tests) == 2 injected_args = map(lambda t: t.injected_args, tests) assert {"x": 1, "y": "test "} in injected_args assert {"x": 2, "y": "I'm"} in injected_args def check_test_loader_with_parametrize(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) included = [ os.path.join( discover_dir(), 'test_decorated.py::TestParametrized.test_thing@{"x":1,"y":2}', ) ] tests = loader.load(included) assert len(tests) == 1 assert tests[0].injected_args == {"x": 1, "y": 2} def check_test_loader_with_parametrize_with_objects(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) parameters = '{"d": {"a": "A"}, "lst": ["whatever"]}' included = [ os.path.join( discover_dir(), "test_decorated.py::TestObjectParameters.test_thing@{}".format(parameters), ) ] tests = loader.load(included) assert len(tests) == 1 assert tests[0].injected_args == {"d": {"a": "A"}, "lst": ["whatever"]} def check_test_loader_with_injected_args(self): """When the --parameters command-line option is used, the loader behaves a little bit differently: each test method annotated with @parametrize or @matrix should only expand to a single discovered test, and the injected args should be those passed in from command-line. 
""" # parameter values don't have to match any of the pre-defined parameters in the annotation # moreover, even parameter keys don't have to match method arguments, though if that's the case # the runner will complain, but the loader wouldn't care (this has been ducktape's behavior for a while now) injected_args = {"x": 100, "y": -100} loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args) file = os.path.join(discover_dir(), "test_decorated.py") tests = loader.load([file]) assert len(tests) == 6 for t in tests: assert t.injected_args == injected_args def check_test_loader_raises_with_both_injected_args_and_parameters(self): """One should not use both --parameters command-line flag and parameterized test symbols at the same time. Loader will explicitly raise in such cases to avoid confusing behavior. """ injected_args = {"x": 1, "y": "test "} loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args) included = [ os.path.join( discover_dir(), 'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}', ) ] with pytest.raises(LoaderException, match="Cannot use both"): loader.load(included) def check_test_loader_raises_on_params_not_found(self): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) # parameter syntax is valid, but there is no such parameter defined in the test annotation in the code included = [ os.path.join( discover_dir(), 'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "missing"}', ) ] with pytest.raises(LoaderException, match="No tests to run"): loader.load(included) @pytest.mark.parametrize( "symbol", [ # no class "test_decorated.py::.test_thing" # no method 'test_decorated.py::TestMatrix@{"x": 1, "y": "test "}' # invalid json in params 'test_decorated.py::TestMatrix.test_thing@{x: 1,"y": "test "}' ], ) def check_test_loader_raises_on_malformed_test_discovery_symbol(self, symbol): loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) included = 
[os.path.join(discover_dir(), symbol)]
        with pytest.raises(LoaderException, match="Invalid discovery symbol"):
            loader.load(included)

    def check_test_loader_exclude_with_injected_args(self):
        """Exclusion symbols should apply even when --parameters injected args are in effect."""
        injected_args = {"x": 1, "y": -1}
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), injected_args=injected_args)
        included = [os.path.join(discover_dir(), "test_decorated.py")]
        excluded = [os.path.join(discover_dir(), "test_decorated.py::TestStackedMatrix")]
        tests = loader.load(included, excluded)
        # test_decorated.py contains 6 test methods total, and with injected_args each
        # method expands to exactly one discovered test; we exclude 1 class with 1
        # method, so 5 tests remain.
        # exclusion shouldn't care about injected args
        assert len(tests) == 5
        for t in tests:
            assert t.injected_args == injected_args

    def check_test_loader_exclude_with_params(self):
        """
        Checks behavior of exclude flag with parametrized annotations.
        Should exclude only a single permutation of the method
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        # included 8 tests
        included = [os.path.join(discover_dir(), "test_decorated.py::TestMatrix")]
        # exclude 1 test
        excluded = [
            os.path.join(
                discover_dir(),
                'test_decorated.py::TestMatrix.test_thing@{"x": 1,"y": "test "}',
            )
        ]
        tests = loader.load(included, excluded)
        assert len(tests) == 7

    def check_test_loader_exclude_with_params_multiple(self):
        """
        Checks behavior of exclude flag with parametrized annotations.
        Should exclude two permutations of the method
        """
        loader = TestLoader(self.SESSION_CONTEXT, logger=Mock())
        # include 8 tests
        included = [os.path.join(discover_dir(), "test_decorated.py::TestMatrix")]
        # exclude 2 tests
        params = '[{"x": 1,"y": "test "}, {"x": 2,"y": "I\'m"}]'
        excluded = [
            os.path.join(
                discover_dir(),
                "test_decorated.py::TestMatrix.test_thing@{}".format(params),
            )
        ]
        tests = loader.load(included, excluded)
        assert len(tests) == 6

    def check_test_loader_with_subsets(self):
        """Check that computation of subsets work properly. This validates both that the division of tests is correct (i.e.
as even a distribution as we can get but uneven in the expected way when necessary) and that the division happens after the expansion of tests marked for possible expansion (e.g. matrix, parametrize).""" file = os.path.join(discover_dir(), "test_decorated.py") # The test file contains 18 tests. With 4 subsets, first two subset should have an "extra" loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=0, subsets=4) tests = loader.load([file]) assert len(tests) == 5 loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=1, subsets=4) tests = loader.load([file]) assert len(tests) == 5 loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=2, subsets=4) tests = loader.load([file]) assert len(tests) == 4 loader = TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=3, subsets=4) tests = loader.load([file]) assert len(tests) == 4 def check_test_loader_with_invalid_subsets(self): """Check that the TestLoader throws an exception if the requests subset is larger than the number of subsets""" with pytest.raises(ValueError): TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=4, subsets=4) with pytest.raises(ValueError): TestLoader(self.SESSION_CONTEXT, logger=Mock(), subset=5, subsets=4) def check_test_loader_with_time_based_subsets(self): """Check that computation of subsets using a report with timing information correctly generates subsets that are optimized based on timing rather than number of tests. """ file = os.path.join(discover_dir(), "test_b.py") report_url = "file://" + os.path.join(resources_dir(), "report.json") # The expected behavior of the current implementation is to add tests to each subset from largest to smallest, # using the least full subset each time. 
The test data with times of (10, 5, 1) should result in the first # subset containing 1 test and the second containing 2 (the opposite of the simple count-based strategy) loader = TestLoader( self.SESSION_CONTEXT, logger=Mock(), subset=0, subsets=2, historical_report=report_url, ) tests = loader.load([file]) assert len(tests) == 1 loader = TestLoader( self.SESSION_CONTEXT, logger=Mock(), subset=1, subsets=2, historical_report=report_url, ) tests = loader.load([file]) assert len(tests) == 2 def check_loader_with_non_yml_file(self): """ test loading a test file as an import """ file = os.path.join(discover_dir(), "test_suite_import_py.yml") loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) with pytest.raises(LoaderException, match=r"Failed to load test suite from file: \S+test_a\.py"): loader.load([file]) def check_loader_with_non_suite_yml_file(self): """ test importing a suite that is malformed """ file1 = os.path.join(discover_dir(), "test_suite_malformed.yml") file2 = os.path.join(discover_dir(), "test_suite_import_malformed.yml") loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) with pytest.raises(LoaderException, match="No tests found in simple_malformed_suite"): loader.load([file1]) with pytest.raises(LoaderException, match="No tests found in simple_malformed_suite"): loader.load([file2]) def check_test_loader_with_absolute_path(self): """ Test loading suites using absolute paths to other imported suites as well as absolute paths to tests in the suite """ with tempfile.TemporaryDirectory() as td: temp_suite1 = os.path.join(td, "temp_suite1.yml") temp_suite2 = os.path.join(td, "temp_suite2.yml") with open(temp_suite1, "w") as f: test_yaml1 = yaml.dump( { "import": str(os.path.join(td, "temp_suite2.yml")), "suite": [os.path.abspath(os.path.join(discover_dir(), "test_a.py"))], } ) f.write(test_yaml1) with open(temp_suite2, "w") as f: test_yaml2 = yaml.dump({"suite": [os.path.abspath(os.path.join(discover_dir(), "test_b.py"))]}) f.write(test_yaml2) 
loader = TestLoader(self.SESSION_CONTEXT, logger=Mock()) tests = loader.load([temp_suite1]) assert len(tests) == 4 def join_parsed_symbol_components(parsed): """ Join together a parsed symbol e.g. { 'path': 'path/to/dir/test_file.py', 'cls': 'ClassName', 'method': 'method' }, -> 'path/to/dir/test_file.py::ClassName.method' """ symbol = os.path.join(parsed["path"]) if parsed["cls"] or parsed["method"]: symbol += "::" symbol += parsed["cls"] if parsed["method"]: symbol += "." symbol += parsed["method"] return symbol def normalize_ending_slash(dirname): if dirname.endswith(os.path.sep): dirname = dirname[: -len(os.path.sep)] return dirname class CheckParseSymbol(object): def check_parse_discovery_symbol(self): """Check that "test discovery symbol" parsing logic works correctly""" parsed_symbols = [ {"path": "path/to/dir", "cls": "", "method": ""}, {"path": "path/to/dir/test_file.py", "cls": "", "method": ""}, {"path": "path/to/dir/test_file.py", "cls": "ClassName", "method": ""}, { "path": "path/to/dir/test_file.py", "cls": "ClassName", "method": "method", }, {"path": "path/to/dir", "cls": "ClassName", "method": ""}, {"path": "test_file.py", "cls": "", "method": ""}, {"path": "test_file.py", "cls": "ClassName", "method": ""}, {"path": "test_file.py", "cls": "ClassName", "method": "method"}, ] loader = TestLoader(tests.ducktape_mock.session_context(), logger=Mock()) for parsed in parsed_symbols: symbol = join_parsed_symbol_components(parsed) expected_parsed = ( normalize_ending_slash(parsed["path"]), parsed["cls"], parsed["method"], ) actually_parsed = loader._parse_discovery_symbol(symbol) actually_parsed = ( normalize_ending_slash(actually_parsed[0]), actually_parsed[1], actually_parsed[2], ) assert actually_parsed == expected_parsed, "%s did not parse as expected" % symbol ================================================ FILE: tests/loader/resources/__init__.py ================================================ ================================================ FILE: 
tests/loader/resources/loader_test_directory/README ================================================ A dummy test directory structure for checking test discovery behavior ================================================ FILE: tests/loader/resources/loader_test_directory/__init__.py ================================================ ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/empty_file.yml ================================================ ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/malformed_test_suite.yml ================================================ malformed_test_suite: included: should_have_been_a_list: - but - is - a - dict ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/not_yaml.yml ================================================ this is not a yaml file No, really ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suite_refers_to_non_existent_file.yml ================================================ non_existent_file_suit: included: - file_does_not_exist.py ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suite_with_malformed_params.yml ================================================ non_existent_file_suit: included: - 'test_decorated.py::TestMatrix.test_thing@[{x: 1,y: "test "}]' # this json won't parse because no quotes ================================================ FILE: tests/loader/resources/loader_test_directory/invalid_test_suites/test_suites_with_no_tests.yml ================================================ empty_test_suite_a: empty_test_suite_b: excluded: - ../sub_dir_a ================================================ FILE: 
tests/loader/resources/loader_test_directory/name_does_not_match_pattern.py ================================================ from ducktape.tests.test import Test class TestNotLoaded(Test): """Loader should not discover this - module name does not match default pattern.""" def test_a(self): pass ================================================ FILE: tests/loader/resources/loader_test_directory/sub_dir_a/__init__.py ================================================ ================================================ FILE: tests/loader/resources/loader_test_directory/sub_dir_a/test_c.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.tests.test import Test NUM_TESTS = 1 class TestC(Test): """Loader should discover this.""" def test(self): pass class TestInvisible(object): """Loader should not discover this.""" def test_invisible(self): pass ================================================ FILE: tests/loader/resources/loader_test_directory/sub_dir_a/test_d.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.tests.test import Test NUM_TESTS = 3 class TestD(Test): """Loader should discover this.""" def test_d(self): pass def test_dd(self): pass def ddd_test(self): pass class TestInvisible(object): """Loader should not discover this.""" def test_invisible(self): pass ================================================ FILE: tests/loader/resources/loader_test_directory/sub_dir_no_tests/__init__.py ================================================ ================================================ FILE: tests/loader/resources/loader_test_directory/sub_dir_no_tests/just_some_file.py ================================================ class JustSomeClass(object): pass ================================================ FILE: tests/loader/resources/loader_test_directory/test_a.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.tests.test import Test NUM_TESTS = 1 class TestA(Test): """Loader should discover this.""" def test_a(self): pass class TestInvisible(object): """Loader should not discover this.""" def test_invisible(self): pass ================================================ FILE: tests/loader/resources/loader_test_directory/test_b.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.tests.test import Test NUM_TESTS = 3 class TestB(Test): """Loader should discover this.""" def test_b(self): pass class TestBB(Test): """Loader should discover this with 2 tests.""" test_not_callable = 3 def test_bb_one(self): pass def bb_two_test(self): pass def other_method(self): pass class TestInvisible(object): """Loader should not discover this.""" def test_invisible(self): pass ================================================ FILE: tests/loader/resources/loader_test_directory/test_decorated.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from ducktape.tests.test import Test from ducktape.mark import matrix from ducktape.mark import parametrize NUM_TESTS = 18 class TestMatrix(Test): """8 tests here""" @matrix(x=[1, 2], y=["I'm", " a ", "test ", "matrix!"]) def test_thing(self, x, y): pass class TestStackedMatrix(Test): """4 tests""" @matrix(x=[1, 2], y=[-1, 0]) def test_thing(self, x, y): pass class TestParametrized(Test): @parametrize(x=10) def test_single_decorator(self, x=1, y="hi"): """1 test""" pass @parametrize(x=1, y=2) @parametrize(x="abc", y=[]) def test_thing(self, x, y): """2 tests""" pass class TestParametrizdeSpecial(Test): @parametrize(version="6.1.0", chars='!@#$%^&*()_+::.,/? "{}\\') def test_special_characters_params(self, version, chars): """1 tests""" pass class TestObjectParameters(Test): @parametrize(d={"a": "A"}, lst=["whatever"]) @parametrize(d={"z": "Z"}, lst=["something"]) def test_thing(self, d, lst): """2 tests""" pass ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_cyclic_a.yml ================================================ import: - test_suite_cyclic_b.yml # 1 test test_suite_cyclic_a: - ./sub_dir_a/test_c.py # 1 test - ./test_b.py # 3 # suite total: 4 # total: 5 ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_cyclic_b.yml ================================================ import: - test_suite_cyclic_a.yml # 4 test test_suite_cyclic_b: - test_a.py # 1 test # suite total: 1 test # total: 5 tests ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_decorated.yml ================================================ test_suite_decorated: - 'test_decorated.py::TestMatrix.test_thing@[{"x": 1,"y": "test "}, {"x": 2, "y": "test "}]' # 2 tests - 'test_decorated.py::TestStackedMatrix.test_thing@{"x": 
100, "y": 100}' # ignored, there are no such params in code # total: 2 tests ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_import_malformed.yml ================================================ import: - test_suite_malformed.yml simple_suite: - test_a.py # 1 ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_import_py.yml ================================================ import: - test_a.py # improper import simple_suite: - test_a.py # 1 test ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_malformed.yml ================================================ simple_malformed_suite: cheese: - 'fatty milk' - 'salt' bread: - 'flower' - 'water' - 'yeast' ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_multiple.yml ================================================ test_suite_b: - sub_dir_a/test_c.py # 1 test - sub_dir_a/test_d.py::TestD.test_d # 1 test # suite total: 2 tests test_suite_c: included: - sub_dir_a # +4 tests across all files - test_b.py # +3 tests excluded: - sub_dir_a/test_d.py # -3 tests - test_b.py::TestBB # -2 tests # suite total: 2 tests # total: 4 # - test_c.py (with 1 test method) included in both test suites so total -= 1 # total: 3 ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_single.yml ================================================ test_suite_a: included: - test_a.py # 1 test - test_b.py::TestBB.test_bb_one # 1 test - sub_dir_a # sub_dir_a/test_c.py = 0 (excluded) # sub_dir_a/test_d.py = 3 - 1 (excluded) = 2 excluded: - sub_dir_a/test_c.py # 1 test - sub_dir_a/test_d.py::TestD.test_dd # 1 test # total: 4 tests ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_with_self_import.yml 
================================================ test_suite_d: included: - sub_dir_a # +4 tests across all files - test_b.py # +3 tests excluded: - sub_dir_a/test_d.py # -3 tests - test_b.py::TestBB # -2 tests # suite total: 2 tests import: - ./test_suite_with_self_import.yml # +0 test ================================================ FILE: tests/loader/resources/loader_test_directory/test_suite_with_single_import.yml ================================================ test_suite_d: included: - sub_dir_a # +4 tests across all files - test_b.py # +3 tests excluded: - sub_dir_a/test_d.py # -3 tests - test_b.py::TestBB # -2 tests # suite total: 6 tests import: - ./test_suite_single.yml # +4 test ================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/refers_to_parent_dir.yml ================================================ test_suite_a: included: - ../test_a.py # 1 test # total: 1 tests ================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_test_c.yml ================================================ test_suite: - ../sub_dir_a/test_c.py ================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_test_c_via_class.yml ================================================ test_suite: - ../sub_dir_a/test_c.py::TestC ================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_a_with_exclude.yml ================================================ test_suite: included: - ../sub_dir_a/*.py excluded: - ../sub_dir_a/test_d.py ================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/sub_dir_test_import.yml ================================================ import: - ../test_suite_cyclic_a.yml # 5 tests - .././test_suites/../sub_dir_no_tests/.././test_suite_cyclic_b.yml # 0 tests # 5 test total 
================================================ FILE: tests/loader/resources/loader_test_directory/test_suites/test_suite_glob.yml ================================================ test_suite_glob: included: - ../sub_dir_a/* - ../test_?.py excluded: - ../sub_dir_a/*_d.py # globs expand to: # included: # sub_dir_a/test_c.py (1 test) # sub_dir_a/test_d.py (3 tests) - excluded # test_a.py (1 test) # test_b.py (3 tests) # excluded: # sub_dir_a/test_d.py (3 tests) # TOTAL: # 5 tests ================================================ FILE: tests/loader/resources/report.json ================================================ { "results": [ { "test_id": "tests.loader.resources.loader_test_directory.test_b.TestB.test_b", "run_time_seconds": 10 }, { "test_id": "tests.loader.resources.loader_test_directory.test_b.TestBB.test_bb_one", "run_time_seconds": 5 }, { "test_id": "tests.loader.resources.loader_test_directory.test_b.TestBB.bb_two_test", "run_time_seconds": 1 } ] } ================================================ FILE: tests/logger/__init__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/logger/check_logger.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import psutil import shutil import tempfile from ducktape.tests.loggermaker import LoggerMaker, close_logger class DummyFileLoggerMaker(LoggerMaker): def __init__(self, log_dir, n_handles): """Create a logger with n_handles file handles, with files in log_dir""" self.log_dir = log_dir self.n_handles = n_handles @property def logger_name(self): return "a.b.c" def configure_logger(self): for i in range(self.n_handles): fh = logging.FileHandler(os.path.join(self.log_dir, "log-" + str(i))) self._logger.addHandler(fh) def open_files(): # current process p = psutil.Process() return p.open_files() class CheckLogger(object): def setup_method(self, _): self.temp_dir = tempfile.mkdtemp() def check_close_logger(self): """Check that calling close_logger properly cleans up resources.""" initial_open_files = open_files() n_handles = 100 log_maker = DummyFileLoggerMaker(self.temp_dir, n_handles) # accessing logger attribute lazily triggers configuration of logger the_logger = log_maker.logger assert len(open_files()) == len(initial_open_files) + n_handles close_logger(the_logger) assert len(open_files()) == len(initial_open_files) def teardown_method(self, _): if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) ================================================ FILE: tests/mark/__init__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/mark/check_cluster_use_metadata.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.cluster import LocalhostCluster
from ducktape.mark import parametrize, matrix, ignore, defaults
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.mark.resource import cluster
from ducktape.cluster.cluster_spec import ClusterSpec

import pytest

from tests import ducktape_mock


class CheckClusterUseAnnotation(object):
    """Checks for the @cluster annotation, which attaches cluster-use metadata
    (node counts or a full ClusterSpec) to the test cases expanded below it.
    """

    def check_basic_usage_arbitrary_metadata(self):
        """Arbitrary keyword metadata passed to @cluster is preserved verbatim on the expanded context."""
        cluster_use_metadata = {"a": 2, "b": "hi", "num_nodes": 300}

        @cluster(**cluster_use_metadata)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].cluster_use_metadata == cluster_use_metadata

    def check_basic_usage_cluster_spec(self):
        """@cluster(cluster_spec=...) populates expected_cluster_spec on the expanded context."""
        num_nodes = 200

        @cluster(cluster_spec=ClusterSpec.simple_linux(num_nodes))
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert len(test_context_list[0].expected_cluster_spec.nodes) == num_nodes
        # All nodes should be linux
        for node in test_context_list[0].expected_cluster_spec.nodes.elements():
            assert node.operating_system == "linux"

    def check_basic_usage_num_nodes(self):
        """@cluster(num_nodes=...) populates expected_num_nodes on the expanded context."""
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    @pytest.mark.parametrize("has_annotation", [True, False])
    def check_empty_cluster_annotation(self, fail_greedy_tests, has_annotation):
        """An empty @cluster() — or no annotation at all — either claims the whole
        cluster, or (with fail_greedy_tests) expands to a zero-node failure marker.
        """

        @cluster()
        def function_with_annotation():
            return "hi"

        def function_no_annotation():
            return "hello"

        assert hasattr(function_with_annotation, "marks")
        assert not hasattr(function_no_annotation, "marks")

        # no annotation and empty annotation behave identically as far as this functionality is concerned
        function = function_with_annotation if has_annotation else function_no_annotation

        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)
        tc_list = MarkedFunctionExpander(
            function=function, cluster=mock_cluster, session_context=session_context
        ).expand()
        assert len(tc_list) == 1
        if fail_greedy_tests:
            assert tc_list[0].expected_num_nodes == 0
            assert tc_list[0].expected_cluster_spec is None
        else:
            assert tc_list[0].expected_num_nodes == 1000

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    def check_zero_nodes_annotation(self, fail_greedy_tests):
        """An explicit @cluster(num_nodes=0) yields an empty (but non-None) cluster
        spec regardless of the fail_greedy_tests setting.
        """

        @cluster(num_nodes=0)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)
        tc_list = MarkedFunctionExpander(
            function=function, cluster=mock_cluster, session_context=session_context
        ).expand()
        assert len(tc_list) == 1
        assert tc_list[0].expected_num_nodes == 0
        assert tc_list[0].expected_cluster_spec is not None
        assert len(tc_list[0].expected_cluster_spec) == 0

    def check_with_parametrize(self):
        """@cluster metadata applies to a parametrized test case physically below it."""
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        @parametrize(x=1)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

    def check_beneath_parametrize(self):
        """Annotations such as cluster, which add metadata, but do not create new tests,
        add the metadata to all test cases physically below the annotation.

        In the case of a parametrized test, when @cluster has no parametrize annotations below it,
        there are not any test cases to which it will apply, so none of the resulting tests
        should have associated cluster size metadata.
        """
        num_nodes = 200

        @parametrize(x=1)
        @cluster(num_nodes=num_nodes)
        def f(x, y=2, z=3):
            return x, y, z

        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=f).expand()

    def check_with_nodes_default_parametrize_matrix(self):
        """The nearest @cluster above each parametrize/matrix wins; the topmost one
        serves as the default for marks with no closer @cluster.
        """
        default_num_nodes = 5
        parametrize_num_nodes = 3
        matrix_num_nodes = 7

        @cluster(num_nodes=default_num_nodes)
        @defaults(y=[5])
        @parametrize(x=100, y=200)
        @cluster(num_nodes=parametrize_num_nodes)
        @parametrize(x=10, y=20)
        @parametrize(x=30, y=40)
        @cluster(num_nodes=matrix_num_nodes)
        @matrix(x=[1, 2])
        def f(x, y):
            return x, y

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 5
        # {'x': 1, 'y': 5} -> using matrix with matrix_num_nodes
        assert test_context_list[0].expected_num_nodes == matrix_num_nodes
        # {'x': 2, 'y': 5} -> using matrix with matrix_num_nodes
        assert test_context_list[1].expected_num_nodes == matrix_num_nodes
        # {'x': 30, 'y': 40} -> using parametrize with cluster parametrize_num_nodes
        assert test_context_list[2].expected_num_nodes == parametrize_num_nodes
        # {'x': 10, 'y': 20} -> using parametrize with cluster parametrize_num_nodes
        assert test_context_list[3].expected_num_nodes == parametrize_num_nodes
        # {'x': 100, 'y': 200} -> using parametrize with default_num_nodes
        assert test_context_list[4].expected_num_nodes == default_num_nodes

    def check_no_override(self):
        """cluster use metadata should apply to all test cases physically below it,
        except for those which already have cluster use metadata.
        """
        num_nodes_1 = 200
        num_nodes_2 = 42

        # num_nodes_2 should *not* override num_nodes_1
        @cluster(num_nodes=num_nodes_2)
        @cluster(num_nodes=num_nodes_1)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes_1

    def check_parametrized_with_multiple_cluster_annotations(self):
        """Each @cluster applies to the parametrize marks between it and the next @cluster below."""
        num_nodes_1 = 10
        num_nodes_2 = 20

        # num_nodes_1 should *not* override num_nodes_2
        @cluster(num_nodes=num_nodes_1)
        @parametrize(x=1)
        @parametrize(x=1.5)
        @cluster(num_nodes=num_nodes_2)
        @parametrize(x=2)
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 3
        assert test_context_list[0].expected_num_nodes == num_nodes_1
        assert test_context_list[1].expected_num_nodes == num_nodes_1
        assert test_context_list[2].expected_num_nodes == num_nodes_2

    def check_matrix_with_multiple_cluster_annotations(self):
        """Each @cluster applies to the matrix marks between it and the next @cluster below."""
        num_nodes_1 = 10
        num_nodes_2 = 20

        # num_nodes_1 should *not* override num_nodes_2
        @cluster(num_nodes=num_nodes_1)
        @matrix(x=[1])
        @matrix(x=[1.5, 1.6], y=[-15, -16])
        @cluster(num_nodes=num_nodes_2)
        @matrix(x=[2])
        def f(x, y=2, z=3):
            return x, y, z

        test_context_list = MarkedFunctionExpander(function=f).expand()
        assert len(test_context_list) == 6
        assert test_context_list[0].expected_num_nodes == num_nodes_1
        assert test_context_list[1].expected_num_nodes == num_nodes_1
        assert test_context_list[2].expected_num_nodes == num_nodes_1
        assert test_context_list[3].expected_num_nodes == num_nodes_1
        assert test_context_list[4].expected_num_nodes == num_nodes_1
        assert test_context_list[5].expected_num_nodes == num_nodes_2

    def check_with_ignore(self):
        """@cluster metadata is preserved on ignored tests, in either decorator order."""
        num_nodes = 200

        @cluster(num_nodes=num_nodes)
        @ignore
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes

        # order shouldn't matter here
        @ignore
        @cluster(num_nodes=num_nodes)
        def function():
            return "hi"

        assert hasattr(function, "marks")

        test_context_list = MarkedFunctionExpander(function=function).expand()
        assert len(test_context_list) == 1
        assert test_context_list[0].expected_num_nodes == num_nodes


================================================
FILE: tests/mark/check_env.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.mark.mark_expander import MarkedFunctionExpander from ducktape.mark import env, is_env import os class CheckEnv(object): def check_does_not_raise_exception_when_key_not_exists(self): class C(object): @env(BLAH="8") def function(self): return 1 def check_has_env_annotation(self): class C(object): @env(JAVA_HOME="blah") def function(self): return 1 assert is_env(C.function) def check_is_ignored_if_env_not_correct(self): class C(object): @env(JAVA_HOME="blah") def function(self): return 1 context_list = MarkedFunctionExpander(function=C.function, cls=C).expand() assert context_list[0].ignore def check_is_not_ignore_if_correct_env(self): os.environ["test_key"] = "test" class C(object): @env(test_key="test") def function(self): return 1 context_list = MarkedFunctionExpander(function=C.function, cls=C).expand() assert not context_list[0].ignore ================================================ FILE: tests/mark/check_ignore.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.mark.mark_expander import MarkedFunctionExpander
from ducktape.mark import ignore, ignored, parametrize, matrix

import pytest


class CheckIgnore(object):
    """Checks for the @ignore mark: ignored test cases still expand into
    contexts, but the matching contexts are flagged as ignored.
    """

    def check_simple(self):
        """@ignore on an unparametrized function ignores its single context."""

        @ignore
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)

        context_list = MarkedFunctionExpander(function=function).expand()
        assert len(context_list) == 1
        assert context_list[0].ignore

    def check_simple_method(self):
        """@ignore on an unparametrized method ignores its single context."""

        class C(object):
            @ignore
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)

        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 1
        assert context_list[0].ignore

    def check_ignore_all(self):
        """Bare @ignore above a stack of parametrize marks ignores every expanded context."""

        @ignore
        @parametrize(x=100, y=200, z=300)
        @parametrize(x=100, z=300)
        @parametrize(y=200)
        @parametrize()
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)

        context_list = MarkedFunctionExpander(function=function).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            assert ctx.ignore

    def check_ignore_all_method(self):
        """Check @ignore() with no arguments used with various parametrizations on a method."""

        class C(object):
            @ignore
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @matrix(x=[1, 2, 3])
            @parametrize()
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)

        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 7
        for ctx in context_list:
            assert ctx.ignore

    def check_ignore_specific(self):
        """@ignore(**args) ignores only the context whose injected args match exactly."""

        @ignore(x=100, y=200, z=300)
        @parametrize(x=100, y=200, z=300)
        @parametrize(x=100, z=300)
        @parametrize(y=200)
        @parametrize()
        def function(x=1, y=2, z=3):
            return x, y, z

        assert ignored(function)

        context_list = MarkedFunctionExpander(None, function=function).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            if ctx.injected_args == {"x": 100, "y": 200, "z": 300}:
                assert ctx.ignore
            else:
                assert not ctx.ignore

    def check_ignore_specific_method(self):
        """Same as check_ignore_specific, but on a method."""

        class C(object):
            @ignore(x=100, y=200, z=300)
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)

        context_list = MarkedFunctionExpander(function=C.function, cls=C).expand()
        assert len(context_list) == 4
        for ctx in context_list:
            if ctx.injected_args == {"x": 100, "y": 200, "z": 300}:
                assert ctx.ignore
            else:
                assert not ctx.ignore

    def check_invalid_specific_ignore(self):
        """If there are no test cases to which ignore applies, it should raise an error

        Keeping in mind annotations "point down": they only apply to test cases physically below.
        """

        class C(object):
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            @ignore(x=100, y=200, z=300)
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=C.function, cls=C).expand()

    def check_invalid_ignore_all(self):
        """If there are no test cases to which ignore applies, it should raise an error"""

        class C(object):
            @parametrize(x=100, y=200, z=300)
            @parametrize(x=100, z=300)
            @parametrize(y=200)
            @parametrize()
            @ignore
            def function(self, x=1, y=2, z=3):
                return x, y, z

        assert ignored(C.function)
        with pytest.raises(AssertionError):
            MarkedFunctionExpander(function=C.function, cls=C).expand()


================================================
FILE: tests/mark/check_parametrize.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. from ducktape.mark import parametrize, parametrized, matrix, defaults from ducktape.mark.mark_expander import MarkedFunctionExpander class CheckParametrize(object): def check_simple(self): @parametrize(x=100, z=300) def function(x=1, y=2, z=3): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 1 assert injected_args_list[0] == {"x": 100, "z": 300} context_list = MarkedFunctionExpander(function=function).expand() all_f = [cxt.function for cxt in context_list] assert all_f[0]() == (100, 2, 300) def check_simple_method(self): class C(object): @parametrize(x=100, z=300) def function(self, x=1, y=2, z=3): return x, y, z assert parametrized(C.function) injected_args_list = [m.injected_args for m in C.function.marks] assert len(injected_args_list) == 1 assert injected_args_list[0] == {"x": 100, "z": 300} context_list = MarkedFunctionExpander(None, function=C.function).expand() all_f = [cxt.function for cxt in context_list] assert all_f[0](C()) == (100, 2, 300) def check_stacked(self): @parametrize(x=100, y=200, z=300) @parametrize(x=100, z=300) @parametrize(y=200) @parametrize() def function(x=1, y=2, z=3): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 4 context_list = MarkedFunctionExpander(None, function=function).expand() all_f = [cxt.function for cxt in context_list] assert all_f[0]() == (100, 200, 300) assert all_f[1]() == (100, 2, 300) assert all_f[2]() == (1, 200, 3) assert all_f[3]() == (1, 2, 3) def check_stacked_method(self): class C(object): @parametrize(x=100, y=200, z=300) @parametrize(x=100, z=300) @parametrize(y=200) @parametrize() def function(self, x=1, y=2, z=3): return x, y, z assert parametrized(C.function) injected_args_list = [m.injected_args for m in 
C.function.marks] assert len(injected_args_list) == 4 context_list = MarkedFunctionExpander(None, function=C.function).expand() all_f = [cxt.function for cxt in context_list] c = C() assert all_f[0](c) == (100, 200, 300) assert all_f[1](c) == (100, 2, 300) assert all_f[2](c) == (1, 200, 3) assert all_f[3](c) == (1, 2, 3) class CheckMatrix(object): def check_simple(self): @matrix(x=[1, 2], y=[-1, -2]) def function(x=1, y=2, z=3): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 1 context_list = MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 4 for ctx in context_list: f = ctx.function injected_args = ctx.injected_args assert f() == (injected_args["x"], injected_args["y"], 3) def check_simple_method(self): class C(object): @matrix(x=[1, 2], y=[-1, -2]) def function(self, x=1, y=2, z=3): return x, y, z assert parametrized(C.function) injected_args_list = [m.injected_args for m in C.function.marks] assert len(injected_args_list) == 1 context_list = MarkedFunctionExpander(None, function=C.function).expand() assert len(context_list) == 4 c = C() for ctx in context_list: f = ctx.function injected_args = ctx.injected_args assert f(c) == (injected_args["x"], injected_args["y"], 3) def check_stacked(self): @matrix(x=[1, 2], y=[0]) @matrix(x=[-1], z=[-10]) def function(x=1, y=2, z=3): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 2 context_list = MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 3 expected_output = {(1, 0, 3), (2, 0, 3), (-1, 2, -10)} output = set() for c in context_list: output.add(c.function()) assert output == expected_output def check_stacked_method(self): class C(object): @matrix(x=[1, 2], y=[0]) @matrix(x=[-1], z=[-10]) def function(self, x=1, y=2, z=3): return x, y, z assert 
parametrized(C.function) injected_args_list = [m.injected_args for m in C.function.marks] assert len(injected_args_list) == 2 context_list = MarkedFunctionExpander(None, function=C.function).expand() assert len(context_list) == 3 expected_output = {(1, 0, 3), (2, 0, 3), (-1, 2, -10)} output = set() for ctx in context_list: output.add(ctx.function(C())) assert output == expected_output class CheckDefaults(object): def check_defaults(self): @defaults(z=[1, 2]) @matrix(x=[1], y=[1, 2]) @parametrize(x=3, y=4) def function(x=1, y=2, z=-1): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 3 context_list = MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 6 expected_output = { (1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 2), (3, 4, 1), (3, 4, 2), } output = set() for ctx in context_list: output.add(ctx.function()) assert output == expected_output def check_defaults_method(self): class C(object): @defaults(z=[1, 2]) @matrix(x=[1], y=[1, 2]) @parametrize(x=3, y=4) def function(self, x=1, y=2, z=-1): return x, y, z assert parametrized(C.function) injected_args_list = [m.injected_args for m in C.function.marks] assert len(injected_args_list) == 3 context_list = MarkedFunctionExpander(None, function=C.function).expand() assert len(context_list) == 6 expected_output = { (1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 2), (3, 4, 1), (3, 4, 2), } output = set() c = C() for ctx in context_list: f = ctx.function injected_args = ctx.injected_args assert f(c) == (injected_args["x"], injected_args["y"], injected_args["z"]) output.add(ctx.function(C())) assert output == expected_output def check_overlap_param(self): @defaults(y=[3, 4], z=[1, 2]) @parametrize(w=1, x=2, y=3) def function(w=10, x=20, y=30, z=40): return w, x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 2 context_list = 
MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 2 expected_output = {(1, 2, 3, 1), (1, 2, 3, 2)} output = set() for ctx in context_list: output.add(ctx.function()) assert output == expected_output def check_overlap_matrix(self): @defaults(y=[3, 4], z=[1, 2]) @matrix(x=[1, 2], y=[5, 6]) def function(x=20, y=30, z=40): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 2 context_list = MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 8 expected_output = { (1, 5, 1), (1, 5, 2), (2, 5, 1), (2, 5, 2), (1, 6, 1), (1, 6, 2), (2, 6, 1), (2, 6, 2), } output = set() for ctx in context_list: output.add(ctx.function()) assert output == expected_output def check_only_defaults(self): @defaults(x=[3], z=[1, 2]) def function(x=1, y=2, z=-1): return x, y, z assert parametrized(function) injected_args_list = [m.injected_args for m in function.marks] assert len(injected_args_list) == 1 context_list = MarkedFunctionExpander(None, function=function).expand() assert len(context_list) == 2 expected_output = {(3, 2, 1), (3, 2, 2)} output = set() for ctx in context_list: output.add(ctx.function()) assert output == expected_output ================================================ FILE: tests/mark/resources/__init__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
================================================ FILE: tests/reporter/check_symbol_reporter.py ================================================ from pathlib import Path from unittest.mock import Mock from ducktape.tests.reporter import FailedTestSymbolReporter def check_to_symbol_no_args(tmp_path): result = Mock( file_name="/test_folder/test_file", cls_name="TestClass", function_name="test_func", injected_args=None, ) reporter = FailedTestSymbolReporter(Mock()) reporter.working_dir = Path("/") assert reporter.to_symbol(result) == "test_folder/test_file::TestClass.test_func" def check_to_symbol_relative_path(tmp_path): result = Mock( file_name="/test_folder/test_file", cls_name="TestClass", function_name="test_func", injected_args=None, ) reporter = FailedTestSymbolReporter(Mock()) reporter.working_dir = Path("/test_folder") assert reporter.to_symbol(result) == "test_file::TestClass.test_func" def check_to_symbol_with_args(): result = Mock( file_name="/test_folder/test_file", cls_name="TestClass", function_name="test_func", injected_args={"arg": "val"}, ) reporter = FailedTestSymbolReporter(Mock()) reporter.working_dir = Path("/") assert reporter.to_symbol(result) == 'test_folder/test_file::TestClass.test_func@{"arg":"val"}' ================================================ FILE: tests/runner/__init__.py ================================================ ================================================ FILE: tests/runner/check_runner.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from unittest.mock import patch import pytest from ducktape.cluster.node_container import NodeContainer, InsufficientResourcesError from ducktape.tests.runner_client import RunnerClient from ducktape.tests.status import PASS, FAIL from ducktape.tests.test_context import TestContext from ducktape.tests.runner import TestRunner from ducktape.mark.mark_expander import MarkedFunctionExpander from ducktape.cluster.localhost import LocalhostCluster from tests.ducktape_mock import FakeCluster, MockSender import tests.ducktape_mock from tests.runner.resources.test_fails_to_init import FailsToInitTest from tests.runner.resources.test_fails_to_init_in_setup import FailsToInitInSetupTest from .resources.test_bad_actor import BadActorTest from .resources.test_thingy import ClusterTestThingy, TestThingy from .resources.test_failing_tests import FailingTest from ducktape.tests.reporter import JUnitReporter from unittest.mock import Mock, MagicMock import os import xml.etree.ElementTree as ET from .resources.test_various_num_nodes import VariousNumNodesTest TEST_THINGY_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_thingy.py")) FAILING_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_failing_tests.py")) FAILS_TO_INIT_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_fails_to_init.py")) FAILS_TO_INIT_IN_SETUP_TEST_FILE = os.path.abspath( os.path.join(os.path.dirname(__file__), "resources/test_fails_to_init_in_setup.py") ) VARIOUS_NUM_NODES_TEST_FILE = os.path.abspath( os.path.join(os.path.dirname(__file__), "resources/test_various_num_nodes.py") ) BAD_ACTOR_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_bad_actor.py")) class CheckRunner(object): def check_insufficient_cluster_resources(self): """The test runner should behave sensibly when the 
cluster is too small to run a given test.""" mock_cluster = FakeCluster(1) session_context = tests.ducktape_mock.session_context() test_context = TestContext( session_context=session_context, module=None, cls=TestThingy, function=TestThingy.test_pi, file=TEST_THINGY_FILE, cluster=mock_cluster, cluster_use_metadata={"num_nodes": 1000}, ) runner = TestRunner(mock_cluster, session_context, Mock(), [test_context], 1) # Even though the cluster is too small, the test runner should this handle gracefully without raising an error results = runner.run_all_tests() assert len(results) == 1 assert results.num_failed == 1 assert results.num_passed == 0 assert results.num_ignored == 0 def _do_expand(self, test_file, test_class, test_methods, cluster=None, session_context=None): ctx_list = [] for f in test_methods: ctx_list.extend( MarkedFunctionExpander( session_context=session_context, cls=test_class, function=f, file=test_file, cluster=cluster, ).expand() ) return ctx_list def check_simple_run(self): """Check expected behavior when running a single test.""" mock_cluster = LocalhostCluster(num_nodes=1000) session_context = tests.ducktape_mock.session_context() test_methods = [ TestThingy.test_pi, TestThingy.test_ignore1, TestThingy.test_ignore2, TestThingy.test_failure, ] ctx_list = self._do_expand( test_file=TEST_THINGY_FILE, test_class=TestThingy, test_methods=test_methods, cluster=mock_cluster, session_context=session_context, ) runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1) results = runner.run_all_tests() assert len(results) == 4 assert results.num_flaky == 0 assert results.num_failed == 1 assert results.num_passed == 1 assert results.num_ignored == 2 result_with_data = [r for r in results if r.data is not None][0] assert result_with_data.data == {"data": 3.14159} def check_deflake_run(self): """Check expected behavior when running a single test.""" mock_cluster = LocalhostCluster(num_nodes=1000) session_context = 
tests.ducktape_mock.session_context()
        # NOTE(review): the lines above/below this point continue a method whose `def` line is
        # outside this chunk; it runs TestThingy.test_flaky and test_failure with 2 repetitions
        # and expects exactly one flaky and one failed result.
        test_methods = [TestThingy.test_flaky, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 2)
        results = runner.run_all_tests()
        assert len(results) == 2
        assert results.num_flaky == 1
        assert results.num_failed == 1
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_runner_report_junit(self):
        """Check we can serialize results into a xunit xml format.

        Also ensures that the XML report adheres to the Junit spec using xpath queries.
        """
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [
            TestThingy.test_pi,
            TestThingy.test_ignore1,
            TestThingy.test_ignore2,
            TestThingy.test_failure,
        ]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        JUnitReporter(results).report()
        xml_report = os.path.join(session_context.results_dir, "report.xml")
        assert os.path.exists(xml_report)
        tree = ET.parse(xml_report)
        # 4 testcases total: 1 failure, 2 skipped (ignored), 1 passed
        assert len(tree.findall("./testsuite/testcase/failure")) == 1
        assert len(tree.findall("./testsuite/testcase/skipped")) == 2
        assert len(tree.findall("./testsuite/testcase")) == 4
        passed = tree.findall("./testsuite/testcase/[@status='pass']")
        assert len(passed) == 1
        assert passed[0].get("classname") == "TestThingy"
        assert passed[0].get("name") == "test_pi"
        failures = tree.findall("./testsuite/testcase/[@status='fail']")
        assert len(failures) == 1
        assert failures[0].get("classname") == "TestThingy"
        assert failures[0].get("name") == "test_failure"
        ignores = tree.findall("./testsuite/testcase/[@status='ignore']")
        assert len(ignores) == 2
        assert ignores[0].get("classname") == "TestThingy"
        assert ignores[1].get("classname") == "TestThingy"
        # parametrized ignored test carries its parameters in the reported name
        assert ignores[0].get("name") == "test_ignore1"
        assert ignores[1].get("name") == "test_ignore2.x=5"

    def check_exit_first(self):
        """Confirm that exit_first in session context has desired effect of preventing
        any tests from running after the first test failure.
        """
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(**{"exit_first": True})
        test_methods = [FailingTest.test_fail]
        ctx_list = self._do_expand(
            test_file=FAILING_TEST_FILE,
            test_class=FailingTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        # more than one context was expanded, but only one result was produced:
        # the runner stopped after the first failure
        assert len(ctx_list) > 1
        assert len(results) == 1

    def check_exits_if_failed_to_initialize(self):
        """Validate that runner exits correctly when tests failed to initialize."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        ctx_list = self._do_expand(
            test_file=FAILS_TO_INIT_TEST_FILE,
            test_class=FailsToInitTest,
            test_methods=[FailsToInitTest.test_nothing],
            cluster=mock_cluster,
            session_context=session_context,
        )
        ctx_list.extend(
            self._do_expand(
                test_file=FAILS_TO_INIT_IN_SETUP_TEST_FILE,
                test_class=FailsToInitInSetupTest,
                test_methods=[FailsToInitInSetupTest.test_nothing],
                cluster=mock_cluster,
                session_context=session_context,
            )
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        # These tests fail to initialize, each class has two test methods, so should have 4 results, all failed
        assert len(results) == 4
        assert results.num_flaky == 0
        assert results.num_failed == 4
        assert results.num_passed == 0
        assert results.num_ignored == 0

    # mock an error reporting test failure - this should not prevent subsequent tests from execution and mark
    # failed test as failed correctly
    @patch.object(RunnerClient, "_exc_msg", side_effect=Exception)
    def check_sends_result_when_error_reporting_exception(self, exc_msg_mock):
        """Validates that an error when reporting an exception in the test doesn't prevent
        subsequent tests from executing"""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_failure, TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        # test_pi still passes even though reporting test_failure's exception itself raised
        assert len(results) == 2
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 1
        assert results.num_ignored == 0

    def check_run_failure_with_bad_cluster_allocation(self):
        """Check test should be marked failed if it under-utilizes the cluster resources."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(
            fail_bad_cluster_utilization="fail_bad_cluster_utilization"
        )
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=ClusterTestThingy,
            test_methods=[ClusterTestThingy.test_bad_num_nodes],
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert len(results) == 1
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_test_failure_with_too_many_nodes_requested(self):
        """A test that tries to allocate more nodes than its @cluster annotation declares
        should fail gracefully, without preventing the next test from running."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(debug=True)
        ctx_list = self._do_expand(
            test_file=BAD_ACTOR_TEST_FILE,
            test_class=BadActorTest,
            test_methods=[BadActorTest.test_too_many_nodes],
            cluster=mock_cluster,
            session_context=session_context,
        )
        ctx_list.extend(
            self._do_expand(
                test_file=VARIOUS_NUM_NODES_TEST_FILE,
                test_class=VariousNumNodesTest,
                test_methods=[VariousNumNodesTest.test_one_node_a],
                cluster=mock_cluster,
                session_context=session_context,
            )
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert results.num_flaky == 0
        assert results.num_failed == 1
        assert results.num_passed == 1
        assert results.num_ignored == 0
        passed = [r for r in results if r.test_status == PASS]
        failed = [r for r in results if r.test_status == FAIL]
        assert passed[0].test_id == "tests.runner.resources.test_various_num_nodes.VariousNumNodesTest.test_one_node_a"
        assert failed[0].test_id == "tests.runner.resources.test_bad_actor.BadActorTest.test_too_many_nodes"

    def check_runner_timeout(self):
        """Check process cleanup and error handling when runner times out.

        When timeout occurs, the runner should:
        1. Clean up all client processes
        2. Mark active tests as failed with 'Test timed out' message
        3. Mark remaining unrun tests as failed with 'Test not run' message
        4. Return results gracefully instead of raising
        """
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(max_parallel=1000, test_runner_timeout=1)
        test_methods = [TestThingy.test_delayed, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        # Should return results gracefully instead of raising
        results = runner.run_all_tests()
        # All client processes should be cleaned up
        assert not runner._client_procs
        # Both tests should be marked as failed
        assert len(results) == 2
        assert results.num_failed == 2
        assert results.num_passed == 0
        # Check that failure summaries contain timeout information
        for result in results:
            assert result.test_status == FAIL
            # Summary should mention either "timed out" or "not run" with the exception message
            assert "runner client unresponsive" in result.summary.lower()

    @pytest.mark.parametrize("fail_greedy_tests", [True, False])
    def check_fail_greedy_tests(self, fail_greedy_tests):
        """When fail_greedy_tests is set, tests with an empty/missing @cluster annotation
        are failed instead of being allowed to grab the whole cluster."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context(fail_greedy_tests=fail_greedy_tests)
        test_methods = [
            VariousNumNodesTest.test_empty_cluster_annotation,
            VariousNumNodesTest.test_no_cluster_annotation,
            VariousNumNodesTest.test_zero_nodes,
        ]
        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert results.num_flaky == 0
        assert results.num_failed == (2 if fail_greedy_tests else 0)
        # zero-node test should always pass, whether we fail on greedy or not
        assert results.num_passed == (1 if fail_greedy_tests else 3)
        assert results.num_ignored == 0
    def check_cluster_shrink(self):
        """
        Check what happens if cluster loses a node while the runner is already running.
        The schedule has two 5-node tests, and one each of 4, 3, and 2 nodes.
        Both 5-node tests should fail: the first one fails during the pre-allocation
        phase (the cluster shrinks to 4 nodes on its alloc call), and the second one
        shouldn't even attempt to be allocated. All the other tests should still pass.
        """
        mock_cluster = ShrinkingLocalhostCluster(num_nodes=5)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)
        test_methods = [
            VariousNumNodesTest.test_five_nodes_a,
            VariousNumNodesTest.test_five_nodes_b,
            VariousNumNodesTest.test_four_nodes,
            VariousNumNodesTest.test_three_nodes_a,
            VariousNumNodesTest.test_two_nodes_a,
        ]
        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert len(results) == 5
        assert results.num_flaky == 0
        assert results.num_failed == 2  # both of the 5-node tests should fail
        assert results.num_passed == 3  # 4-node, 3-node and 2-node should all pass
        assert results.num_ignored == 0

    def check_cluster_shrink_reschedule(self):
        """
        Test that the test that failed to schedule initially due to a node going offline
        is not lost and is still scheduled when more nodes become available.

        We start with a 6-node cluster. First we run a long-ish 3-node test, leaving
        3 nodes available. Then when trying to run a second 3-node test, we shrink the
        cluster, emulating one of the nodes going down - this leaves only 2 nodes
        available, so we cannot run this test. However, after the first 3-node test
        finishes running, it will return its 3 nodes back to the cluster, so the second
        3-node test becomes schedulable again - this is what we test for.

        Also two two-node tests should pass too - they should be scheduled before the
        second 3-node test, while two nodes are waiting for the first 3-node test to
        finish.

        It's generally not a good practice to rely on sleep, but I think it's acceptable
        in this case, since we do need to rely on parallelism.
        """
        mock_cluster = ShrinkingLocalhostCluster(num_nodes=6, shrink_on=2)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)
        test_methods = [
            VariousNumNodesTest.test_three_nodes_asleep,
            VariousNumNodesTest.test_three_nodes_b,
            VariousNumNodesTest.test_two_nodes_a,
            VariousNumNodesTest.test_two_nodes_b,
        ]
        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert len(results) == 4
        assert results.num_flaky == 0
        assert results.num_failed == 0
        assert results.num_passed == 4
        assert results.num_ignored == 0
        # normal order on a 6-node cluster would be:
        # - test_three_nodes_asleep, test_three_nodes_b, test_two_nodes_a, test_two_nodes_b
        # however the cluster would shrink to 5 nodes after scheduling the first 3-node test,
        # leaving no space for the second 3-node test to be scheduled, bumping it down the line,
        # while two 2-node tests will be scheduled alongside
        expected_scheduling_order = [
            "VariousNumNodesTest.test_three_nodes_asleep",
            "VariousNumNodesTest.test_two_nodes_a",
            "VariousNumNodesTest.test_two_nodes_b",
            "VariousNumNodesTest.test_three_nodes_b",
        ]
        # We check the actual order the tests were scheduled in, since completion order might be different,
        # with so many fast tests running in parallel.
        actual_scheduling_order = [x.test_id for x in runner.test_schedule_log]
        assert actual_scheduling_order == expected_scheduling_order

    def check_cluster_shrink_to_zero(self):
        """
        Validates that if the cluster is shrunk to zero nodes size, no tests can run,
        but we still exit gracefully.
        """
        mock_cluster = ShrinkingLocalhostCluster(num_nodes=1, shrink_on=1)
        session_context = tests.ducktape_mock.session_context(max_parallel=10)
        test_methods = [
            VariousNumNodesTest.test_one_node_a,
            VariousNumNodesTest.test_one_node_b,
        ]
        ctx_list = self._do_expand(
            test_file=VARIOUS_NUM_NODES_TEST_FILE,
            test_class=VariousNumNodesTest,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        results = runner.run_all_tests()
        assert len(results) == 2
        assert results.num_flaky == 0
        assert results.num_failed == 2
        assert results.num_passed == 0
        assert results.num_ignored == 0

    def check_runner_client_report(self):
        """Run a RunnerClient end-to-end with a mocked sender and a patched test context,
        and verify the final message it sends is a FINISHED event with a passing result."""
        mock_cluster = tests.ducktape_mock.mock_cluster()
        session_context = tests.ducktape_mock.session_context()
        test_context = tests.ducktape_mock.test_context(session_context=session_context)
        rc = RunnerClient(
            "localhost",
            22,
            test_context.test_id,
            0,
            "dummy",
            "/tmp/dummy",
            True,
            False,
            5,
        )
        rc.sender = MockSender()
        rc.cluster = mock_cluster
        rc.session_context = session_context
        rc.test_metadata = MagicMock()
        with patch("ducktape.tests.runner_client.RunnerClient._collect_test_context") as test_collect_patch:
            test_collect_patch.return_value = test_context
            rc.run()
        # get the last message, get the args send to that message, and get the first arg
        finished_result = rc.sender.send_results[-1][0][0]
        assert finished_result.get("event_type") == "FINISHED"
        assert finished_result["result"].summary == "Test Passed"

    def check_report_remaining_as_failed(self):
        """Test that _report_remaining_as_failed marks all unrun tests as failed."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_pi, TestThingy.test_failure]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        # Call the method directly
        reason = "test reason for failure"
        runner._report_remaining_as_failed(reason)
        # All tests should be marked as failed
        assert len(runner.results) == 2
        assert runner.results.num_failed == 2
        for result in runner.results:
            assert result.test_status == FAIL
            assert f"Test not run: {reason}" in result.summary
        # Scheduler should be empty after draining
        assert len(runner.scheduler) == 0

    def check_report_active_as_failed(self):
        """Test that _report_active_as_failed marks running tests as failed."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        # Simulate an active test using the actual test_id from ctx_list
        from ducktape.tests.runner import TestKey
        import time

        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)
        runner.active_tests[test_key] = True
        runner.client_report[test_key] = {"runner_start_time": time.time() - 10}
        # Call the method
        reason = "test timeout reason"
        runner._report_active_as_failed(reason)
        # Test should be marked as failed
        assert len(runner.results) == 1
        assert runner.results.num_failed == 1
        results_list = [r for r in runner.results]
        assert len(results_list) == 1
        result = results_list[0]
        assert result.test_status == FAIL
        assert f"Test timed out: {reason}" in result.summary
        assert result.start_time < result.stop_time
        # Active tests should be cleared
        assert len(runner.active_tests) == 0

    def check_report_active_as_failed_frees_cluster(self):
        """Test that _report_active_as_failed frees cluster resources."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        # Simulate an active test with allocated cluster
        from ducktape.tests.runner import TestKey
        from ducktape.cluster.finite_subcluster import FiniteSubcluster
        import time

        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)
        # Allocate cluster nodes for this test
        allocated_nodes = mock_cluster.alloc(test_ctx.expected_cluster_spec)
        runner._test_cluster[test_key] = FiniteSubcluster(allocated_nodes)
        runner.active_tests[test_key] = True
        runner.client_report[test_key] = {"runner_start_time": time.time() - 10}
        # Check that nodes are allocated
        initial_available = mock_cluster.num_available_nodes()
        assert test_key in runner._test_cluster
        # Call the method
        reason = "test timeout reason"
        runner._report_active_as_failed(reason)
        # Cluster resources should be freed
        assert test_key not in runner._test_cluster
        assert mock_cluster.num_available_nodes() == initial_available + len(allocated_nodes)
        # Active tests should be cleared
        assert len(runner.active_tests) == 0

    def check_runner_client_shutdown_flag(self):
        """Test that runner client respects shutdown flag when sending messages."""
        mock_cluster = tests.ducktape_mock.mock_cluster()
        session_context = tests.ducktape_mock.session_context()
        test_context = tests.ducktape_mock.test_context(session_context=session_context)
        rc = RunnerClient(
            "localhost",
            22,
            test_context.test_id,
            0,
            "dummy",
            "/tmp/dummy",
            True,
            False,
            5,
        )
        rc.sender = MockSender()
        rc.cluster = mock_cluster
        rc.session_context = session_context
        # Set shutdown flag
        rc.runner_shutting_down = True
        # Try to send a message - should return None without sending
        from ducktape.tests.event import ClientEventFactory

        message_factory = ClientEventFactory(test_context.test_id, 0, "test-client")
        result = rc.send(message_factory.ready())
        assert result is None
        # No messages should have been sent
        assert len(rc.sender.send_results) == 0

    def check_duplicate_finished_message_handling(self):
        """Test that duplicate FINISHED messages are handled correctly with three distinct cases."""
        from ducktape.tests.runner import TestKey
        from ducktape.tests.result import TestResult
        from ducktape.cluster.finite_subcluster import FiniteSubcluster
        import time

        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        runner = TestRunner(mock_cluster, session_context, Mock(), ctx_list, 1)
        # Mock the receiver.send to avoid ZMQ socket issues in tests
        runner.receiver.send = Mock()
        # Simulate a test that has been started
        test_ctx = ctx_list[0]
        test_key = TestKey(test_ctx.test_id, 1)
        runner.active_tests[test_key] = True
        # Allocate cluster for this test
        allocated_nodes = mock_cluster.alloc(test_ctx.expected_cluster_spec)
        runner._test_cluster[test_key] = FiniteSubcluster(allocated_nodes)
        # Mock a client process
        runner._client_procs[test_key] = Mock()
        runner._client_procs[test_key].is_alive.return_value = False
        runner._client_procs[test_key].join.return_value = None
        runner._client_procs[test_key].exitcode = 0
        runner._client_procs[test_key].name = "MockProcess"
        # Create a FINISHED event
        from ducktape.tests.event import ClientEventFactory

        event_factory = ClientEventFactory(test_ctx.test_id, 1, "test-client")
        result = TestResult(test_ctx, 1, session_context, PASS, "Test passed", None, time.time(), time.time())
        event = event_factory.finished(result=result)
        # Case 1: Normal first FINISHED - should process normally
        runner._handle_finished(event)
        assert test_key not in runner.active_tests
        assert test_key in runner.finished_tests
        assert len(runner.results) == 1
        assert test_key not in runner._test_cluster
        assert runner.receiver.send.call_count == 1
        # Case 2: Duplicate FINISHED (ZMQ retry) - should be ignored gracefully
        runner._handle_finished(event)
        assert len(runner.results) == 1  # Should not add duplicate
        assert runner.receiver.send.call_count == 2  # ACK still sent
        # Case 3: FINISHED for unknown test - should raise RuntimeError
        unknown_event_factory = ClientEventFactory("unknown.test.id", 999, "test-client")
        unknown_result = TestResult(test_ctx, 999, session_context, PASS, "Test passed", None, time.time(), time.time())
        unknown_event = unknown_event_factory.finished(result=unknown_result)
        with pytest.raises(RuntimeError, match="not in active_tests"):
            runner._handle_finished(unknown_event)

    def check_timeout_exception_join_timeout_param(self):
        """Test that timeout_exception_join_timeout parameter is used correctly."""
        mock_cluster = LocalhostCluster(num_nodes=1000)
        session_context = tests.ducktape_mock.session_context()
        test_methods = [TestThingy.test_pi]
        ctx_list = self._do_expand(
            test_file=TEST_THINGY_FILE,
            test_class=TestThingy,
            test_methods=test_methods,
            cluster=mock_cluster,
            session_context=session_context,
        )
        # Create runner with custom timeout values
        runner = TestRunner(
            mock_cluster,
            session_context,
            Mock(),
            ctx_list,
            1,
            finish_join_timeout=10,
            timeout_exception_join_timeout=60,
        )
        assert runner.finish_join_timeout == 10
        assert runner.timeout_exception_join_timeout == 60


class ShrinkingLocalhostCluster(LocalhostCluster):
    """LocalhostCluster that drops one node from the allocation on the Nth do_alloc call,
    emulating a node going offline mid-run."""

    def __init__(self, *args, shrink_on=1, **kwargs):
        super().__init__(*args, **kwargs)
        self.bad_nodes = NodeContainer()
        # which call to shrink on
        self.shrink_on = shrink_on
        self.num_alloc_calls = 0

    def do_alloc(self, cluster_spec):
        allocated = super().do_alloc(cluster_spec)
        self.num_alloc_calls += 1
        if self.shrink_on == self.num_alloc_calls:
            bad_node = allocated.pop()
            self._in_use_nodes.remove_node(bad_node)
            self.bad_nodes.add_node(bad_node)
            # simplified logic, we know all nodes are of the same OS/type
            # check if we don't have enough nodes any more
            # (which really should be true every time, since the largest test would be scheduled)
            if len(allocated) < len(cluster_spec):
                # return all good nodes back to be available
                for node in allocated:
                    self._in_use_nodes.remove_node(node)
                    self._available_nodes.add_node(node)
                raise InsufficientResourcesError("yeah")
        return allocated



================================================
FILE: tests/runner/check_runner_memory.py
================================================
# Copyright 2015 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ducktape.tests.runner import TestRunner from ducktape.mark.mark_expander import MarkedFunctionExpander from ducktape.cluster.localhost import LocalhostCluster from .resources.test_memory_leak import MemoryLeakTest import math from memory_profiler import memory_usage import os from queue import Queue import statistics from statistics import mean import tests.ducktape_mock from mock import Mock N_TEST_CASES = 5 MEMORY_LEAK_TEST_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources/test_memory_leak.py")) class InstrumentedTestRunner(TestRunner): """Identical to TestRunner, except dump memory used by the current process before running each test. """ def __init__(self, *args, **kwargs): self.queue = kwargs.get("queue") del kwargs["queue"] super(InstrumentedTestRunner, self).__init__(*args, **kwargs) def _run_single_test(self, test_context): # write current memory usage to file before running the test pid = os.getpid() current_memory = memory_usage(pid)[0] self.queue.put(current_memory) super(InstrumentedTestRunner, self)._run_single_test(test_context) class CheckMemoryUsage(object): def setup_method(self, _): self.cluster = LocalhostCluster(num_nodes=100) self.session_context = tests.ducktape_mock.session_context() def check_for_inter_test_memory_leak(self): """Until v0.3.10, ducktape had a serious source of potential memory leaks. Because test_context objects held a reference to all services for the duration of a test run, the memory used by any individual service would not be garbage-collected until well after *all* tests had run. This memory leak was discovered in Kafka system tests, where many long-running system tests were enough to cumulatively use up the memory on the test machine, causing a cascade of test failures due to inability to allocate any more memory. This test provides a regression check against this type of memory leak; it fails without the fix, and passes with it. 
""" # Get a list of test_context objects for the test runner ctx_list = [] test_methods = [MemoryLeakTest.test_leak] for f in test_methods: ctx_list.extend( MarkedFunctionExpander( session_context=self.session_context, cls=MemoryLeakTest, function=f, file=MEMORY_LEAK_TEST_FILE, cluster=self.cluster, ).expand() ) assert len(ctx_list) == N_TEST_CASES # Sanity check q = Queue() runner = InstrumentedTestRunner(self.cluster, self.session_context, Mock(), ctx_list, 1, queue=q) runner.run_all_tests() measurements = [] while not q.empty(): measurements.append(q.get()) self.validate_memory_measurements(measurements) def validate_memory_measurements(self, measurements): """A rough heuristic to check that stair-case style memory leak is not present. The idea is that when well-behaved, in this specific test, the maximum memory usage should be near the "middle". Here we check that the maximum usage is within 5% of the median memory usage. What is meant by stair-case? When the leak was present in its most blatant form, each repetition of MemoryLeak.test_leak run by the test runner adds approximately a fixed amount of memory without freeing much, resulting in a memory usage profile that looks like a staircase going up to the right. 
""" median_usage = statistics.median(measurements) max_usage = max(measurements) usage_stats = "\nmax: %s,\nmedian: %s,\nall: %s\n" % ( max_usage, median_usage, measurements, ) # we want to make sure that max usage doesn't exceed median usage by very much relative_diff = (max_usage - median_usage) / median_usage slope = self._linear_regression_slope(measurements) if slope > 0: # check max memory usage iff the memory measurements seem to be increasing overall assert relative_diff <= 0.05, ( "max usage exceeded median usage by too much; there may be a memory leak: %s" % usage_stats ) def _linear_regression_slope(self, arr): """Return the sign of the slope of the least squares fit line.""" assert len(arr) > 0 x_vals = [i for i in range(len(arr))] mean_x = mean(x_vals) mean_y = mean(arr) # mean([x_i * y_i]) - mean_x * mean_y # slope = ----------------------------------- # variance([x_i]) # # where variance is (1/N) * sum([(x_i - mean_x)^2]) # # the denominator in regression formula is always positive, so it's enough to compute the numerator slope_numerator = mean([i * arr[i] for i in x_vals]) slope_numerator = slope_numerator - (mean_x * mean_y) # return the sign return math.copysign(slope_numerator, 1) ================================================ FILE: tests/runner/check_sender_receiver.py ================================================ # Copyright 2021 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.cluster.localhost import LocalhostCluster
from tests.ducktape_mock import test_context, session_context

import logging
import pytest

from ducktape.tests.runner_client import Sender
from ducktape.tests.runner import Receiver
from ducktape.tests.event import ClientEventFactory, EventResponseFactory
from ducktape.errors import TimeoutError

import multiprocessing as mp
import os


class CheckSenderReceiver(object):
    """Checks for the ZMQ-based Sender/Receiver pair used between runner and clients."""

    def ready_response(self, client_id, port):
        # Runs in a subprocess: send a single READY event to the receiver on `port`.
        sender_event_factory = ClientEventFactory("test_1", 0, client_id)
        sender = Sender(
            server_host="localhost",
            server_port=port,
            message_supplier=sender_event_factory,
            logger=logging,
        )
        sender.send(sender_event_factory.ready())

    def check_simple_messaging(self):
        # Receiver gets a READY event from a subprocess sender and replies to it.
        s_context = session_context()
        cluster = LocalhostCluster(num_nodes=1000)
        t_context = test_context(s_context, cluster)
        client_id = "test-runner-{}-{}".format(os.getpid(), id(self))
        receiver_response_factory = EventResponseFactory()
        receiver = Receiver(5556, 5656)
        receiver.start()
        port = receiver.port
        try:
            p = mp.Process(target=self.ready_response, args=(client_id, port))
            p.start()
            event = receiver.recv(timeout=10000)
            assert event["event_type"] == ClientEventFactory.READY
            logging.info("replying to client")
            receiver.send(receiver_response_factory.ready(event, s_context, t_context, cluster))
        finally:
            p.join(timeout=2)  # Add timeout to prevent hanging if subprocess has issues
            receiver.close()

    def check_timeout(self):
        # recv(timeout=0) should raise TimeoutError immediately.
        client_id = "test-runner-{}-{}".format(os.getpid(), id(self))
        receiver = Receiver(5556, 5656)
        receiver.start()
        port = receiver.port
        try:
            p = mp.Process(target=self.ready_response, args=(client_id, port))
            p.start()
            with pytest.raises(TimeoutError):
                receiver.recv(timeout=0)
        finally:
            # Terminate the process instead of waiting for it to timeout
            # (it will keep retrying for up to 15 seconds with default timeouts)
            p.terminate()
            p.join(timeout=1)
            receiver.close()

    def check_exponential_backoff(self):
        """Test that Sender applies exponential backoff on retries."""
        import time

        # Save original values
        original_timeout = Sender.REQUEST_TIMEOUT_MS
        original_retries = Sender.NUM_RETRIES
        # Override timeout values for faster testing BEFORE creating sender
        Sender.REQUEST_TIMEOUT_MS = 100  # 100ms instead of 3000ms
        Sender.NUM_RETRIES = 3  # 3 retries instead of 5
        try:
            client_id = "test-backoff-client"
            event_factory = ClientEventFactory("test_1", 0, client_id)
            # Create sender that will timeout (no receiver listening on this port)
            sender = Sender(
                server_host="localhost",
                server_port=9999,  # Nothing listening here
                message_supplier=event_factory,
                logger=logging,
            )
            start_time = time.time()
            try:
                sender.send(event_factory.ready())
            except RuntimeError as e:
                # Expected to fail with "Unable to receive response from driver"
                assert "Unable to receive response from driver" in str(e)
            elapsed = time.time() - start_time
            # With 2x backoff: 100ms + 200ms + 400ms = 700ms total
            # Without backoff: 100ms * 3 = 300ms total
            # We expect at least 500ms to prove backoff is working
            # Use 400ms to account for timing variance
            assert elapsed > 0.4, f"Expected > 0.4s with backoff, got {elapsed:.3f}s"
            assert elapsed < 2.0, f"Expected < 2.0s, got {elapsed:.3f}s (test taking too long)"
            sender.close()
        finally:
            # Restore original values
            Sender.REQUEST_TIMEOUT_MS = original_timeout
            Sender.NUM_RETRIES = original_retries



================================================
FILE: tests/runner/fake_remote_account.py
================================================
from ducktape.cluster.consts import LINUX, WINDOWS
from ducktape.cluster.remoteaccount import RemoteAccount


class FakeRemoteAccount(RemoteAccount):
    """RemoteAccount stub: Linux OS, configurable availability, fake routable IP."""

    def __init__(self, *args, is_available=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.os = LINUX
        self.is_available = is_available

    def available(self):
        return self.is_available

    def fetch_externally_routable_ip(self, *args, **kwargs):
        return "fake ip"


class FakeWindowsRemoteAccount(FakeRemoteAccount):
    """Same as FakeRemoteAccount but reports a Windows OS."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.os = WINDOWS


def create_fake_remote_account(*args, **kwargs):
    # Factory wrapper so callers can pass a callable instead of the class.
    return FakeRemoteAccount(*args, **kwargs)



================================================
FILE: tests/runner/resources/__init__.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



================================================
FILE: tests/runner/resources/test_bad_actor.py
================================================
from ducktape.mark.resource import cluster
from ducktape.services.service import Service
from ducktape.tests.test import Test


class FakeService(Service):
    pass


class BadActorTest(Test):
    @cluster(num_nodes=2)
    def test_too_many_nodes(self):
        """
        This test should fail to allocate and should be dealt with gracefully by the framework,
        marking it as failed and moving on.
        """
        # requests 3 nodes while the @cluster annotation only declares 2
        FakeService(self.test_context, num_nodes=3)



================================================
FILE: tests/runner/resources/test_failing_tests.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from ducktape.mark import matrix

"""All tests in this module fail"""


class FailingTest(Test):
    def __init__(self, test_context):
        super(FailingTest, self).__init__(test_context)

    @cluster(num_nodes=1000)
    @matrix(x=[_ for _ in range(2)])
    def test_fail(self, x):
        print("Test %s fails!" % x)
        raise RuntimeError("This test throws an error!")



================================================
FILE: tests/runner/resources/test_fails_to_init.py
================================================
# Copyright 2016 Confluent Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class FailsToInitTest(Test):
    """All tests in this class fail to initialize due to an exception in constructor"""

    def __init__(self, test_context):
        super(FailsToInitTest, self).__init__(test_context)
        # Deliberately blow up: calling a method on None raises AttributeError,
        # so construction of this test class never succeeds.
        broken = None
        broken.split(":")

    # NOTE(review): the signature omits the matrix parameter x; this is never
    # reached because __init__ raises first — confirm that is intentional.
    @matrix(x=[_ for _ in range(2)])
    def test_nothing(self):
        self.logger.info("NOTHING")
class MemoryEater(Service):
    """Simple service that has a reference to a list with many elements."""

    def __init__(self, context):
        super(MemoryEater, self).__init__(context, 1)
        self.items = []

    def start_node(self, node):
        """Allocate the large list, making this service memory-hungry."""
        # list(range(n)) instead of an identity comprehension: same contents,
        # built at C speed without a Python-level loop (flake8-comprehensions C416).
        self.items = list(range(MEMORY_EATER_LIST_SIZE))

    def stop_node(self, node):
        """No-op: nothing runs on the node."""
        pass

    def clean_node(self, node):
        """No-op: nothing to clean up."""
        pass

    @property
    def num_nodes(self):
        # Fixed single-node service.
        return 1
_flake = False


class TestThingy(Test):
    """Fake ducktape test class used to exercise the runner."""

    @cluster(num_nodes=1000)
    def test_pi(self):
        # Returns data so the runner can record it in the results.
        return {"data": 3.14159}

    @cluster(num_nodes=1000)
    def test_delayed(self):
        time.sleep(1)

    @cluster(num_nodes=1000)
    @ignore
    def test_ignore1(self):
        pass

    @cluster(num_nodes=1000)
    @ignore(x=5)
    @parametrize(x=5)
    def test_ignore2(self, x=2):
        pass

    @cluster(num_nodes=1000)
    def test_failure(self):
        raise Exception("This failed")

    @cluster(num_nodes=1000)
    def test_flaky(self):
        # Alternates between failing and passing: the module-level _flake
        # flag is toggled on every invocation, and the pre-toggle value
        # decides the outcome (first call fails, second passes, ...).
        global _flake
        previous = _flake
        _flake = not previous
        assert previous


class ClusterTestThingy(Test):
    """Fake ducktape test class"""

    @cluster(num_nodes=10)
    def test_bad_num_nodes(self):
        pass
""" @cluster(num_nodes=5) def test_five_nodes_a(self): assert True @cluster(num_nodes=5) def test_five_nodes_b(self): assert True @cluster(num_nodes=4) def test_four_nodes(self): assert True @cluster(num_nodes=3) def test_three_nodes_asleep(self): time.sleep(3) assert True @cluster(num_nodes=3) def test_three_nodes_a(self): assert True @cluster(num_nodes=3) def test_three_nodes_b(self): assert True @cluster(num_nodes=2) def test_two_nodes_a(self): assert True @cluster(num_nodes=2) def test_two_nodes_b(self): assert True @cluster(num_nodes=1) def test_one_node_a(self): assert True @cluster(num_nodes=1) def test_one_node_b(self): assert True def test_no_cluster_annotation(self): assert True @cluster() def test_empty_cluster_annotation(self): assert True # this one is valid regardless of # whether the greedy tests are allowed or not # because it's not greedy, quite the opposite @cluster(num_nodes=0) def test_zero_nodes(self): assert True ================================================ FILE: tests/scheduler/__init__.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/scheduler/check_scheduler.py ================================================ # Copyright 2016 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
import collections

from ducktape.cluster.cluster_spec import ClusterSpec
from tests.ducktape_mock import FakeCluster
from ducktape.tests.scheduler import TestScheduler
from ducktape.services.service import Service

# Lightweight stand-in for a test context: only the fields the scheduler
# reads (test_id for identity; the expected_* fields presumably drive node
# requirements — confirm against TestScheduler).
FakeContext = collections.namedtuple("FakeContext", ["test_id", "expected_num_nodes", "expected_cluster_spec"])


class CheckScheduler(object):
    def setup_method(self, _):
        # 100-node cluster and three contexts needing 10, 50 and 100 nodes.
        self.cluster = FakeCluster(100)
        self.tc0 = FakeContext(0, expected_num_nodes=10, expected_cluster_spec=ClusterSpec.simple_linux(10))
        self.tc1 = FakeContext(1, expected_num_nodes=50, expected_cluster_spec=ClusterSpec.simple_linux(50))
        self.tc2 = FakeContext(
            2,
            expected_num_nodes=100,
            expected_cluster_spec=ClusterSpec.simple_linux(100),
        )
        self.tc_list = [
            self.tc0,
            self.tc1,
            self.tc2,
        ]

    def check_empty(self):
        """Check expected behavior of empty scheduler."""
        scheduler = TestScheduler([], self.cluster)
        assert len(scheduler) == 0
        assert scheduler.peek() is None

    def check_non_empty_cluster_too_small(self):
        """Ensure that scheduler does not return tests if the cluster does not have enough available nodes."""
        scheduler = TestScheduler(self.tc_list, self.cluster)
        assert len(scheduler) == len(self.tc_list)
        assert scheduler.peek() is not None
        # alloc all cluster nodes so none are available
        self.cluster.alloc(Service.setup_cluster_spec(num_nodes=len(self.cluster)))
        assert self.cluster.num_available_nodes() == 0
        # peeking should not yield an object
        assert scheduler.peek() is None

    def check_simple_usage(self):
        """Check usage with fully available cluster."""
        scheduler = TestScheduler(self.tc_list, self.cluster)
        # The asserts below show the scheduler yields the biggest schedulable
        # test first: test_id 2 (100 nodes), then 1 (50), then 0 (10).
        c = 2
        while len(scheduler) > 0:
            t = scheduler.peek()
            assert t.test_id == c
            # peek() does not consume: length only drops after remove().
            assert len(scheduler) == c + 1
            scheduler.remove(t)
            assert len(scheduler) == c
            c -= 1

    def check_with_changing_cluster_availability(self):
        """Modify cluster usage in between calls to next()"""
        scheduler = TestScheduler(self.tc_list, self.cluster)
        # start with 100-node cluster (configured in setup_method())
        # allocate 60 nodes; only test_id 0 (which needs 10 nodes) should be available
        nodes = self.cluster.alloc(Service.setup_cluster_spec(num_nodes=60))
        assert self.cluster.num_available_nodes() == 40
        t = scheduler.peek()
        assert t == self.tc0
        scheduler.remove(t)
        assert scheduler.peek() is None
        # return 10 nodes, so 50 are available in the cluster
        # next test from the scheduler should be test id 1 (which needs 50 nodes)
        return_nodes = nodes[:10]
        keep_nodes = nodes[10:]
        self.cluster.free(return_nodes)
        assert self.cluster.num_available_nodes() == 50
        t = scheduler.peek()
        assert t == self.tc1
        scheduler.remove(t)
        assert scheduler.peek() is None
        # return remaining nodes, so cluster is fully available
        # next test from scheduler should be test id 2 (which needs 100 nodes)
        return_nodes = keep_nodes
        self.cluster.free(return_nodes)
        assert self.cluster.num_available_nodes() == len(self.cluster)
        t = scheduler.peek()
        assert t == self.tc2
        scheduler.remove(t)
        # scheduler should become empty now
        assert len(scheduler) == 0
        assert scheduler.peek() is None

    def check_filter_unschedulable_tests(self):
        """Tests that can never fit the cluster are filtered out exactly once."""
        self.cluster = FakeCluster(49)
        # only test id 0 can be scheduled on this cluster
        scheduler = TestScheduler(self.tc_list, self.cluster)
        unschedulable = scheduler.filter_unschedulable_tests()
        assert set(unschedulable) == {self.tc1, self.tc2}
        # subsequent calls should return empty list and not modify scheduler
        assert not scheduler.filter_unschedulable_tests()
        assert scheduler.peek() == self.tc0

    def check_drain_remaining_tests(self):
        """Test that drain_remaining_tests returns all tests and empties the scheduler."""
        scheduler = TestScheduler(self.tc_list, self.cluster)
        # Initially scheduler should have all tests
        assert len(scheduler) == len(self.tc_list)
        # Drain all remaining tests
        remaining = scheduler.drain_remaining_tests()
        # Should return all tests
        assert len(remaining) == len(self.tc_list)
        assert set(remaining) == set(self.tc_list)
        # Scheduler should now be empty
        assert len(scheduler) == 0
        assert scheduler.peek() is None
        # Calling again should return empty list
        remaining_again = scheduler.drain_remaining_tests()
        assert len(remaining_again) == 0

    def check_drain_remaining_tests_partial(self):
        """Test drain_remaining_tests after some tests have been removed."""
        scheduler = TestScheduler(self.tc_list, self.cluster)
        # Remove one test
        t = scheduler.peek()
        scheduler.remove(t)
        initial_len = len(scheduler)
        # Drain remaining tests
        remaining = scheduler.drain_remaining_tests()
        # Should return only the remaining tests
        assert len(remaining) == initial_len
        assert t not in remaining
        # Scheduler should be empty
        assert len(scheduler) == 0
class DummyService(BackgroundThreadService):
    """Single node service that sleeps for self.run_time_sec seconds in a background thread."""

    def __init__(self, context, run_time_sec, exc=None):
        super(DummyService, self).__init__(context, 1)
        self.running = False
        self.run_time_sec = run_time_sec
        self._exc = exc

    def who_am_i(self, node=None):
        return "DummyService"

    def idx(self, node):
        return 1

    def allocate_nodes(self):
        # Bypass real cluster allocation; a single mock node suffices here.
        self.nodes = [MockNode()]

    def _worker(self, idx, node):
        # Optionally simulate a worker that dies immediately.
        if self._exc:
            raise self._exc
        self.running = True
        deadline = time.time() + self.run_time_sec
        while self.running:
            time.sleep(0.1)
            if time.time() > deadline:
                self.running = False
                break

    def stop_node(self, node):
        self.running = False


class CheckBackgroundThreadService(object):
    """Checks for BackgroundThreadService construction, waiting, and error propagation."""

    def setup_method(self, method):
        self.context = test_context()

    def check_service_constructor(self):
        """Check that BackgroundThreadService constructor corresponds to the base class's one."""
        expected_spec = ClusterSpec.simple_linux(10)
        svc = BackgroundThreadService(self.context, cluster_spec=expected_spec)
        assert svc.cluster_spec == expected_spec
        svc = BackgroundThreadService(self.context, num_nodes=20)
        assert svc.cluster_spec.size() == 20
        # Passing both num_nodes and cluster_spec is ambiguous and must be rejected.
        with pytest.raises(RuntimeError):
            BackgroundThreadService(self.context, num_nodes=20, cluster_spec=expected_spec)

    def check_service_timeout(self):
        """Test that wait(timeout_sec) raise a TimeoutError in approximately the expected time."""
        self.service = DummyService(self.context, float("inf"))
        self.service.start()
        started_at = time.time()
        timeout_sec = 0.1
        try:
            self.service.wait(timeout_sec=timeout_sec)
            raise Exception("Expected service to timeout.")
        except TimeoutError:
            finished_at = time.time()
            # Relative difference should be pretty small
            # within 10% should be reasonable
            elapsed = finished_at - started_at
            relative_difference = abs(timeout_sec - elapsed) / timeout_sec
            assert relative_difference < 0.1, (
                "Correctly threw timeout error, but timeout doesn't match closely with expected timeout. "
                + "(expected timeout, actual timeout): (%s, %s)" % (str(timeout_sec), str(elapsed))
            )

    def check_no_timeout(self):
        """Run an instance of DummyService with a short run_time_sec. It should stop without timing out."""
        self.service = DummyService(self.context, run_time_sec=0.1)
        self.service.start()
        self.service.wait(timeout_sec=0.5)

    def check_wait_node(self):
        # The worker never finishes on its own, so a short wait must fail...
        self.service = DummyService(self.context, run_time_sec=float("inf"))
        self.service.start()
        node = self.service.nodes[0]
        assert not self.service.wait_node(node, timeout_sec=0.1)
        # ...but succeeds once the node has been told to stop.
        self.service.stop_node(node)
        assert self.service.wait_node(node)

    def check_wait_node_no_start(self):
        # Waiting on a node whose worker was never started returns immediately.
        self.service = DummyService(self.context, run_time_sec=float("inf"))
        node = self.service.nodes[0]
        assert self.service.wait_node(node)

    def check_background_exception(self):
        # A worker that raises should surface the error from both wait() and stop().
        self.service = DummyService(self.context, float("inf"), Exception("failure"))
        self.service.start()
        with pytest.raises(Exception):
            self.service.wait(timeout_sec=1)
        with pytest.raises(Exception):
            self.service.stop(timeout_sec=1)
        assert hasattr(self.service, "errors")
def create_mock_node():
    """Build a mock node whose account records SSH invocations.

    The account exposes ``ssh`` (returning None), a hostname, and an
    externally routable IP — the attributes the JVM-logging checks touch.
    """
    account = Mock()
    account.ssh = Mock(return_value=None)
    account.hostname = "mock-host"
    account.externally_routable_ip = "127.0.0.1"
    node = Mock()
    node.account = account
    return node
    def check_enable_for_service(self):
        """Check that JVM logging can be enabled for a service."""
        service = JavaService(self.context, 1)
        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)
        # Verify logs dict was updated
        assert hasattr(service, "logs")
        assert "jvm_gc_log" in service.logs
        assert "jvm_stdout_stderr" in service.logs
        assert "jvm_heap_dump" in service.logs
        # Verify helper methods were added
        assert hasattr(service, "JVM_LOG_DIR")
        assert hasattr(service, "jvm_options")
        assert hasattr(service, "setup_jvm_logging")
        assert hasattr(service, "clean_jvm_logs")

    def check_jvm_options_format(self):
        """Check that JVM options string is properly formatted."""
        jvm_opts = self.jvm_logger._get_jvm_options()
        # Should start with -Xlog:disable to prevent console pollution
        assert jvm_opts.startswith("-Xlog:disable")
        # Should contain key logging options
        assert "gc*:file=" in jvm_opts
        assert "HeapDumpOnOutOfMemoryError" in jvm_opts
        assert "safepoint=info" in jvm_opts
        assert "class+load=info" in jvm_opts
        assert "jit+compilation=info" in jvm_opts
        assert "NativeMemoryTracking=summary" in jvm_opts

    def check_ssh_wrapping(self):
        """Check that all SSH methods are wrapped to inject JDK_JAVA_OPTIONS."""
        # Create a mock node first
        mock_node = create_mock_node()
        # Add ssh_capture and ssh_output methods to mock
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        # Enable JVM logging for a service
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        # Manually call the wrapped start_node with our mock node
        # This simulates what happens during service.start()
        service.start_node(mock_node)
        # Verify all SSH methods were wrapped: the wrapper stashes the
        # originals under original_ssh* attributes on the account.
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")
        # The current methods should be callables (the wrapper functions)
        assert callable(mock_node.account.ssh)
        assert callable(mock_node.account.ssh_capture)
        assert callable(mock_node.account.ssh_output)
        assert service.start_called

    def check_ssh_methods_inject_options(self):
        """Check that wrapped SSH methods actually inject JDK_JAVA_OPTIONS."""
        service = JavaService(self.context, 1)
        node = service.nodes[0]
        # Track what commands are actually executed
        executed_commands = []

        def track_ssh(cmd, allow_fail=False):
            executed_commands.append(("ssh", cmd))
            return 0

        def track_ssh_capture(cmd, allow_fail=False, callback=None, combine_stderr=True, timeout_sec=None):
            executed_commands.append(("ssh_capture", cmd))
            return iter([])

        def track_ssh_output(cmd, allow_fail=False, combine_stderr=True, timeout_sec=None):
            executed_commands.append(("ssh_output", cmd))
            return ""

        # Set the tracking functions as the original methods
        node.account.ssh = track_ssh
        node.account.ssh_capture = track_ssh_capture
        node.account.ssh_output = track_ssh_output
        # Enable JVM logging - this wraps start_node
        self.jvm_logger.enable_for_service(service)
        # Start node - this wraps the SSH methods
        service.start_node(node)
        # Clear the setup commands (from mkdir, etc)
        executed_commands.clear()
        # Now execute commands through the wrapped methods
        node.account.ssh("java -version")
        node.account.ssh_capture("java -jar app.jar")
        node.account.ssh_output("java -cp test.jar Main")
        # Verify JDK_JAVA_OPTIONS was injected in all commands
        assert len(executed_commands) == 3, f"Expected 3 commands, got {len(executed_commands)}"
        for method, cmd in executed_commands:
            assert "JDK_JAVA_OPTIONS=" in cmd, f"{method} didn't inject JDK_JAVA_OPTIONS: {cmd}"
            assert "-Xlog:disable" in cmd, f"{method} didn't include JVM options: {cmd}"
            # Verify it uses env command (more portable)
            assert cmd.startswith("env JDK_JAVA_OPTIONS="), f"{method} doesn't use env command: {cmd}"
            # Verify it appends to existing (${JDK_JAVA_OPTIONS:-})
            assert "${JDK_JAVA_OPTIONS:-}" in cmd, f"{method} doesn't preserve existing options: {cmd}"
            # Verify no extra quotes around the options (shlex.quote would add them)
            assert "'-Xlog:disable" not in cmd, f"{method} has unwanted quotes around options: {cmd}"

    def check_preserves_existing_jvm_options(self):
        """Check that existing JDK_JAVA_OPTIONS are preserved and appended to."""
        service = JavaService(self.context, 1)
        node = service.nodes[0]
        # Track what commands are executed
        executed_commands = []

        def track_ssh(cmd, allow_fail=False):
            executed_commands.append(cmd)
            return 0

        node.account.ssh = track_ssh
        node.account.ssh_capture = Mock(return_value=iter([]))
        node.account.ssh_output = Mock(return_value="")
        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)
        # Start node
        service.start_node(node)
        executed_commands.clear()
        # Execute a command - the wrapper should preserve any existing JDK_JAVA_OPTIONS
        node.account.ssh("java -version")
        # Verify the command structure
        assert len(executed_commands) == 1
        cmd = executed_commands[0]
        # Should use env command
        assert cmd.startswith("env JDK_JAVA_OPTIONS=")
        # Should preserve existing options with ${JDK_JAVA_OPTIONS:-}
        assert "${JDK_JAVA_OPTIONS:-}" in cmd
        # Should append our JVM logging options
        assert "-Xlog:disable" in cmd

    def check_ssh_wrap_idempotent(self):
        """Check that SSH wrapping is idempotent (handles restarts)."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)
        # Get the wrapped start_node method
        wrapped_start_node = service.start_node
        # Call it twice with the same node
        wrapped_start_node(mock_node)
        # Verify wrapping happened
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")
        # Call again (simulating restart)
        wrapped_start_node(mock_node)
        # SSH should still have the original_* attributes (idempotent)
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")

    def check_clean_node_behavior(self):
        """Check that clean_node properly cleans up and restores SSH methods."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        # Enable JVM logging
        self.jvm_logger.enable_for_service(service)
        # Get the wrapped methods
        wrapped_start_node = service.start_node
        wrapped_clean_node = service.clean_node
        # Start node to wrap SSH methods
        wrapped_start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")
        assert hasattr(mock_node.account, "original_ssh_capture")
        assert hasattr(mock_node.account, "original_ssh_output")
        # Clean node to restore SSH methods
        wrapped_clean_node(mock_node)
        # Verify SSH methods were restored (the original_* stashes are gone)
        assert not hasattr(mock_node.account, "original_ssh")
        assert not hasattr(mock_node.account, "original_ssh_capture")
        assert not hasattr(mock_node.account, "original_ssh_output")

    def check_log_paths(self):
        """Check that log paths are correctly configured."""
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        log_dir = self.jvm_logger.log_dir
        # Verify log paths
        assert service.logs["jvm_gc_log"]["path"] == os.path.join(log_dir, "gc.log")
        assert service.logs["jvm_stdout_stderr"]["path"] == os.path.join(log_dir, "jvm.log")
        assert service.logs["jvm_heap_dump"]["path"] == os.path.join(log_dir, "heap_dump.hprof")
        # Verify collection flags: heap dumps are opt-in, the rest collect by default
        assert service.logs["jvm_gc_log"]["collect_default"] is True
        assert service.logs["jvm_stdout_stderr"]["collect_default"] is True
        assert service.logs["jvm_heap_dump"]["collect_default"] is False

    def check_kwargs_preserved(self):
        """Check that start_node and clean_node preserve kwargs after wrapping."""
        service = JavaService(self.context, 1)
        # Replace the node with a mock node
        mock_node = create_mock_node()
        service.nodes = [mock_node]
        self.jvm_logger.enable_for_service(service)
        # This should not raise an error even with extra kwargs
        service.start(timeout_sec=30, clean=False)
        assert service.start_called

    def check_setup_failure_doesnt_break_wrapping(self):
        """Check that if _setup_on_node fails, the error propagates correctly."""
        from ducktape.cluster.remoteaccount import RemoteCommandError
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")

        # Make mkdir fail
        def failing_ssh(cmd, allow_fail=False):
            if "mkdir" in cmd:
                if not allow_fail:
                    raise RemoteCommandError(mock_node.account, cmd, 1, "Permission denied")
                return 1
            return 0

        mock_node.account.ssh = failing_ssh
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        # start_node should fail during setup
        try:
            service.start_node(mock_node)
            assert False, "Expected RemoteCommandError"
        except RemoteCommandError as e:
            assert "Permission denied" in str(e)

    def check_wrapped_ssh_failure_propagates(self):
        """Check that SSH failures are properly propagated through the wrapper."""
        from ducktape.cluster.remoteaccount import RemoteCommandError
        mock_node = create_mock_node()
        # Track calls
        ssh_calls = []

        def tracking_ssh(cmd, allow_fail=False):
            ssh_calls.append((cmd, allow_fail))
            # Fail on "test-java" commands (not the real java from JavaService.start_node)
            if "test-java" in cmd and not allow_fail:
                raise RemoteCommandError(mock_node.account, cmd, 127, "java: command not found")
            return 0

        mock_node.account.ssh = tracking_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        service.start_node(mock_node)
        ssh_calls.clear()
        # Command that should fail
        try:
            mock_node.account.ssh("test-java -version")
            assert False, "Expected RemoteCommandError"
        except RemoteCommandError as e:
            assert "java: command not found" in str(e)
        # Command with allow_fail=True should not raise
        result = mock_node.account.ssh("test-java -version", allow_fail=True)
        assert result == 0  # Our mock returns 0 when allow_fail=True

    def check_wrapped_ssh_with_allow_fail(self):
        """Check that allow_fail parameter works correctly through the wrapper."""
        mock_node = create_mock_node()
        # Track calls and their allow_fail parameter
        ssh_calls = []

        def tracking_ssh(cmd, allow_fail=False):
            ssh_calls.append((cmd, allow_fail))
            return 1 if "fail" in cmd else 0

        mock_node.account.ssh = tracking_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        service.start_node(mock_node)
        ssh_calls.clear()
        # Test with allow_fail=False (default)
        mock_node.account.ssh("echo success")
        assert ssh_calls[-1][1] is False
        # Test with allow_fail=True
        mock_node.account.ssh("echo fail", allow_fail=True)
        assert ssh_calls[-1][1] is True

    def check_cleanup_failure_still_restores_ssh(self):
        """Check that even if cleanup fails, SSH methods are still restored."""
        from ducktape.cluster.remoteaccount import RemoteCommandError
        mock_node = create_mock_node()
        cleanup_called = []

        def tracking_ssh(cmd, allow_fail=False):
            if "rm -rf" in cmd:
                cleanup_called.append(True)
                if not allow_fail:
                    raise RemoteCommandError(mock_node.account, cmd, 1, "rm failed")
            return 0

        original_ssh = tracking_ssh
        mock_node.account.ssh = original_ssh
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        # Start node
        service.start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")
        # Clean node - cleanup uses allow_fail=True, so it shouldn't raise
        service.clean_node(mock_node)
        # Verify cleanup was attempted
        assert len(cleanup_called) > 0
        # Verify SSH was restored despite cleanup "failure"
        assert not hasattr(mock_node.account, "original_ssh")

    def check_double_cleanup_is_safe(self):
        """Check that calling clean_node twice doesn't cause errors."""
        mock_node = create_mock_node()
        mock_node.account.ssh_capture = Mock(return_value=iter([]))
        mock_node.account.ssh_output = Mock(return_value="")
        service = JavaService(self.context, 1)
        self.jvm_logger.enable_for_service(service)
        # Start node
        service.start_node(mock_node)
        assert hasattr(mock_node.account, "original_ssh")
        # Clean node first time
        service.clean_node(mock_node)
        assert not hasattr(mock_node.account, "original_ssh")
        # Clean node second time - should not raise
        service.clean_node(mock_node)
        assert not hasattr(mock_node.account, "original_ssh")
from ducktape.services.service import Service from tests.ducktape_mock import test_context, session_context from ducktape.cluster.localhost import LocalhostCluster class DummyService(Service): """Simple fake service class.""" def __init__(self, context, num_nodes): super(DummyService, self).__init__(context, num_nodes) self.started_count = 0 self.cleaned_count = 0 self.stopped_count = 0 self.started_kwargs = {} self.cleaned_kwargs = {} self.stopped_kwargs = {} def idx(self, node): return 1 def start_node(self, node, **kwargs): super(DummyService, self).start_node(node, **kwargs) self.started_count += 1 self.started_kwargs = kwargs def clean_node(self, node, **kwargs): super(DummyService, self).clean_node(node, **kwargs) self.cleaned_count += 1 self.cleaned_kwargs = kwargs def stop_node(self, node, **kwargs): super(DummyService, self).stop_node(node, **kwargs) self.stopped_count += 1 self.stopped_kwargs = kwargs class DifferentDummyService(Service): """Another fake service class.""" def __init__(self, context, num_nodes): super(DifferentDummyService, self).__init__(context, num_nodes) def idx(self, node): return 1 class CheckAllocateFree(object): def setup_method(self, _): self.cluster = LocalhostCluster() self.session_context = session_context() self.context = test_context(self.session_context, cluster=self.cluster) def check_allocate_free(self): """Check that allocating and freeing nodes works. 
This regression test catches the error with Service.free() introduced in v0.3.3 and fixed in v0.3.4 """ # Node allocation takes place during service instantiation initial_cluster_size = len(self.cluster) self.service = DummyService(self.context, 10) assert self.cluster.num_available_nodes() == initial_cluster_size - 10 self.service.free() assert self.cluster.num_available_nodes() == initial_cluster_size def check_order(self): """Check expected behavior with service._order method""" self.dummy0 = DummyService(self.context, 4) self.diffDummy0 = DifferentDummyService(self.context, 100) self.dummy1 = DummyService(self.context, 1) self.diffDummy1 = DifferentDummyService(self.context, 2) self.diffDummy2 = DifferentDummyService(self.context, 5) assert self.dummy0._order == 0 assert self.dummy1._order == 1 assert self.diffDummy0._order == 0 assert self.diffDummy1._order == 1 assert self.diffDummy2._order == 2 class CheckStartStop(object): def setup_method(self, _): self.cluster = LocalhostCluster() self.session_context = session_context() self.context = test_context(self.session_context, cluster=self.cluster) def check_start_stop_clean(self): """ Checks that start, stop, and clean invoke the expected per-node calls, and that start also runs stop and clean """ service = DummyService(self.context, 2) service.start() assert service.started_count == 2 assert service.stopped_count == 2 assert service.cleaned_count == 2 service.stop() assert service.stopped_count == 4 service.start(clean=False) assert service.started_count == 4 assert service.stopped_count == 6 assert service.cleaned_count == 2 service.stop() assert service.stopped_count == 8 service.clean() assert service.cleaned_count == 4 def check_kwargs_support(self): """Check that start, stop, and clean, and their per-node versions, can accept keyword arguments""" service = DummyService(self.context, 2) kwargs = {"foo": "bar"} service.start(**kwargs) assert service.started_kwargs == kwargs service.stop(**kwargs) assert 
service.stopped_kwargs == kwargs service.clean(**kwargs) assert service.cleaned_kwargs == kwargs ================================================ FILE: tests/templates/__init__.py ================================================ ================================================ FILE: tests/templates/service/__init__.py ================================================ ================================================ FILE: tests/templates/service/check_render.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ducktape.services.service import Service from tests.ducktape_mock import test_context class CheckTemplateRenderingService(object): """ Tests rendering of templates, using input from a Service """ def new_instance(self): return TemplateRenderingService() def check_simple(self): self.new_instance().render_simple() def check_single_variable(self): self.new_instance().render_single_variable() def check_overload(self): self.new_instance().render_overload() def check_class_template(self): self.new_instance().render_class_template() def check_file_template(self): self.new_instance().render_file_template() class TemplateRenderingService(Service): NO_VARIABLE = "fixed content" SIMPLE_VARIABLE = "Hello {{a_field}}!" 
OVERLOAD_VARIABLES = "{{normal}} {{overload}}" CLASS_CONSTANT_TEMPLATE = "{{ CLASS_CONSTANT }}" CLASS_CONSTANT = "constant" def __init__(self): super(TemplateRenderingService, self).__init__(test_context(), 1) def render_simple(self): """Test that a trivial template works""" assert self.render_template(self.NO_VARIABLE) == self.NO_VARIABLE def render_single_variable(self): """Test that fields on the object are available to templates""" self.a_field = "world" assert self.render_template(self.SIMPLE_VARIABLE) == "Hello world!" def render_overload(self): self.normal = "normal" assert self.render_template(self.OVERLOAD_VARIABLES, overload="overloaded") == "normal overloaded" def render_class_template(self): assert self.render_template(self.CLASS_CONSTANT_TEMPLATE) == self.CLASS_CONSTANT self.CLASS_CONSTANT = "instance override" assert self.render_template(self.CLASS_CONSTANT_TEMPLATE) == "instance override" def render_file_template(self): self.a_field = "world" assert "Sample world" == self.render("sample") ================================================ FILE: tests/templates/service/templates/sample ================================================ Sample {{a_field}} ================================================ FILE: tests/templates/test/__init__.py ================================================ ================================================ FILE: tests/templates/test/check_render.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from ducktape.tests import Test from ducktape.tests import TestContext from ducktape.template import TemplateRenderer from tests.ducktape_mock import session_context import os import tempfile class CheckTemplateRenderingTest(object): """ Minimal test to verify template rendering functionality """ def setup(self): dir = tempfile.gettempdir() session_ctx = session_context(results_dir=dir) test_ctx = TestContext(session_context=session_ctx) return TemplateRenderingTest(test_ctx) def check_string_template(self): test = self.setup() test.other = "goodbye" result = test.render_template("Hello {{name}} and {{other}}", name="World") assert "Hello World and goodbye" == result def check_file_template(self): test = self.setup() test.name = "world" assert "Sample world" == test.render("sample") class CheckPackageSearchPath(object): """ Simple check on extracting package and search path based on module name. 
""" def check_package_search_path(self): package, path = TemplateRenderer._package_search_path("a.b.c") # search path should be b/templates since templates is by convention a sibling of c assert package == "a" and path == os.path.join("b", "templates") package, path = TemplateRenderer._package_search_path("hi") assert package == "hi" and path == "templates" package, path = TemplateRenderer._package_search_path("") assert package == "" and path == "templates" def check_get_ctx(self): class A(TemplateRenderer): x = "xxx" class B(A): y = "yyy" b = B() b.instance = "b instance" ctx_a = A()._get_ctx() assert ctx_a["x"] == "xxx" assert "yyy" not in ctx_a assert "instance" not in ctx_a ctx_b = b._get_ctx() assert ctx_b["x"] == "xxx" assert ctx_b["y"] == "yyy" assert ctx_b["instance"] == "b instance" class TemplateRenderingTest(Test): pass ================================================ FILE: tests/templates/test/templates/sample ================================================ Sample {{name}} ================================================ FILE: tests/test_utils.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import socket def find_available_port(min_port=8000, max_port=9000): """Return first available port in the range [min_port, max_port], inclusive. 
Note that this actually isn't a 100% reliable way of getting a port, but it's probably good enough -- once you close a socket you cannot be sure of its availability. This was the source of a bunch of issues in the Apache Kafka unit tests. """ for p in range(min_port, max_port + 1): try: s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(("localhost", p)) s.close() return p except socket.error: pass raise Exception("No available port found in range [%d, %d]" % (min_port, max_port)) ================================================ FILE: tests/tests/__init__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/tests/check_session.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.tests.session import generate_results_dir, SessionContext from ducktape.cluster.localhost import LocalhostCluster import os.path import pickle import shutil import tempfile class CheckGenerateResultsDir(object): def setup_method(self, _): self.tempdir = tempfile.mkdtemp() def check_generate_results_root(self): """Check the generated results directory has the specified path as its root""" results_root = os.path.abspath(tempfile.mkdtemp()) results_dir = generate_results_dir(results_root, "my_session_id") assert results_dir.find(results_root) == 0 def check_pickleable(self): """Check that session_context object is pickleable This is necessary so that SessionContext objects can be shared between processes when using python multiprocessing module. """ kwargs = { "session_id": "hello-123", "results_dir": self.tempdir, "cluster": LocalhostCluster(), "globals": {}, } session_context = SessionContext(**kwargs) pickle.dumps(session_context) def teardown_method(self, _): if os.path.exists(self.tempdir): shutil.rmtree(self.tempdir) ================================================ FILE: tests/tests/check_test.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import logging import os import random import shutil import sys import tempfile from ducktape.cluster.cluster_spec import ClusterSpec from ducktape.tests.test import Test, _compress_cmd, in_dir, in_temp_dir from ducktape.tests.test_context import _escape_pathname, TestContext from tests import ducktape_mock class DummyTest(Test): """class description""" def test_class_description(self): pass def test_function_description(self): """function description""" pass class DummyTestNoDescription(Test): def test_this(self): pass class CheckLifecycle(object): def check_test_context_double_close(self): context = TestContext( session_context=ducktape_mock.session_context(), cls=DummyTest, function=DummyTest.test_function_description, ) context.close() context.close() assert not hasattr(context, "services") def check_cluster_property(self): exp_cluster = ClusterSpec.simple_linux(5) tc = TestContext( session_context=ducktape_mock.session_context(), cluster=exp_cluster, cls=DummyTest, function=DummyTest.test_function_description, ) test_obj = tc.cls(tc) assert test_obj.cluster == exp_cluster class CheckEscapePathname(object): def check_illegal_path(self): path = "\\/.a=2, b=x/y/z" assert _escape_pathname(path) == "a=2.b=x.y.z" def check_negative(self): # it's better if negative numbers are preserved path = "x= -2, y=-50" assert _escape_pathname(path) == "x=-2.y=-50" def check_many_dots(self): path = "..a.....b.c...d." 
assert _escape_pathname(path) == "a.b.c.d" class CheckDescription(object): """Check that pulling a description from a test works as expected.""" def check_from_function(self): """If the function has a docstring, the description should come from the function""" context = TestContext( session_context=ducktape_mock.session_context(), cls=DummyTest, function=DummyTest.test_function_description, ) assert context.description == "function description" def check_from_class(self): """If the test method has no docstring, description should come from the class docstring""" context = TestContext( session_context=ducktape_mock.session_context(), cls=DummyTest, function=DummyTest.test_class_description, ) assert context.description == "class description" def check_no_description(self): """If nobody has a docstring, there shouldn't be an error, and description should be empty string""" context = TestContext( session_context=ducktape_mock.session_context(), cls=DummyTestNoDescription, function=DummyTestNoDescription.test_this, ) assert context.description == "" class CheckCompressCmd(object): """Check expected behavior of compress command used before collecting service logs""" def setup_method(self, _): self.tempdir = tempfile.mkdtemp() def _make_random_file(self, dir, num_chars=10000): """Populate filename with random characters.""" filename = os.path.join(dir, "f-%d" % random.randint(1, 2**63 - 1)) content = "".join([random.choice("0123456789abcdefghijklmnopqrstuvwxyz\n") for _ in range(num_chars)]) with open(filename, "w") as f: f.writelines(content) return filename def _make_files(self, dir, num_files=10): """Populate dir with several files with random characters.""" for i in range(num_files): self._make_random_file(dir) def _validate_compressed(self, uncompressed_path): if uncompressed_path.endswith(os.path.sep): uncompressed_path = uncompressed_path[: -len(os.path.sep)] compressed_path = uncompressed_path + ".tgz" # verify original file is replaced by filename.tgz assert 
os.path.exists(compressed_path) assert not os.path.exists(uncompressed_path) # verify that uncompressing gets us back the original with in_dir(self.tempdir): os.system("tar xzf %s" % (compressed_path)) assert os.path.exists(uncompressed_path) def check_compress_service_logs_swallow_error(self): """Try compressing a non-existent service log, and check that it logs a message without throwing an error.""" from tests.ducktape_mock import session_context tc = TestContext( session_context=session_context(), module=sys.modules[DummyTestNoDescription.__module__], cls=DummyTestNoDescription, function=DummyTestNoDescription.test_this, ) tc._logger = logging.getLogger(__name__) temp_log_file = tempfile.NamedTemporaryFile(delete=False).name try: tmp_log_handler = logging.FileHandler(temp_log_file) tc._logger.addHandler(tmp_log_handler) test_obj = tc.cls(tc) # Expect an error to be triggered but swallowed test_obj.compress_service_logs(node=None, service=None, node_logs=["hi"]) tmp_log_handler.close() with open(temp_log_file, "r") as f: s = f.read() assert s.find("Error compressing log hi") >= 0 finally: if os.path.exists(temp_log_file): os.remove(temp_log_file) def check_abs_path_file(self): """Check compress command on an absolute path to a file""" with in_temp_dir() as dir1: filename = self._make_random_file(self.tempdir) abspath_filename = os.path.abspath(filename) # since we're using absolute path to file, we should be able to run the compress command from anywhere with in_temp_dir() as dir2: assert dir1 != dir2 os.system(_compress_cmd(abspath_filename)) self._validate_compressed(abspath_filename) def check_relative_path_file(self): """Check compress command on a relative path to a file""" with in_temp_dir(): filename = self._make_random_file(self.tempdir) with in_dir(self.tempdir): filename = os.path.basename(filename) assert len(filename.split(os.path.sep)) == 1 # compress it! 
os.system(_compress_cmd(filename)) self._validate_compressed(filename) def check_abs_path_dir(self): """Validate compress command with absolute path to a directory""" dirname = tempfile.mkdtemp(dir=self.tempdir) dirname = os.path.abspath(dirname) self._make_files(dirname) # compress it! if not dirname.endswith(os.path.sep): # extra check - ensure all of this works with trailing '/' dirname += os.path.sep os.system(_compress_cmd(dirname)) self._validate_compressed(dirname) def check_relative_path_dir(self): """Validate tarball compression of a directory""" dirname = tempfile.mkdtemp(dir=self.tempdir) self._make_files(dirname) dirname = os.path.basename(dirname) assert len(dirname.split(os.path.sep)) == 1 # compress it! with in_dir(self.tempdir): os.system(_compress_cmd(dirname)) self._validate_compressed(dirname) def teardown_method(self, _): if os.path.exists(self.tempdir): shutil.rmtree(self.tempdir) ================================================ FILE: tests/tests/check_test_context.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from ducktape.tests.test import Test from ducktape.services.service import Service from ducktape.mark import parametrize from ducktape.mark.mark_expander import MarkedFunctionExpander from tests.ducktape_mock import session_context from mock import MagicMock class CheckTestContext(object): def check_copy_constructor(self): """Regression test against a bug introduced in 0.3.7 The TestContext copy constructor was copying the ServiceRegistry object by reference. As a result, services registering themselves with one test context would be registered with the copied context as well, resulting in the length of the service registry to grow additively from test to test. This problem cropped up in particular with parametrized tests. """ expander = MarkedFunctionExpander( session_context=session_context(), cls=DummyTest, function=DummyTest.test_me, cluster=MagicMock(), ) ctx_list = expander.expand() for ctx in ctx_list: # Constructing an instance of the test class causes a service to be registered with the test context ctx.cls(ctx) # Ensure that each context.services object is a unique reference assert len(set(id(ctx.services) for ctx in ctx_list)) == len(ctx_list) class DummyTest(Test): def __init__(self, test_context): super(DummyTest, self).__init__(test_context) self.service = DummyService(test_context) @parametrize(x=1) @parametrize(x=2) def test_me(self): pass class DummyService(Service): def __init__(self, context): super(DummyService, self).__init__(context, 1) ================================================ FILE: tests/utils/__init__.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: tests/utils/check_util.py ================================================ # Copyright 2015 Confluent Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import pytest from ducktape.errors import TimeoutError from ducktape.utils.util import wait_until import time class CheckUtils(object): def check_wait_until(self): """Check normal wait until behavior""" start = time.time() wait_until(lambda: time.time() > start + 0.5, timeout_sec=2, backoff_sec=0.1) def check_wait_until_timeout(self): """Check that timeout throws exception""" start = time.time() with pytest.raises(TimeoutError, match="Hello world"): wait_until( lambda: time.time() > start + 5, timeout_sec=0.5, backoff_sec=0.1, err_msg="Hello world", ) def check_wait_until_timeout_callable_msg(self): """Check that timeout throws exception and the error message is generated via a callable""" start = time.time() with pytest.raises(TimeoutError, match="Hello world"): wait_until( lambda: time.time() > start + 5, timeout_sec=0.5, backoff_sec=0.1, err_msg=lambda: "Hello world", ) def check_wait_until_with_exception(self): def condition_that_raises(): raise Exception("OG") with pytest.raises(TimeoutError) as exc_info: wait_until( condition_that_raises, timeout_sec=0.5, backoff_sec=0.1, err_msg="Hello world", retry_on_exc=True, ) exc_chain = exc_info.getrepr(chain=True).chain # 2 exceptions in the chain - OG and Hello world assert len(exc_chain) == 2 # each element of a chain is a tuple of traceback, "crash" (which is the short message) # and optionally descr, which is None for the bottom of the chain # and "The exception above is ..." for all the others # We're interested in crash, since that one is a one-liner with the actual exception message. 
og_message = str(exc_chain[0][1]) hello_message = str(exc_chain[1][1]) assert "OG" in og_message assert "Hello world" in hello_message def check_wait_until_with_exception_on_first_step_only_but_still_fails(self): start = time.time() def condition_that_raises_before_3(): if time.time() < start + 0.3: raise Exception("OG") else: return False with pytest.raises(TimeoutError) as exc_info: wait_until( condition_that_raises_before_3, timeout_sec=0.5, backoff_sec=0.1, err_msg="Hello world", retry_on_exc=True, ) exc_chain = exc_info.getrepr(chain=True).chain assert len(exc_chain) == 1 hello_message = str(exc_chain[0][1]) assert "Hello world" in hello_message def check_wait_until_exception_which_succeeds_eventually(self): start = time.time() def condition_that_raises_before_3_but_then_succeeds(): if time.time() < start + 0.3: raise Exception("OG") else: return True wait_until( condition_that_raises_before_3_but_then_succeeds, timeout_sec=0.5, backoff_sec=0.1, err_msg="Hello world", retry_on_exc=True, ) def check_wait_until_breaks_early_on_exception(self): def condition_that_raises(): raise Exception("OG") with pytest.raises(Exception, match="OG") as exc_info: wait_until( condition_that_raises, timeout_sec=0.5, backoff_sec=0.1, err_msg="Hello world", ) assert "Hello world" not in str(exc_info) ================================================ FILE: tox.ini ================================================ [tox] envlist = py38, py39, py310, py311, py312, py313, cover, style, docs [testenv] # Consolidate all deps here instead of separately in test/style/cover so we # have a single env to work with, which makes debugging easier (like which env?). # Not as clean but easier to work with during development, which is better. 
deps = -r requirements.txt -r requirements-test.txt install_command = pip install -U {packages} recreate = False skipsdist = True usedevelop = True setenv = PIP_PROCESS_DEPENDENCY_LINKS=1 PIP_DEFAULT_TIMEOUT=60 ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future envdir = {package_root}/.virtualenvs/ducktape_{envname} commands = pytest {env:PYTESTARGS:} {posargs} [testenv:py38] envdir = {package_root}/.virtualenvs/ducktape-py38 [testenv:py39] envdir = {package_root}/.virtualenvs/ducktape-py39 [testenv:py310] envdir = {package_root}/.virtualenvs/ducktape-py310 [testenv:py311] envdir = {package_root}/.virtualenvs/ducktape-py311 [testenv:py312] envdir = {package_root}/.virtualenvs/ducktape-py312 [testenv:py313] envdir = {package_root}/.virtualenvs/ducktape-py313 [testenv:style] basepython = python3.13 envdir = {package_root}/.virtualenvs/ducktape commands = ruff format --check ruff check [testenv:ruff] basepython = python3.13 envdir = {package_root}/.virtualenvs/ducktape commands = ruff {posargs} [testenv:cover] basepython = python3.13 envdir = {package_root}/.virtualenvs/ducktape commands = pytest {env:PYTESTARGS:} --cov ducktape --cov-report=xml --cov-report=html --cov-report=term --cov-report=annotate:textcov \ --cov-fail-under=70 [testenv:docs] basepython = python3.13 deps = -r {toxinidir}/docs/requirements.txt changedir = {toxinidir}/docs commands = sphinx-build -M {env:SPHINX_BUILDER:html} . _build {posargs}