Repository: stitchfix/flotilla-os Branch: master Commit: d16becadf8cb Files: 224 Total size: 1.1 MB Directory structure: gitextract_unzartjt/ ├── .circleci/ │ └── config.yml ├── .github/ │ ├── CODEOWNERS │ └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .migrations/ │ ├── V20200123054713__initial_table_create.sql │ ├── V20200123054714__add_spark_extension.sql │ ├── V20200205133700__executable.sql │ ├── V20200206115000__template.sql │ ├── V20200210154600__template_refactor.sql │ ├── V20200211160100__task_col_fix.sql │ ├── V20200211161900__template_indicies.sql │ ├── V20200212101900__template.sql │ ├── V20200213101400__task_indexes.sql │ ├── V20200213125200__rename_default_payload.sql │ ├── V20200225125200__add_limits.sql │ ├── V20200325125200__add_attempts.sql │ ├── V20200325125201__add_spawned.sql │ ├── V20200625125201__add_run_exceptions.sql │ ├── V20210083054714__metrics_uri.sql │ ├── V20210427125201__add_active_deadline_seconds.sql │ ├── V20210807125201__drop_index_container_name.sql │ ├── V20211007125201__add_description.sql │ ├── V20220907125201__add_idempotence.sql │ ├── V20220907125202__add_arch.sql │ ├── V20221215125203__add_labels.sql │ ├── V20230718115000__add_ephemeral_storage.sql │ ├── V20231013191711__add_requires_docker.sql │ ├── V20231122141100__add_target_cluster.sql │ ├── V20240205132100__add_service_account.sql │ ├── V20250122141100__add_cluster_routing.sql │ └── dev.conf ├── ARA_METRICS_COMPARISON.md ├── Dockerfile ├── LICENSE ├── README.html ├── README.md ├── ara-impact-report-staging.md ├── ara-impact-report.md ├── clients/ │ ├── cluster/ │ │ ├── cluster.go │ │ └── eks_cluster_client.go │ ├── httpclient/ │ │ ├── client.go │ │ └── client_test.go │ ├── logs/ │ │ ├── eks_cloudwatch_logs_client.go │ │ ├── eks_s3_logs_client.go │ │ └── logs.go │ ├── metrics/ │ │ ├── datadog_metrics_client.go │ │ └── metrics.go │ └── middleware/ │ └── client.go ├── conf/ │ └── config.yml ├── config/ │ ├── config.go │ └── config_test.go ├── datadog-ara-dashboard-api.json ├── docker-compose.yml ├── docs/ │ ├── ara-command-hash-bug-report.md │ ├── ara-command-hash-fix-locations.md │ ├── ara-command-hash-history.md │ ├── ara-instrumentation.md │ └── ara.md ├── exceptions/ │ └── errors.go ├── execution/ │ ├── adapter/ │ │ ├── eks_adapter.go │ │ └── eks_adapter_test.go │ └── engine/ │ ├── dcm.go │ ├── eks_engine.go │ ├── emr_engine.go │ └── engine.go ├── flotilla/ │ ├── app.go │ ├── endpoints.go │ ├── endpoints_test.go │ └── router.go ├── go.mod ├── go.sum ├── log/ │ ├── event.go │ ├── event_test.go │ ├── logger.go │ └── logger_test.go ├── main.go ├── queue/ │ ├── manager.go │ ├── sqs_manager.go │ └── sqs_manager_test.go ├── services/ │ ├── definition.go │ ├── definition_test.go │ ├── execution.go │ ├── execution_test.go │ ├── logs.go │ ├── logs_test.go │ ├── template.go │ └── worker.go ├── state/ │ ├── manager.go │ ├── models.go │ ├── models_test.go │ ├── pg_queries.go │ ├── pg_state_manager.go │ └── pg_state_manager_test.go ├── testutils/ │ └── mocks.go ├── tracing/ │ └── tracing.go ├── ui/ │ ├── .gitignore │ ├── .prettierrc │ ├── Dockerfile │ ├── README.md │ ├── package.json │ ├── public/ │ │ └── index.html │ ├── src/ │ │ ├── api.ts │ │ ├── components/ │ │ │ ├── ARASwitch.tsx │ │ │ ├── App.tsx │ │ │ ├── Attribute.tsx │ │ │ ├── AutoscrollSwitch.tsx │ │ │ ├── BaseTaskForm.tsx │ │ │ ├── CloudtrailRecords.tsx │ │ │ ├── ClusterSelect.tsx │ │ │ ├── CreateTaskForm.tsx │ │ │ ├── DeleteTaskButton.tsx │ │ │ ├── Duration.tsx │ │ │ ├── EngineTag.tsx │ │ │ ├── EnvFieldArray.tsx │ │ │ ├── EnvList.tsx │ │ │ 
├── EnvQueryFilter.tsx │ │ │ ├── ErrorCallout.tsx │ │ │ ├── FieldError.tsx │ │ │ ├── GenericMultiSelect.tsx │ │ │ ├── GroupNameSelect.tsx │ │ │ ├── ISO8601AttributeValue.tsx │ │ │ ├── ListFiltersDropdown.tsx │ │ │ ├── ListRequest.tsx │ │ │ ├── Log.tsx │ │ │ ├── LogProcessor.tsx │ │ │ ├── LogRequesterCloudWatchLogs.tsx │ │ │ ├── LogRequesterS3.tsx │ │ │ ├── LogVirtualized.tsx │ │ │ ├── LogVirtualizedRow.tsx │ │ │ ├── LogVirtualizedSearch.tsx │ │ │ ├── Navigation.tsx │ │ │ ├── NodeLifecycleSelect.tsx │ │ │ ├── Pagination.tsx │ │ │ ├── QueryParams.tsx │ │ │ ├── Request.tsx │ │ │ ├── ResourceUsageValue.tsx │ │ │ ├── Run.tsx │ │ │ ├── RunAttributes.tsx │ │ │ ├── RunDebugAttributes.tsx │ │ │ ├── RunEvents.tsx │ │ │ ├── RunSidebar.tsx │ │ │ ├── RunStatusSelect.tsx │ │ │ ├── RunTag.tsx │ │ │ ├── Runs.tsx │ │ │ ├── SettingsButton.tsx │ │ │ ├── SortableTh.tsx │ │ │ ├── StopRunButton.tsx │ │ │ ├── Table.tsx │ │ │ ├── TagsSelect.tsx │ │ │ ├── Task.tsx │ │ │ ├── TaskDetails.tsx │ │ │ ├── TaskExecutionForm.tsx │ │ │ ├── TaskRuns.tsx │ │ │ ├── Tasks.tsx │ │ │ ├── Template.tsx │ │ │ ├── TemplateDetails.tsx │ │ │ ├── TemplateExecutionForm.tsx │ │ │ ├── TemplateHistoryTable.tsx │ │ │ ├── TemplateRunForm.tsx │ │ │ ├── Templates.tsx │ │ │ ├── Toaster.ts │ │ │ ├── Toggler.tsx │ │ │ ├── UpdateTaskForm.tsx │ │ │ ├── ViewHeader.tsx │ │ │ └── __tests__/ │ │ │ ├── BaseTaskForm.spec.tsx │ │ │ ├── ClusterSelect.spec.tsx │ │ │ ├── CreateTaskForm.spec.tsx │ │ │ ├── DeleteTaskButton.spec.tsx │ │ │ ├── EnvFieldArray.spec.tsx │ │ │ ├── GroupNameSelect.spec.tsx │ │ │ ├── ListRequest.spec.tsx │ │ │ ├── LogProcessor.spec.tsx │ │ │ ├── LogVirtualized.spec.tsx │ │ │ ├── LogVirtualizedSearch.spec.tsx │ │ │ ├── Pagination.spec.tsx │ │ │ ├── QueryParams.spec.tsx │ │ │ ├── Request.spec.tsx │ │ │ ├── Run.spec.tsx │ │ │ ├── Runs.spec.tsx │ │ │ ├── StopRunButton.spec.tsx │ │ │ ├── TaskRuns.spec.tsx │ │ │ ├── Tasks.spec.tsx │ │ │ └── UpdateTaskForm.spec.tsx │ │ ├── constants.ts │ │ ├── helpers/ │ │ │ ├── FlotillaClient.ts │ │ │ ├── __mocks__/ │ │ │ │ └── FlotillaClient.ts │ │ │ ├── __tests__/ │ │ │ │ ├── FlotillaClient.spec.ts │ │ │ │ ├── getInitialValuesForTaskRun.spec.ts │ │ │ │ └── pageToOffsetLimit.spec.ts │ │ │ ├── calculateDuration.ts │ │ │ ├── constructDefaultObjectFromJsonSchema.ts │ │ │ ├── getEnhancedRunStatus.ts │ │ │ ├── getInitialValuesForExecutionForm.ts │ │ │ ├── getOwnerIdRunTagFromCookies.ts │ │ │ ├── pageToOffsetLimit.ts │ │ │ ├── runFormHelpers.ts │ │ │ ├── selectHelpers.ts │ │ │ ├── taskFormHelpers.ts │ │ │ └── testHelpers.ts │ │ ├── index.css │ │ ├── index.tsx │ │ ├── localstorage.ts │ │ ├── react-app-env.d.ts │ │ ├── setupTests.js │ │ ├── state/ │ │ │ ├── runView.ts │ │ │ ├── settings.ts │ │ │ └── store.ts │ │ ├── types.ts │ │ └── workers/ │ │ ├── index.ts │ │ └── log.worker.ts │ └── tsconfig.json ├── utils/ │ ├── dd_tracing.go │ └── utils.go └── worker/ ├── events_worker.go ├── events_worker_test.go ├── retry_worker.go ├── retry_worker_test.go ├── status_worker.go ├── status_worker_test.go ├── submit_worker.go ├── submit_worker_test.go ├── worker.go ├── worker_manager.go └── worker_test.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .circleci/config.yml ================================================ --- version: 2 jobs: build: working_directory: ~/go/src/github.com/stitchfix/flotilla-os docker: - image: cimg/go:1.24 environment: FLOTILLA_MODE: test DATABASE_URL: 
postgresql://flotilla:flotilla@localhost/flotilla?sslmode=disable READONLY_DATABASE_URL: postgresql://flotilla:flotilla@localhost/flotilla?sslmode=disable PG_USER: flotilla PG_HOST: 127.0.0.1 GO111MODULE: "on" - image: cimg/postgres:17.4 environment: POSTGRES_USER: flotilla POSTGRES_DB: flotilla POSTGRES_PASSWORD: flotilla steps: - checkout - run: name: Installing Flyway command: curl -sL https://repo1.maven.org/maven2/org/flywaydb/flyway-commandline/6.5.7/flyway-commandline-6.5.7-linux-x64.tar.gz | tar xz && sudo ln -s "$(pwd)/flyway-6.5.7/flyway" /usr/local/bin/flyway - run: name: Waiting for Postgres to be ready command: dockerize -wait tcp://localhost:5432 -timeout 5m - run: name: Set Up DB command: | pwd ls -a flyway baseline -configFiles=./.migrations/dev.conf \ -user=flotilla \ -password=flotilla flyway migrate -configFiles=./.migrations/dev.conf \ -locations=filesystem:./.migrations/ \ -user=flotilla \ -password=flotilla - run: go get ./... - run: go test -v ./... ================================================ FILE: .github/CODEOWNERS ================================================ # This file uses the GitHub CODEOWNERS convention to assign PR reviewers: # https://help.github.com/articles/about-codeowners/ * @stitchfix/dev-platform ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## PROBLEM ## SOLUTION ================================================ FILE: .gitignore ================================================ # Binaries for programs and plugins *.exe *.dll *.so *.dylib # Test binary, build with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 .glide/ vendor/** !vendor/vendor.json .idea *.iml flotilla-os # gh-pages and ui_branch node_modules .cache/ .DS_Store yarn-error.log ui/build/ .env ================================================ FILE: .migrations/V20200123054713__initial_table_create.sql ================================================ -- -- Definitions -- CREATE TABLE IF NOT EXISTS task_def ( definition_id character varying PRIMARY KEY, alias character varying, image character varying NOT NULL, group_name character varying NOT NULL, memory integer, cpu integer, gpu integer, command text, env jsonb, -- Refactor these "user" character varying, arn character varying, container_name character varying NOT NULL, task_type character varying, privileged boolean, adaptive_resource_allocation boolean, -- Refactor these CONSTRAINT task_def_alias UNIQUE(alias) ); CREATE TABLE IF NOT EXISTS task_def_ports ( task_def_id character varying NOT NULL REFERENCES task_def(definition_id), port integer NOT NULL, CONSTRAINT task_def_ports_pkey PRIMARY KEY(task_def_id, port) ); CREATE INDEX IF NOT EXISTS ix_task_def_alias ON task_def(alias); CREATE INDEX IF NOT EXISTS ix_task_def_group_name ON task_def(group_name); CREATE INDEX IF NOT EXISTS ix_task_def_image ON task_def(image); CREATE INDEX IF NOT EXISTS ix_task_def_env ON task_def USING gin (env jsonb_path_ops); -- -- Runs -- CREATE TABLE IF NOT EXISTS task ( run_id character varying NOT NULL PRIMARY KEY, definition_id character varying REFERENCES task_def(definition_id), alias character varying, image character varying, cluster_name character varying, exit_code integer, exit_reason character varying, status character varying, queued_at timestamp with time zone, started_at timestamp with time zone, finished_at timestamp with 
time zone, instance_id character varying, instance_dns_name character varying, group_name character varying, env jsonb, -- Refactor these -- task_arn character varying, docker_id character varying, "user" character varying, task_type character varying, -- Refactor these -- command text, command_hash text, memory integer, cpu integer, gpu integer, ephemeral_storage integer, node_lifecycle text, engine character varying DEFAULT 'eks' NOT NULL, container_name text, pod_name text, namespace text, max_cpu_used integer, max_memory_used integer, pod_events jsonb, cloudtrail_notifications jsonb ); CREATE INDEX IF NOT EXISTS ix_task_definition_id ON task(definition_id); CREATE INDEX IF NOT EXISTS ix_task_cluster_name ON task(cluster_name); CREATE INDEX IF NOT EXISTS ix_task_status ON task(status); CREATE INDEX IF NOT EXISTS ix_task_group_name ON task(group_name); CREATE INDEX IF NOT EXISTS ix_task_env ON task USING gin (env jsonb_path_ops); CREATE INDEX IF NOT EXISTS ix_task_definition_id ON task(definition_id); CREATE INDEX IF NOT EXISTS ix_task_task_arn ON task(task_arn); CREATE INDEX IF NOT EXISTS ix_task_definition_id_started_at_desc ON task(definition_id, started_at DESC NULLS LAST); CREATE INDEX IF NOT EXISTS ix_task_definition_id_started_at_desc_engine ON task(definition_id, started_at DESC NULLS LAST, engine); CREATE INDEX IF NOT EXISTS ix_finished_at_status_cluster_name ON task USING btree (cluster_name, status, finished_at DESC); CREATE INDEX IF NOT EXISTS ix_task_definition_id_started_at_asc ON task USING btree (definition_id, started_at); CREATE INDEX IF NOT EXISTS ix_task_pod_events ON task USING gin (pod_events jsonb_path_ops); CREATE INDEX IF NOT EXISTS ix_task_queued_at_status_engine ON task USING btree (queued_at, status, engine); CREATE INDEX IF NOT EXISTS task_definition_id_engine_started_at_index ON task USING btree (definition_id, engine, started_at DESC); -- -- Status -- CREATE TABLE IF NOT EXISTS task_status ( status_id integer NOT NULL PRIMARY KEY, task_arn character varying, status_version integer NOT NULL, status character varying, "timestamp" timestamp with time zone DEFAULT now() ); CREATE INDEX IF NOT EXISTS ix_task_status_task_arn ON task_status(task_arn); CREATE SEQUENCE IF NOT EXISTS task_status_status_id_seq START WITH 1 INCREMENT BY 1 NO MINVALUE NO MAXVALUE CACHE 1; ALTER TABLE ONLY task_status ALTER COLUMN status_id SET DEFAULT nextval('task_status_status_id_seq'::regclass); -- -- Tags -- CREATE TABLE IF NOT EXISTS tags ( text character varying NOT NULL PRIMARY KEY ); CREATE TABLE IF NOT EXISTS task_def_tags ( tag_id character varying NOT NULL REFERENCES tags(text), task_def_id character varying NOT NULL REFERENCES task_def(definition_id) ); CREATE TABLE IF NOT EXISTS worker ( worker_type character varying, engine character varying, count_per_instance integer ); ================================================ FILE: .migrations/V20200123054714__add_spark_extension.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS spark_extension JSONB; ================================================ FILE: .migrations/V20200205133700__executable.sql ================================================ ALTER TABLE task ADD COLUMN executable_id VARCHAR, ADD COLUMN executable_type VARCHAR DEFAULT 'task_definition'; ================================================ FILE: .migrations/V20200206115000__template.sql ================================================ CREATE TABLE template ( template_id VARCHAR PRIMARY KEY, type VARCHAR NOT NULL, 
version INTEGER NOT NULL, schema JSONB NOT NULL, command_template TEXT NOT NULL, image VARCHAR NOT NULL, memory INTEGER NOT NULL, gpu INTEGER NOT NULL, cpu INTEGER NOT NULL, env JSONB, privileged BOOLEAN, adaptive_resource_allocation BOOLEAN, container_name VARCHAR NOT NULL, CONSTRAINT template_type_version UNIQUE(type, version) ); ALTER TABLE task ADD COLUMN IF NOT EXISTS executable_request_custom JSONB; ================================================ FILE: .migrations/V20200210154600__template_refactor.sql ================================================ ALTER TABLE template DROP CONSTRAINT template_type_version; ALTER TABLE template RENAME COLUMN type to template_name; ALTER TABLE template ADD CONSTRAINT template_name_version UNIQUE(template_name, version); ================================================ FILE: .migrations/V20200211160100__task_col_fix.sql ================================================ ALTER TABLE task RENAME COLUMN executable_request_custom to execution_request_custom; ================================================ FILE: .migrations/V20200211161900__template_indicies.sql ================================================ CREATE INDEX IF NOT EXISTS ix_template_id ON template(template_id); CREATE INDEX IF NOT EXISTS ix_template_name ON template(template_name); ================================================ FILE: .migrations/V20200212101900__template.sql ================================================ ALTER TABLE template ADD COLUMN default_payload JSONB; ALTER TABLE template ADD COLUMN avatar_uri VARCHAR; ================================================ FILE: .migrations/V20200213101400__task_indexes.sql ================================================ CREATE INDEX IF NOT EXISTS ix_task_executable_id ON task(executable_id); CREATE INDEX IF NOT EXISTS ix_task_executable_id_started_at_desc ON task(executable_id, started_at DESC NULLS LAST); CREATE INDEX IF NOT EXISTS ix_task_executable_id_started_at_desc_engine ON task(executable_id, started_at DESC NULLS LAST, engine); ================================================ FILE: .migrations/V20200213125200__rename_default_payload.sql ================================================ ALTER TABLE template RENAME COLUMN default_payload to defaults; ================================================ FILE: .migrations/V20200225125200__add_limits.sql ================================================ ALTER TABLE task ADD COLUMN memory_limit integer; ALTER TABLE task ADD COLUMN cpu_limit integer; ================================================ FILE: .migrations/V20200325125200__add_attempts.sql ================================================ ALTER TABLE task ADD COLUMN attempt_count integer; ================================================ FILE: .migrations/V20200325125201__add_spawned.sql ================================================ ALTER TABLE task ADD COLUMN spawned_runs jsonb; ================================================ FILE: .migrations/V20200625125201__add_run_exceptions.sql ================================================ ALTER TABLE task ADD COLUMN run_exceptions jsonb; ================================================ FILE: .migrations/V20210083054714__metrics_uri.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS metrics_uri varchar; ================================================ FILE: .migrations/V20210427125201__add_active_deadline_seconds.sql ================================================ ALTER TABLE task ADD COLUMN active_deadline_seconds integer; 
================================================ FILE: .migrations/V20210807125201__drop_index_container_name.sql ================================================ alter table task_def alter column container_name drop not null; ================================================ FILE: .migrations/V20211007125201__add_description.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS description varchar; ================================================ FILE: .migrations/V20220907125201__add_idempotence.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS idempotence_key varchar; ================================================ FILE: .migrations/V20220907125202__add_arch.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS arch varchar; ================================================ FILE: .migrations/V20221215125203__add_labels.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS labels jsonb; ================================================ FILE: .migrations/V20230718115000__add_ephemeral_storage.sql ================================================ ALTER TABLE task_def ADD COLUMN IF NOT EXISTS ephemeral_storage INTEGER; ALTER TABLE task ADD COLUMN IF NOT EXISTS ephemeral_storage INTEGER; ================================================ FILE: .migrations/V20231013191711__add_requires_docker.sql ================================================ ALTER TABLE task_def ADD COLUMN IF NOT EXISTS requires_docker BOOLEAN DEFAULT(false); ALTER TABLE task ADD COLUMN IF NOT EXISTS requires_docker BOOLEAN DEFAULT(false); ================================================ FILE: .migrations/V20231122141100__add_target_cluster.sql ================================================ ALTER TABLE task_def ADD COLUMN IF NOT EXISTS target_cluster VARCHAR; ================================================ FILE: .migrations/V20240205132100__add_service_account.sql ================================================ ALTER TABLE task ADD COLUMN IF NOT EXISTS service_account VARCHAR; ================================================ FILE: .migrations/V20250122141100__add_cluster_routing.sql ================================================ DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'cluster_status') THEN CREATE TYPE cluster_status AS ENUM ('active', 'maintenance', 'offline'); END IF; END$$; CREATE TABLE IF NOT EXISTS cluster_state ( id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, name VARCHAR NOT NULL, cluster_version VARCHAR NOT NULL DEFAULT '', status cluster_status NOT NULL DEFAULT 'active', status_reason VARCHAR, status_since TIMESTAMP WITH TIME ZONE DEFAULT NOW(), capabilities VARCHAR[] NOT NULL DEFAULT '{}', allowed_tiers VARCHAR[] NOT NULL DEFAULT '{}', region VARCHAR NOT NULL, updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), namespace VARCHAR NOT NULL DEFAULT '', emr_virtual_cluster VARCHAR NOT NULL DEFAULT '', spark_server_uri VARCHAR NOT NULL DEFAULT '' ); CREATE INDEX IF NOT EXISTS ix_cluster_state_name ON cluster_state(name); CREATE INDEX IF NOT EXISTS ix_cluster_state_status ON cluster_state(status); DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='task' AND column_name='tier') THEN ALTER TABLE task ADD COLUMN tier TEXT; END IF; END$$; ================================================ FILE: .migrations/dev.conf ================================================ 
flyway.url=jdbc:postgresql://127.0.0.1:5432/flotilla flyway.user=flotilla flyway.password=flotilla flyway.cleanDisabled=true flyway.group=true flyway.locations=filesystem:.migrations ================================================ FILE: ARA_METRICS_COMPARISON.md ================================================ # ARA Metrics Implementation Comparison Comparing `ez/ara-metrics` (HEAD) vs `ez/ara-metrics-alt` ## Overview Both implementations add instrumentation to track Auto Resource Adjustment (ARA) behavior to identify over-provisioning patterns, particularly the ~300GB memory limit issue. However, they differ significantly in approach, metrics design, logging strategy, and code structure. --- ## Similarities ### Shared Goals - Track ARA resource adjustments - Detect when jobs hit maximum resource limits (especially 350GB memory) - Enable monitoring to identify over-provisioning patterns - Instrument `adaptiveResources()` function - Add structured logging for debugging ### Common Changes - Both modify `execution/adapter/eks_adapter.go` - Both add new metric constants to `clients/metrics/metrics.go` - Both track default resources before ARA applies adjustments - Both detect and report when max bounds are hit - Both use structured key-value logging format --- ## Key Differences ### 1. **Metric Naming Convention** **HEAD (`ez/ara-metrics`):** - Uses hierarchical dot notation: `engine.eks.ara.*` - Examples: `engine.eks.ara.estimation_attempted`, `engine.eks.ara.memory_increase` - Consistent with existing codebase pattern (`engine.eks.execute`, etc.) **Alt (`ez/ara-metrics-alt`):** - Uses flat namespace: `ara.*` - Examples: `ara.resource_adjustment`, `ara.memory_increase_ratio` - Shorter, more concise names **Winner:** HEAD - Consistent with existing naming conventions --- ### 2. **Metrics Coverage** **HEAD (10 metrics):** ```go // Estimation tracking EngineEKSARAEstimationAttempted // Counter EngineEKSARAEstimationSucceeded // Counter EngineEKSARAEstimationFailed // Counter // Resource tracking EngineEKSARAMaxResourceHit // Counter (tagged with resource:memory or resource:cpu) EngineEKSARAMemoryIncrease // Distribution EngineEKSARACPUIncrease // Distribution EngineEKSARADefaultMemory // Distribution EngineEKSARAARAMemory // Distribution EngineEKSARADefaultCPU // Distribution EngineEKSARAARACPU // Distribution ``` **Alt (8 metrics):** ```go // Core tracking ARAResourceAdjustment // Counter (when ARA triggers) ARANoHistoricalData // Counter (when no data found) // Ratio tracking ARAMemoryIncreaseRatio // Histogram ARACPUIncreaseRatio // Histogram // Limit detection ARAHitMaxMemory // Counter ARAHitMaxCPU // Counter // Final distributions ARAFinalMemoryMB // Histogram ARAFinalCPUMillicores // Histogram ``` **Comparison:** - **HEAD:** More granular - separates estimation attempts from successes/failures - **ALT:** More focused - tracks key ratios and final states - **HEAD:** Tracks resource increases as absolute values - **ALT:** Tracks increases as ratios (better for understanding relative growth) **Winner:** Tie - Both approaches have merit. HEAD provides more granularity; ALT provides better insight into relative growth. --- ### 3. 
**Logging Strategy** **HEAD:** - Logging only occurs when max resource bounds are hit - Uses stored logger instance (field on `eksAdapter`) - Separate `emitARAMetrics()` method for structured logging - Logs once per max-bound-hit event - Fields: `run_id`, `definition_id`, `executable_id`, `command`, default/final resources, max hit flags **ALT:** - **Multiple logging points:** 1. When ARA triggers adjustments (INFO level) 2. When max limits hit (WARN level) 3. In `state/pg_state_manager.go` for historical data lookups (success/no data/error) - Uses inline `flotillaLog.NewLogger(nil, nil)` - creates new logger instances - More verbose logging at each step - Detailed structured fields including ratios, overage amounts, cluster name - Separate logs for historical data lookup success/failure **Winner:** ALT - More comprehensive logging provides better debugging capability --- ### 4. **Logger Management** **HEAD:** ```go type eksAdapter struct { logger flotillaLog.Logger // Stored as field } func NewEKSAdapter(logger flotillaLog.Logger) (EKSAdapter, error) { adapter := eksAdapter{logger: logger} return &adapter, nil } // Usage in HEAD if a.logger == nil { return } a.logger.Log(logFields...) ``` **ALT:** ```go // No logger field stored // Creates new logger instances inline _ = flotillaLog.NewLogger(nil, nil).Log(...) ``` **Comparison:** - **HEAD:** Dependency injection pattern - logger passed via constructor, stored as field - **ALT:** Creates new logger instances inline (less efficient, harder to test) - **HEAD:** Requires updating `eks_engine.go` to pass logger (which it does) - **ALT:** No changes needed to constructor/initialization **Winner:** HEAD - Better design pattern (dependency injection), more testable --- ### 5. **Tagging Strategy** **HEAD:** - No tags used on metrics (empty `[]string{}`) - Simpler, avoids cardinality concerns - May limit filtering/grouping capabilities in DataDog **ALT:** - Uses cluster tags: `[]string{fmt.Sprintf("cluster:%s", run.ClusterName)}` - Explicitly documented as "low-cardinality tags to avoid excessive volume" - Enables per-cluster analysis **Winner:** ALT - Tags enable better filtering and per-cluster analysis --- ### 6. **Metric Types** **HEAD:** - Uses `Distribution()` for all numeric metrics - Uses `Increment()` for counters **ALT:** - Uses `Histogram()` for ratios and final values - Uses `Increment()` for counters **Comparison:** - DataDog treats Histogram and Distribution similarly for most use cases - Both approaches are valid **Winner:** Tie - No significant difference --- ### 7. **Code Structure** **HEAD:** - Cleaner separation: detects max hits after bounds checking - Uses helper method `emitARAMetrics()` to centralize logging logic - More modular: logging logic separate from bounds checking **ALT:** - Metrics/logging embedded directly in `checkResourceBounds()` - Requires passing additional parameters (`run`, `executable`, `defaultCPU`, etc.) to `checkResourceBounds()` - More invasive changes to function signatures - Inline logging at multiple points **Winner:** HEAD - Better code organization, less invasive changes --- ### 8. **State Manager Instrumentation** **HEAD:** - No changes to `state/pg_state_manager.go` - Only instruments the adapter layer **ALT:** - **Adds instrumentation to `state/pg_state_manager.go`** - Logs when historical data is found/not found/errors occur - Provides visibility into the data lookup layer - Helps debug issues with historical data queries **Winner:** ALT - Provides better end-to-end visibility --- ### 9. 
**Test Coverage** **HEAD:** - **Comprehensive test suite** (524 lines in `eks_adapter_test.go`) - Tests multiple scenarios: - ARA enabled with successful estimation - GPU jobs (skip ARA) - Estimation failures - Max resource bounds hitting - ARA disabled - Logger nil handling - Mock implementations for logger and state manager **ALT:** - No test files included **Winner:** HEAD - Significantly better test coverage --- ### 10. **Documentation** **HEAD:** - Commit message describes changes - No separate documentation file **ALT:** - **Comprehensive 317-line documentation** (`docs/ara-instrumentation.md`) - Includes: - Overview of ARA algorithm - Historical context of ARA implementation - Detailed explanation of metrics - DataDog query examples - Alert recommendations - Investigation workflow - Future improvement suggestions - Extremely helpful for operators and future developers **Winner:** ALT - Outstanding documentation --- ### 11. **Detection Logic** **HEAD:** ```go // After bounds checking cpuRequestBeforeBounds := cpuRequest memRequestBeforeBounds := memRequest cpuRequest, memRequest = a.checkResourceBounds(...) // Then detect hits if memRequestBeforeBounds > maxMem { maxMemHit = true // emit metrics/logs } ``` **ALT:** ```go // Inside checkResourceBounds() if mem > maxMem { // Emit metrics and logs immediately _ = metrics.Increment(metrics.ARAHitMaxMemory, ...) // ... logging ... mem = maxMem } ``` **Comparison:** - **HEAD:** Two-step process - check bounds, then detect if hit - **ALT:** Single-step - detect and log during bounds checking - **ALT:** More straightforward, less code **Winner:** ALT - Simpler, more direct approach --- ### 12. **ARA Trigger Detection** **HEAD:** - No explicit "ARA triggered" detection - Only tracks estimation attempts/success/failure - Doesn't distinguish between "ARA found same values" vs "ARA actually changed resources" **ALT:** ```go araTriggered := (estimatedResources.Cpu != cpuRequest || estimatedResources.Memory != memRequest) ``` - Explicitly detects when ARA actually changes resources - Only logs/increments metrics when resources actually change - More precise tracking **Winner:** ALT - More accurate tracking of actual ARA adjustments --- ## Best-of-Breed Recommendation **The ideal solution would combine:** ### From HEAD: 1. ✅ **Metric naming convention** - Use `engine.eks.ara.*` pattern 2. ✅ **Logger as dependency** - Store logger as field, inject via constructor 3. ✅ **Code organization** - Separate `emitARAMetrics()` method 4. ✅ **Test coverage** - Include comprehensive test suite 5. ✅ **Granular metrics** - Track estimation attempts/success/failure separately ### From ALT: 1. ✅ **Logging strategy** - Log when ARA triggers AND when limits hit 2. ✅ **State manager instrumentation** - Add logging in `pg_state_manager.go` 3. ✅ **Documentation** - Include comprehensive docs file 4. ✅ **Tagging** - Use cluster tags for filtering 5. ✅ **Ratio metrics** - Track ratios instead of/in addition to absolute increases 6. ✅
**ARA trigger detection** - Explicitly detect when ARA actually changes resources ### Hybrid Approach: ```go // Metrics (combine both approaches) - engine.eks.ara.estimation_attempted // Counter - engine.eks.ara.estimation_succeeded // Counter - engine.eks.ara.estimation_failed // Counter - engine.eks.ara.resource_adjustment // Counter (only when changed) - engine.eks.ara.memory_increase_ratio // Histogram (ALT's approach) - engine.eks.ara.cpu_increase_ratio // Histogram - engine.eks.ara.hit_max_memory // Counter - engine.eks.ara.hit_max_cpu // Counter - engine.eks.ara.final_memory_mb // Histogram - engine.eks.ara.final_cpu_millicores // Histogram // Logging (ALT's comprehensive approach) - Log when ARA triggers (INFO) - Log when limits hit (WARN) - Log in state manager for historical lookups // Code structure (HEAD's approach) - Store logger as field - Separate emitARAMetrics() method - Use cluster tags on metrics // Documentation - Include ALT's comprehensive docs // Tests - Include HEAD's comprehensive test suite ``` --- ## Verdict **Best Overall:** Neither solution is perfect alone. **ALT is closer to production-ready** due to: - Comprehensive documentation - Better logging strategy - End-to-end instrumentation - Ratio-based metrics (easier to understand) **But HEAD has better engineering practices:** - Dependency injection - Test coverage - Code organization **Recommendation:** Start with ALT as the base, then incorporate HEAD's improvements: 1. Store logger as field (HEAD) 2. Add test suite (HEAD) 3. Optionally adjust metric names to match HEAD's convention 4. Keep ALT's logging and documentation This hybrid would be the best-of-breed solution. ================================================ FILE: Dockerfile ================================================ FROM golang:latest RUN mkdir -p /go/src/github.com/stitchfix/flotilla-os ADD . /go/src/github.com/stitchfix/flotilla-os WORKDIR /go/src/github.com/stitchfix/flotilla-os RUN go install github.com/stitchfix/flotilla-os ENTRYPOINT /go/bin/flotilla-os /go/src/github.com/stitchfix/flotilla-os/conf ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.html ================================================ README

flotilla-os

Circle CI Go Report Card

Introduction

Flotilla is a self-service framework that dramatically simplifies the process of defining and executing containerized jobs. This means you get to focus on the work you’re doing rather than how to do it.

Once deployed, Flotilla allows you to:

  • Define containerized jobs by specifying exactly what command to run, what image to run that command in, and what resources that command needs to run
  • Run any previously defined job and access its logs, status, and exit code
  • View and edit job definitions with a flexible UI
  • Run jobs and view execution history and logs within the UI
  • Use the complete REST API for definitions, jobs, and logs to build your own custom workflows

Philosophy

Flotilla is strongly opinionated about self-service for data science.

The core assumption is that you understand your work the best. Therefore, it is you who should own your work from end-to-end. In other words, you shouldn’t need to be a “production engineer” to run your jobs or to access logs in case of problems. Do this with Flotilla.

Quick Start

Minimal Assumptions

Before we can do anything, there are some prerequisites that must be met.

  1. Flotilla by default uses AWS. You must have an AWS account and AWS keys available. This quick-start guide uses AWS keys exported into the environment variables: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY. If you’ve got credentials configured on your machine you can set these easily by running:
export AWS_ACCESS_KEY_ID=$(aws --profile default configure get aws_access_key_id)
export AWS_SECRET_ACCESS_KEY=$(aws --profile default configure get aws_secret_access_key)

Note: When running on AWS EC2 instances or ECS it’s better practice to use an IAM profile for AWS credentials. (A short Go sketch for verifying these environment variables follows this list.)

  2. The AWS credentials must be authorized. The permissions required are described in the following policy document for AWS (you can attach it to a user or a role depending on how you manage users in AWS).
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "flotilla-policy",
            "Effect": "Allow",
            "Action": [
                "sqs:DeleteMessage",
                "sqs:ListQueues",
                "sqs:GetQueueUrl",
                "logs:DescribeLogGroups",
                "sqs:ReceiveMessage",
                "events:PutRule",
                "sqs:SendMessage",
                "sqs:GetQueueAttributes",
                "ecs:DescribeClusters",
                "ecs:DeregisterTaskDefinition",
                "events:ListRuleNamesByTarget",
                "ecs:RunTask",
                "ecs:RegisterTaskDefinition",
                "sqs:CreateQueue",
                "ecs:ListContainerInstances",
                "ecs:DescribeContainerInstances",
                "ecs:ListClusters",
                "ecs:StopTask",
                "logs:CreateLogGroup",
                "logs:PutRetentionPolicy",
                "logs:GetLogEvents",
                "events:PutTargets",
                "sqs:SetQueueAttributes"
            ],
            "Resource": "*"
        }
    ]
}
  3. Flotilla uses AWS’s Elastic Container Service (ECS) and Elastic Kubernetes Service (EKS) as the execution backend. However, Flotilla does not manage ECS/EKS clusters. There must be at least one cluster defined in AWS’s ECS/EKS service available to you and it must have at least one task node. Most typically this is the default cluster and examples will assume this going forward. You can easily set up a cluster by following the instructions here: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/launch_container_instance.html

https://docs.aws.amazon.com/eks/latest/userguide/what-is-eks.html
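
If you want to sanity-check that the keys exported in step 1 are visible to code, here is a minimal Go sketch using the aws-sdk-go credentials package (the package choice is illustrative; any AWS client that reads environment credentials behaves the same way):

package main

import (
    "fmt"
    "log"

    "github.com/aws/aws-sdk-go/aws/credentials"
)

func main() {
    // NewEnvCredentials reads AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY,
    // the same variables exported in step 1 above.
    creds := credentials.NewEnvCredentials()
    val, err := creds.Get()
    if err != nil {
        log.Fatalf("credentials not found in environment: %v", err)
    }
    fmt.Printf("found access key %s via %s\n", val.AccessKeyID, val.ProviderName)
}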

Starting the service locally

You can run the service locally (which will still leverage AWS resources) using the docker-compose tool. From inside the repo run:

docker-compose up -d

You’ll notice it builds the code in the repo and starts the flotilla service as well as the default postgres backend.

Verify the service is running by making a GET request with cURL to the url http://localhost:5000/api/v6/task (or by navigating to it in a web browser). A 200 OK response means things are good!

Note: The default configuration under conf and in the docker-compose.yml assume port 3000. You’ll have to change it in both places if you don’t want to use port 3000 locally.
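
The same check can be scripted. A minimal Go sketch, assuming the service is listening at the address used in the example above:

package main

import (
    "fmt"
    "log"
    "net/http"
)

func main() {
    // Hit the task listing endpoint; a 200 means the service
    // (and its postgres backend) is up.
    resp, err := http.Get("http://localhost:5000/api/v6/task")
    if err != nil {
        log.Fatalf("flotilla not reachable: %v", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        log.Fatalf("unexpected status: %s", resp.Status)
    }
    fmt.Println("flotilla is up")
}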

Using the UI

Flotilla has a simple, easy-to-use UI. Here are some example images for basic usage.

Define a task with the UI

The UI allows you to quickly create new tasks.

Define Task

Launch a task with UI

You can run tasks you’ve created with the UI as well. Once you’ve run a task, the run will transition from Queued to Pending to Running before it finishes and shows Success or Failed (see Task Life Cycle). Once a task is in the Running state the logs should be visible.

  1. Launch

Run Task

  2. Queued –> Pending

Queued Task

Pending Task

  3. View logs

Running Task

Finished Task

Basic API Usage

Defining your first task

Before you can run a task you first need to define it. We’ll use the example hello world task definition. Here’s what that looks like:

hello-world.json

{
  "alias": "hello-flotilla",
  "group_name": "examples",
  "image": "ubuntu:latest",
  "memory": 512,
  "env": [
    {
      "name": "USERNAME",
      "value": "_fill_me_in_"
    }
  ],
  "command": "echo \"hello ${USERNAME}\""
}

It’s a simple task that runs in the default ubuntu image, prints your username to the logs, and exits.

Note: While you can use non-public images and images in your own registries with flotilla, credentials for accessing those images must exist on the ECS hosts. This is outside the scope of this doc. See the AWS documentation.

Let’s define it:

curl -XPOST localhost:5000/api/v6/task --data @examples/hello-world.json

You’ll notice that if you visit the initial url again http://localhost:5000/api/v6/task the newly defined definition will be in the list.
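
The same definition call can be made programmatically with nothing but Go's standard library. A minimal sketch, assuming the service address above and that examples/hello-world.json is the file shown earlier:

package main

import (
    "bytes"
    "fmt"
    "log"
    "net/http"
    "os"
)

func main() {
    // POST the hello-world.json definition, mirroring the curl command.
    body, err := os.ReadFile("examples/hello-world.json")
    if err != nil {
        log.Fatal(err)
    }
    resp, err := http.Post("http://localhost:5000/api/v6/task",
        "application/json", bytes.NewReader(body))
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    fmt.Println("define task:", resp.Status)
}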

Running your first task

This is the fun part. You’ll make a PUT request to the execution endpoint for the task you just defined and specify any environment variables.

curl -XPUT localhost:5000/api/v6/task/alias/hello-flotilla/execute -d '{
  "cluster":"default",
  "env":[
    {"name":"USERNAME","value":"yourusername"}
  ],
  "run_tags":{"owner_id":"youruser"}
}'

Note: run_tags is defined as a way for all runs to have ownership injected for visibility and is required.
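
For programmatic launches, here is the equivalent of the curl command above as a minimal Go sketch (the response field name run_id is taken from the description below):

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "strings"
)

func main() {
    // PUT to the execution endpoint; run_tags.owner_id is required.
    payload := `{
        "cluster": "default",
        "env": [{"name": "USERNAME", "value": "yourusername"}],
        "run_tags": {"owner_id": "youruser"}
    }`
    req, err := http.NewRequest(http.MethodPut,
        "http://localhost:5000/api/v6/task/alias/hello-flotilla/execute",
        strings.NewReader(payload))
    if err != nil {
        log.Fatal(err)
    }
    req.Header.Set("Content-Type", "application/json")
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    var out struct {
        RunID string `json:"run_id"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
        log.Fatal(err)
    }
    fmt.Println("run_id:", out.RunID)
}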

You’ll get a response that contains a run_id field. You can check the status of your task at http://localhost:5000/api/v6/history/<run_id>

curl -XGET localhost:5000/api/v6/history/<run_id>

{
  "instance": {
    "dns_name": "<dns-host-of-task-node>",
    "instance_id": "<instance-id-of-task-node>"
  },
  "run_id": "<run_id>",
  "definition_id": "<definition_id>",
  "alias": "hello-flotilla",
  "image": "ubuntu:latest",
  "cluster": "default",
  "status": "PENDING",
  "env": [
    {
      "name": "FLOTILLA_RUN_OWNER_ID",
      "value": "youruser"
    },
    {
      "name": "FLOTILLA_SERVER_MODE",
      "value": "dev"
    },
    {
      "name": "FLOTILLA_RUN_ID",
      "value": "<run_id>"
    },
    {
      "name": "USERNAME",
      "value": "yourusername"
    }
  ]
}

and you can get the logs for your task at http://localhost:5000/api/v6/<run_id>/logs. You will not see any logs until your task is at least in the RUNNING state.

curl -XGET localhost:5000/api/v6/<run_id>/logs

{
  "last_seen":"<last_seen_token_used_for_paging>",
  "log":"+ set -e\n+ echo 'hello yourusername'\nhello yourusername"
}
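
Because last_seen is a paging token, a log client can poll the endpoint and resume from where it left off. A minimal Go sketch; note that passing the token back as a last_seen query parameter is an assumption made for illustration:

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "net/url"
    "time"
)

type logsResponse struct {
    LastSeen string `json:"last_seen"`
    Log      string `json:"log"`
}

func main() {
    runID := "your-run-id" // placeholder for the run_id returned above
    lastSeen := ""
    for i := 0; i < 10; i++ {
        endpoint := fmt.Sprintf("http://localhost:5000/api/v6/%s/logs", runID)
        if lastSeen != "" {
            endpoint += "?last_seen=" + url.QueryEscape(lastSeen)
        }
        resp, err := http.Get(endpoint)
        if err != nil {
            log.Fatal(err)
        }
        var lr logsResponse
        err = json.NewDecoder(resp.Body).Decode(&lr)
        resp.Body.Close()
        if err != nil {
            log.Fatal(err)
        }
        if lr.Log != "" {
            fmt.Println(lr.Log)
        }
        lastSeen = lr.LastSeen
        time.Sleep(5 * time.Second)
    }
}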

Definitions and Task Life Cycle

Definitions

Name Definition
task A definition of a task that can be executed to create a run
run An instance of a task

Task Life Cycle

When executed, a task’s run goes through several transitions

  1. QUEUED - this is the first phase of a run and means the run is currently queued and waiting to be allocated to a cluster
  2. PENDING - every worker.submit_interval (defined in the config) the submit worker pulls from the queues and submits them for execution. At this point, if the cluster associated with the run has resources, the run gets allocated to the cluster and transitions to the PENDING status. For the default execution engine this stage encapsulates the process of pulling the docker image and starting the container. It can take several minutes depending on whether the image is cached and how large the image is.
  3. RUNNING - Once the run starts on a particular execution host it transitions to this stage. At this point logs should become available.
  4. STOPPED - A run enters this stage when it finishes execution. This can mean it either succeeded or failed depending on the existence of an exit_code and the value of that exit code.
  5. NEEDS_RETRY - on occasion, due to host-level characteristics (full disk, too many open files, timeouts pulling an image, etc.) the run exits with a null exit code without ever being executed. In this case the reason is analyzed to determine whether the run is retriable. If it is, the task transitions to this status, is allocated to the appropriate execution queue again, and repeats the lifecycle.

Normal Lifecycle

QUEUED –> PENDING –> RUNNING –> STOPPED

Retry Lifecycle

… –> PENDING –> STOPPED –> NEEDS_RETRY –> QUEUED –> …
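
The transitions above amount to a small state machine. The following Go sketch encodes them for illustration only; it is not the repository's actual state manager code:

package main

import "fmt"

type Status string

const (
    Queued     Status = "QUEUED"
    Pending    Status = "PENDING"
    Running    Status = "RUNNING"
    Stopped    Status = "STOPPED"
    NeedsRetry Status = "NEEDS_RETRY"
)

// transitions captures the normal and retry lifecycles described above.
var transitions = map[Status][]Status{
    Queued:     {Pending},
    Pending:    {Running, Stopped},
    Running:    {Stopped},
    Stopped:    {NeedsRetry},
    NeedsRetry: {Queued},
}

func canTransition(from, to Status) bool {
    for _, next := range transitions[from] {
        if next == to {
            return true
        }
    }
    return false
}

func main() {
    fmt.Println(canTransition(Queued, Pending)) // true
    fmt.Println(canTransition(Running, Queued)) // false
}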

Deploying

In a production deployment you’ll want multiple instances of the flotilla service running and postgres running elsewhere (e.g. Amazon RDS). In this case the most salient configuration detail is the DATABASE_URL.

Docker based deploy

The simplest way to deploy for very light usage is to avoid a reverse proxy and deploy directly with docker.

  1. Build and tag an image for flotilla using the Dockerfile provided in this repo:

    docker build -t <your repo name>/flotilla:<version tag> .
    
    2. Run this image wherever you deploy your services:

    docker run -e DATABASE_URL=<your db url> -e FLOTILLA_MODE=prod -p 3000:3000 ...<other standard docker run args>
    

    Notes:

    • Flotilla uses viper for configuration so you can override any of the default configuration under conf/ using run time environment variables passed to docker run
    • In most realistic deploys you’ll likely want to configure a reverse proxy to sit in front of the flotilla container. See the docs here

    See docker run for more details

Configuration In Detail

The variables in conf/config.yml are sensible defaults. Most should be left alone unless you’re developing flotilla itself. However, there are a few you may want to change in a production environment.

Variable Name Description
worker.retry_interval Run frequency of the retry worker
worker.submit_interval Poll frequency of the submit worker
worker.status_interval Poll frequency of the status update worker
http.server.read_timeout_seconds Sets read timeout in seconds for the http server
http.server.write_timeout_seconds Sets the write timeout in seconds for the http server
http.server.listen_address The port for the http server to listen on
owner_id_var Which environment variable containing ownership information to inject into the runtime of jobs
enabled_workers This variable is a list of the workers that run. Use this to control what workers run when using a multi-container deployment strategy. Valid list items include (retry, submit, and status)
log.namespace For the default ECS execution engine setup this is the log-group to use
log.retention_days For the default ECS execution engine this is the number of days to retain logs
log.driver.options.* For the default ECS execution engine these map to the awslogs driver options here
queue.namespace For the default ECS execution engine this is the prefix used for SQS to determine which queues to pull job launch messages from
queue.retention_seconds For the default ECS execution engine this configures how long a message will stay in an SQS queue without being consumed
queue.process_time For the default ECS execution engine configures the length of time allowed to process a job launch message
queue.status For the default ECS execution engine this configures which SQS queue to route ECS cluster status updates to
queue.status_rule For the default ECS execution engine this configures the name of the rule for routing ECS cluster status updates
metrics.dogstatsd.address Statsd metrics host in Datadog format
metrics.dogstatsd.namespace Namespace for the metrics - for example flotilla.
redis_address Redis host for caching and locks
redis_db Redis db to be used - numeric
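
Because configuration is read through viper, any of the keys above can be overridden from the environment at run time. A minimal sketch of the mechanism, assuming the common viper pattern of mapping dots in keys to underscores in variable names (flotilla's actual loading lives in config/config.go):

package main

import (
    "fmt"
    "strings"

    "github.com/spf13/viper"
)

func main() {
    v := viper.New()
    // worker.retry_interval becomes the env var WORKER_RETRY_INTERVAL.
    v.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
    v.AutomaticEnv()
    v.SetDefault("worker.retry_interval", "30s")
    // An exported WORKER_RETRY_INTERVAL now overrides the default.
    fmt.Println("retry interval:", v.GetString("worker.retry_interval"))
}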

Development

API Documentation

See API

Building

Currently Flotilla is built using Go 1.9.3 and uses govendor to manage dependencies.

govendor sync && go build
================================================ FILE: README.md ================================================ # flotilla-os [![Circle CI](https://circleci.com/gh/stitchfix/flotilla-os.svg?style=shield)](https://circleci.com/gh/stitchfix/flotilla-os) [![Go Report Card](https://goreportcard.com/badge/github.com/stitchfix/flotilla-os)](https://goreportcard.com/report/github.com/stitchfix/flotilla-os) ## Introduction Flotilla is a self-service framework that dramatically simplifies the process of defining and executing containerized jobs. This means you get to focus on the work you're doing rather than _how_ to do it. Once deployed, Flotilla allows you to: * Define containerized jobs by specifying exactly what command to run, what image to run that command in, and what resources that command needs to run * Run any previously defined job and access its logs, status, and exit code * View and edit job definitions with a flexible UI * Run jobs and view execution history and logs within the UI * Use the complete REST API for definitions, jobs, and logs to build your own custom workflows ## Philosophy Flotilla is strongly opinionated about self-service for data science. The core assumption is that you understand your work the best. Therefore, it is _you_ who should own your work from end-to-end. In other words, you shouldn't need to be a "production engineer" to run your jobs or to access logs in case of problems. Do this with Flotilla. ## Quick Start ### Minimal Assumptions Flotilla uses AWS's Elastic Kubernetes Service (EKS) as the execution backend. However, Flotilla does not manage EKS clusters. There must be at least one cluster defined in AWS's EKS service available to you and it must have at least one task node. Most typically this is the `default` cluster and examples will assume this going forward. https://docs.aws.amazon.com/eks/latest/userguide/what-is-eks.html ### Starting the service locally You can run the service locally (which will still leverage AWS resources) using the [docker-compose](https://docs.docker.com/compose/) tool. From inside the repo run: ``` docker-compose up -d ``` You'll notice it builds the code in the repo and starts the flotilla service as well as the default postgres backend. Verify the service is running by making a `GET` request with cURL to the url `http://localhost:5000/api/v6/task` (or by navigating to it in a web browser). A 200 OK response means things are good! > Note: The default configuration under `conf` and in the `docker-compose.yml` assume port 3000. You'll have to change it in both places if you don't want to use port 3000 locally. ### Using the UI Flotilla has a simple, easy-to-use UI. Here are some example images for basic usage. #### Define a task with the UI The UI allows you to quickly create new tasks. ![Define Task](https://user-images.githubusercontent.com/10807627/36499487-47a0b82c-16f6-11e8-886b-ca6d38276889.png "Create New Task") #### Launch a task with UI You can run tasks you've created with the UI as well. Once you've run a task the run will transition from `Queued` to `Pending` to `Running` before it finishes and shows `Success` or `Failed` (see [Task Life Cycle](#definitions-and-task-life-cycle)). Once a task is in the `Running` state the logs should be visible. 1. Launch ![Run Task](https://user-images.githubusercontent.com/10807627/36499492-481da436-16f6-11e8-9f14-5bbe8c297434.png "Run Task") 2.
2. Queued --> Pending

![Queued Task](https://user-images.githubusercontent.com/10807627/36499491-4801515a-16f6-11e8-9525-db85bb999887.png "Queued Task")
![Pending Task](https://user-images.githubusercontent.com/10807627/36499490-47e27e88-16f6-11e8-8041-355de885be44.png "Pending Task")

3. View logs

![Running Task](https://user-images.githubusercontent.com/10807627/36499493-4842176c-16f6-11e8-9467-a345987bd407.png "Running Task")
![Finished Task](https://user-images.githubusercontent.com/10807627/36499494-48609cfa-16f6-11e8-8656-5504063cb6e7.png "Finished Task")

### Basic API Usage

#### Defining your first task

Before you can run a task you first need to define it. We'll use the example hello world task definition. Here's what that looks like:

> hello-world.json

```
{
  "alias": "hello-flotilla",
  "group_name": "examples",
  "image": "ubuntu:latest",
  "memory": 512,
  "env": [
    {
      "name": "USERNAME",
      "value": "_fill_me_in_"
    }
  ],
  "command": "echo \"hello ${USERNAME}\""
}
```

It's a simple task that runs in the default ubuntu image, prints your username to the logs, and exits.

> Note: While you can use non-public images and images in your own registries with flotilla, credentials for accessing those images must exist on the EKS hosts. This is outside the scope of this doc.

Let's define it:

```
curl -XPOST localhost:5000/api/v6/task --data @examples/hello-world.json
```

You'll notice that if you visit the initial URL again, `http://localhost:5000/api/v6/task`, the newly created definition will be in the list.

#### Running your first task

This is the fun part. You'll make a `PUT` request to the execution endpoint for the task you just defined and specify any environment variables.

```
curl -XPUT localhost:5000/api/v6/task/alias/hello-flotilla/execute -d '{
  "cluster":"default",
  "env":[
    {"name":"USERNAME","value":"yourusername"}
  ],
  "run_tags":{"owner_id":"youruser"}
}'
```

> Note: `run_tags` is a way to inject ownership information into every run for visibility and is *required*.

You'll get a response that contains a `run_id` field. You can check the status of your task at `http://localhost:5000/api/v6/history/`

```
curl -XGET localhost:5000/api/v6/history/
{
  "instance": {
    "dns_name": "",
    "instance_id": ""
  },
  "run_id": "",
  "definition_id": "",
  "alias": "hello-flotilla",
  "image": "ubuntu:latest",
  "cluster": "default",
  "status": "PENDING",
  "env": [
    {
      "name": "FLOTILLA_RUN_OWNER_ID",
      "value": "youruser"
    },
    {
      "name": "FLOTILLA_SERVER_MODE",
      "value": "dev"
    },
    {
      "name": "FLOTILLA_RUN_ID",
      "value": ""
    },
    {
      "name": "USERNAME",
      "value": "yourusername"
    }
  ]
}
```

and you can get the logs for your task at `http://localhost:5000/api/v6//logs`. You will not see any logs until your task is at least in the `RUNNING` state.

```
curl -XGET localhost:5000/api/v6//logs
{
  "last_seen":"",
  "log":"+ set -e\n+ echo 'hello yourusername'\nhello yourusername"
}
```

## Definitions and Task Life Cycle

### Definitions

| Name | Definition |
| ---- | ---------- |
| `task` | A definition of a task that can be executed to create a `run` |
| `run` | An instance of a task |

### Task Life Cycle

When executed, a task's run goes through several transitions:

1. `QUEUED` - this is the first phase of a run and means the run is currently queued and waiting to be allocated to a cluster
2. `PENDING` - every `worker.submit_interval` (defined in the config) the submit worker pulls runs from the queues and submits them for execution. At this point, if the cluster associated with the run has resources, the run gets allocated to the cluster and transitions to the `PENDING` status. For the default execution engine this stage encapsulates the process of pulling the docker image and starting the container. It can take several minutes depending on whether the image is cached and how large the image is.
3. `RUNNING` - once the run starts on a particular execution host it transitions to this stage. At this point logs should become available.
4. `STOPPED` - a run enters this stage when it finishes execution. This can mean it either succeeded or failed depending on the existence of an `exit_code` and the value of that exit code.
5. `NEEDS_RETRY` - on occasion, due to host-level characteristics (full disk, too many open files, timeouts pulling the image, etc.) the run exits with a null exit code without ever being executed. In this case the reason is analyzed to determine if the run is retriable. If it is, the task transitions to this status, is allocated to the appropriate execution queue again, and will repeat the lifecycle.

#### Normal Lifecycle

`QUEUED` --> `PENDING` --> `RUNNING` --> `STOPPED`

#### Retry Lifecycle

... --> `PENDING` --> `STOPPED` --> `NEEDS_RETRY` --> `QUEUED` --> ...
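In practice you poll the run's status until it reaches a terminal state. The sketch below assumes `jq` is installed and that the history endpoint takes the run ID as its final path segment (the run IDs are elided in the URLs above); `RUN_ID` is a placeholder for the value returned by the execute call.

```
RUN_ID="<your-run-id>"
while true; do
  STATUS=$(curl -s "localhost:5000/api/v6/history/${RUN_ID}" | jq -r .status)
  echo "status: ${STATUS}"
  # STOPPED is terminal; NEEDS_RETRY re-enters the queue on its own.
  [ "${STATUS}" = "STOPPED" ] && break
  sleep 5
done
```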
## Deploying

In a production deployment you'll want multiple instances of the flotilla service running and postgres running elsewhere (e.g. Amazon RDS). In this case the most salient configuration detail is the `DATABASE_URL`.

### Docker based deploy

The simplest way to deploy for very light usage is to avoid a reverse proxy and deploy directly with docker.

1. Build and tag an image for flotilla using the `Dockerfile` provided in this repo:

```
docker build -t /flotilla:
```

2. Run this image wherever you deploy your services:

```
docker run -e DATABASE_URL= -e FLOTILLA_MODE=prod -p 3000:3000 ...
```

> Notes:
> -----
> * Flotilla uses [viper](https://github.com/spf13/viper) for configuration so you can override any of the default configuration under `conf/` using run time environment variables passed to `docker run`
> * In most realistic deploys you'll likely want to configure a reverse proxy to sit in front of the flotilla container. See the docs [here](https://hub.docker.com/_/nginx/)

See [docker run](https://docs.docker.com/engine/reference/run/) for more details

### Configuration In Detail

The variables in `conf/config.yml` are sensible defaults. Most should be left alone unless you're developing flotilla itself. However, there are a few you may want to change in a production environment.

| Variable Name | Description |
| ------------- | ----------- |
| `worker_retry_interval` | Run frequency of the retry worker |
| `worker_submit_interval` | Poll frequency of the submit worker |
| `worker_status_interval` | Poll frequency of the status update worker |
| `http_server_read_timeout_seconds` | Sets the read timeout in seconds for the http server |
| `http_server_write_timeout_seconds` | Sets the write timeout in seconds for the http server |
| `http_server_listen_address` | The port for the http server to listen on |
| `owner_id_var` | Which environment variable contains ownership information to inject into the runtime of jobs |
| `enabled_workers` | The list of workers that run. Use this to control which workers run when using a multi-container deployment strategy. Valid list items include `retry`, `submit`, and `status` |
| `metrics_dogstatsd_address` | Statsd metrics host in Datadog format |
| `metrics_dogstatsd_namespace` | Namespace for the metrics - for example `flotilla.` |
| `redis_address` | Redis host for caching and locks |
| `redis_db` | Redis db to be used - numeric |
| `eks_clusters` | Hash-map of cluster name and its associated kubeconfig (encoded in base64) |
| `eks_kubeconfig_basepath` | Folder where the kubeconfigs are stored |
| `eks_cluster_ondemand_whitelist` | Override list of cluster names where on-demand node types are forced |
| `eks_cluster_override` | EKS clusters to override traffic |
| `eks_scheduler_name` | Custom scheduler name to use; default is `kube-scheduler` |
| `eks_manifest_storage.options.region` | AWS region of the Kubernetes manifest S3 upload bucket |
| `eks_manifest_storage_options_s3_bucket_name` | S3 bucket name for manifest storage |
| `eks_manifest_storage_options_s3_bucket_root_dir` | S3 root bucket path |
| `eks_log_namespace_retention_days` | Number of days to store logs |
| `eks_log_namespace_driver_name` | Logger name |
| `eks_log_namespace_driver_options_s3_bucket_name` | S3 bucket name to store logs |
| `eks_log_namespace_driver_options_s3_bucket_root_dir` | S3 root path within the bucket |
| `eks_job_namespace` | Kubernetes namespace to submit jobs to |
| `eks_job_ttl` | Default job ttl in seconds |
| `eks_job_queue` | SQS job queue - the api places jobs on this queue and the submit worker asynchronously submits them to Kubernetes/EKS |
| `eks.service_account` | Kubernetes service account to use for jobs |
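Since viper reads configuration from the environment, individual values from the table above can be overridden at container start. The example below is a sketch only: the exact environment-variable naming is an assumption (viper conventionally upper-cases keys), and the image name and database URL are placeholders.

```
# Hypothetical overrides - verify the env-var mapping against conf/config.yml.
docker run \
  -e DATABASE_URL="<your-database-url>" \
  -e FLOTILLA_MODE=prod \
  -e HTTP_SERVER_LISTEN_ADDRESS=":3000" \
  -p 3000:3000 <your-flotilla-image>
```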
## Development

### API Documentation

See [API](https://stitchfix.github.io/flotilla-os/api.html)

### Building

Currently Flotilla is built using a recent version of `go` and uses Go modules (`go mod`) to manage dependencies.

```
go get && go build
```

================================================
FILE: ara-impact-report-staging.md
================================================
# ARA Impact Analysis Report - STAGING Environment

## 10-Day Analysis of Adaptive Resource Allocation (Dec 7-17, 2025)

### Executive Summary

This report analyzes the impact of the ARA bug fix deployed on **December 16, 2025** in the **STAGING environment**.
**Key Findings:** - **forklift-deploy-model-v1**: Fix deployed mid-day Dec 16, full effect on Dec 17 - Before fix (Dec 7-15): NULL `command_hash`, memory 4-6.5GB (at/below baseline) - After fix (Dec 17): Proper `command_hash`, memory 4-6.5GB (unchanged) - **No memory over-allocation issue in staging** (unlike production) - **python-3.11 jobs**: Working correctly with ARA - Baseline: 50MB - Elevated: 1-16GB via ARA (reasonable levels) - **No extreme 350GB allocations** (staging max is 40GB) - **GPU jobs**: None in staging environment - **Environment difference**: Staging has much lower max memory ceiling (40GB vs 350GB in production) --- ## Environment Overview **Database Container**: `77b8e13079e5` (postgres:16) **Analysis Period**: 2025-12-07 to 2025-12-17 (10 days) **Total Jobs**: 125,154 jobs from 14 unique definitions --- ## Query 1: forklift-deploy-model-v1 Command Hash Population ### Query ```sql SELECT DATE(queued_at) as date, command_hash IS NULL as hash_null, COUNT(*) as count FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'forklift-deploy-model-v1') AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at), command_hash IS NULL ORDER BY date, hash_null; ``` ### Results ``` date | hash_null | count ------------+-----------+------- 2025-12-07 | t | 30 2025-12-08 | t | 35 2025-12-09 | t | 57 2025-12-10 | t | 31 2025-12-11 | t | 33 2025-12-12 | t | 30 2025-12-13 | t | 30 2025-12-14 | t | 25 2025-12-15 | t | 30 2025-12-16 | f | 5 ← Fix deployed (partial) 2025-12-16 | t | 25 2025-12-17 | f | 30 ← Fix fully active ``` ### Analysis - **Dec 7-15**: 100% of forklift jobs had NULL `command_hash` (301 jobs total) - **Dec 16**: Transition day - 5 jobs with proper hash, 25 with NULL (fix deployed mid-day) - **Dec 17**: 100% of forklift jobs have proper `command_hash` (30 jobs) - **Fix deployment time**: Mid-day December 16, 2025 --- ## Query 2: forklift-deploy-model-v1 Memory Allocations ### Query ```sql SELECT DATE(queued_at) as date, MIN(memory) as min_mem, MAX(memory) as max_mem, AVG(memory)::int as avg_mem, COUNT(*) as count FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'forklift-deploy-model-v1') AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at) ORDER BY date; ``` ### Results ``` date | min_mem | max_mem | avg_mem | count ------------+---------+---------+---------+------- 2025-12-07 | 4000 | 6500 | 5500 | 30 2025-12-08 | 4000 | 6500 | 5286 | 35 2025-12-09 | 4000 | 6500 | 4789 | 57 2025-12-10 | 4000 | 6500 | 5452 | 31 2025-12-11 | 4000 | 8500 | 5500 | 33 2025-12-12 | 4000 | 6500 | 5500 | 30 2025-12-13 | 4000 | 6500 | 5500 | 30 2025-12-14 | 4000 | 6500 | 5500 | 25 2025-12-15 | 4000 | 6500 | 5500 | 30 2025-12-16 | 4000 | 6500 | 5500 | 30 2025-12-17 | 4000 | 6500 | 5500 | 30 ``` ### Analysis - **Baseline**: 8GB (8000MB) from task definition - **Memory allocations**: 4-6.5GB (all at or below baseline) - **Before fix**: Despite NULL `command_hash`, no memory over-allocation - **After fix**: Memory unchanged (4-6.5GB range) - **Key difference from production**: Staging forklift jobs **never exhibited the 18-33GB over-allocation** seen in production --- ## Query 3: Elevated Memory Jobs (ARA Impact) ### Query ```sql SELECT DATE(t.queued_at) as date, COUNT(*) as elevated_jobs, COUNT(DISTINCT t.definition_id) as unique_defs FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory > td.memory * 1.5 AND td.adaptive_resource_allocation = true AND t.queued_at >= 
CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at) ORDER BY date; ``` ### Results ``` date | elevated_jobs | unique_defs ------------+---------------+------------- 2025-12-07 | 134 | 1 2025-12-08 | 129 | 1 2025-12-09 | 150 | 1 2025-12-10 | 217 | 1 2025-12-11 | 416 | 1 2025-12-12 | 420 | 1 2025-12-13 | 417 | 1 2025-12-14 | 418 | 1 2025-12-15 | 413 | 1 2025-12-16 | 450 | 1 2025-12-17 | 395 | 1 ``` ### Analysis - **Total elevated jobs**: 3,559 jobs over 10 days - **All from one definition**: `python-3.11` (baseline: 50MB) - **Average**: ~324 elevated jobs per day - **Pattern**: Consistent elevation throughout the period (no change after fix) - **This is expected**: python-3.11 jobs have proper `command_hash` throughout --- ## Query 4: python-3.11 Memory Elevation Details ### Query ```sql SELECT DATE(t.queued_at) as date, td.alias, td.memory as baseline_mb, t.memory as allocated_mb, CAST((t.memory::float / td.memory) as numeric(10,2)) as multiplier, COUNT(*) as job_count FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory > td.memory * 1.5 AND td.adaptive_resource_allocation = true AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at), td.alias, td.memory, t.memory ORDER BY date, job_count DESC LIMIT 50; ``` ### Results (sample) ``` date | alias | baseline_mb | allocated_mb | multiplier | job_count ------------+-------------+-------------+--------------+------------+----------- 2025-12-11 | python-3.11 | 50 | 1024 | 20.48 | 284 2025-12-11 | python-3.11 | 50 | 4096 | 81.92 | 88 2025-12-11 | python-3.11 | 50 | 1792 | 35.84 | 39 2025-12-11 | python-3.11 | 50 | 8000 | 160.00 | 5 2025-12-12 | python-3.11 | 50 | 1024 | 20.48 | 292 2025-12-12 | python-3.11 | 50 | 4096 | 81.92 | 88 2025-12-12 | python-3.11 | 50 | 1792 | 35.84 | 32 2025-12-12 | python-3.11 | 50 | 8000 | 160.00 | 5 2025-12-12 | python-3.11 | 50 | 16000 | 320.00 | 3 ``` ### Analysis - **Elevation levels**: - 1GB (1024MB): Most common (~300 jobs/day) - 4GB (4096MB): Consistent (~88 jobs/day) - 8GB (8000MB): Regular (~5 jobs/day) - 16GB (16000MB): Rare (3 jobs total) - **No extreme allocations**: Max is 16GB (vs 350GB in production) - **Reasonable multipliers**: 20-320x (vs 7000x in production) --- ## Query 5: python-3.11 Command Hash Status ### Query ```sql SELECT DATE(queued_at) as date, command_hash IS NULL as hash_null, COUNT(*) as count FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'python-3.11') AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at), command_hash IS NULL ORDER BY date, hash_null; ``` ### Results ``` date | hash_null | count ------------+-----------+------- 2025-12-07 | f | 134 2025-12-08 | f | 129 2025-12-09 | f | 150 2025-12-10 | f | 217 2025-12-11 | f | 416 2025-12-12 | f | 420 2025-12-13 | f | 417 2025-12-14 | f | 418 2025-12-15 | f | 413 2025-12-16 | f | 450 2025-12-17 | f | 396 ``` ### Analysis - **100% of python-3.11 jobs** have proper `command_hash` throughout the entire period - **ARA working correctly**: Jobs are elevated based on proper command hash lookups - **No NULL command_hash issue**: Unlike forklift, python-3.11 had command_hash all along --- ## Query 6: GPU Jobs Analysis ### Query ```sql SELECT COUNT(*) as gpu_job_count, COUNT(DISTINCT definition_id) as unique_definitions FROM task WHERE gpu IS NOT NULL AND gpu > 0 AND queued_at >= CURRENT_DATE - INTERVAL '10 days'; ``` ### Results ``` gpu_job_count | unique_definitions ---------------+-------------------- 0 | 0 ``` ### 
Analysis - **No GPU jobs** in staging environment over the past 10 days - The GPU detection bug fix is not testable in staging - GPU jobs appear to be production-only workloads --- ## Query 7: Memory Distribution ### Query ```sql SELECT memory, COUNT(*) FROM task WHERE queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY memory ORDER BY memory DESC LIMIT 15; ``` ### Results ``` memory | count --------+-------- | 3536 ← NULL (jobs still queued/pending) 40960 | 22 ← 40GB (max in staging) 20000 | 3 16000 | 3 8500 | 1 8000 | 57 6500 | 195 4096 | 973 4000 | 213 2744 | 1 2048 | 1073 1792 | 123 1568 | 2 1024 | 101156 ← Most common (1GB) 1000 | 58 ``` ### Analysis - **Max memory allocated**: 40GB (40,960MB) - **Most common**: 1GB (1,024MB) - 101,156 jobs (80.7%) - **Distribution**: Heavily skewed toward small allocations - **No extreme allocations**: Nothing above 40GB --- ## Staging vs Production Comparison | Metric | Production | Staging | Notes | |--------|-----------|---------|-------| | **Max memory limit** | 350GB | 40GB | Staging has 8.75x lower ceiling | | **forklift over-allocation** | 18-33GB (before fix) | None | Staging had no issue | | **python-3.11 max allocation** | 350GB | 16GB | 21.8x difference | | **GPU jobs** | 460 jobs | 0 jobs | Production only | | **Total jobs (10 days)** | 280,215 | 125,154 | Production 2.2x larger | | **command_hash fix date** | Dec 16 | Dec 16 | Same deployment | --- ## Conclusions ### Fix Effectiveness in Staging: ✅ Verified 1. **forklift-deploy-model-v1**: - **Before fix (Dec 7-15)**: NULL `command_hash` but no memory issues - **After fix (Dec 17)**: Proper `command_hash`, memory unchanged - **No over-allocation problem** in staging (unlike production) - Root cause: Staging already had lower max memory limits 2. **python-3.11**: - **Throughout period**: Proper `command_hash`, ARA working correctly - **Elevated to**: 1-16GB (reasonable levels) - **No extreme allocations**: Staging max limit prevents 350GB scenario 3. **Environment differences**: - Staging has **40GB max memory** vs production's **350GB** - This prevented the extreme allocation issue we saw in production - Staging is a safer environment for testing ARA changes ### Key Insights 1. **Staging didn't exhibit the production issue** because: - Lower max memory ceiling (40GB vs 350GB) - forklift jobs stayed within reasonable bounds despite NULL `command_hash` 2. **The fix deployed successfully**: - Mid-day Dec 16: Partial deployment - Dec 17: Full effect with 100% proper `command_hash` 3. **No GPU jobs in staging**: - Cannot validate GPU bug fix in this environment - GPU workloads are production-specific ### Recommendations 1. **Production parity**: Consider raising staging max memory to match production (248GB new limit) for better testing 2. **GPU testing**: Add GPU job definitions to staging for comprehensive ARA testing 3. **Monitoring**: The fix is working correctly in staging, safe to deploy the 248GB limit reduction 4. 
**No action needed**: Staging forklift jobs are healthy and don't require intervention --- ## Appendix: Container Information - **Database Container**: `77b8e13079e5` (postgres:16) - **Database URL**: Available as `$FLOTILLA_DATABASE_URL` in container environment - **Environment**: STAGING - **Report Generated**: 2025-12-17 - **Analysis Period**: 2025-12-07 to 2025-12-17 (10 days) - **Fix Deployed**: 2025-12-16 (mid-day) --- ## Sample Query Template To reproduce this analysis or run ad-hoc queries: ```bash docker exec 77b8e13079e5 bash -c 'psql $FLOTILLA_DATABASE_URL -c "YOUR_QUERY_HERE"' ``` Example: ```bash docker exec 77b8e13079e5 bash -c 'psql $FLOTILLA_DATABASE_URL -c "SELECT COUNT(*) FROM task WHERE memory > 10000 AND queued_at >= CURRENT_DATE - INTERVAL '\''1 day'\'';"' ``` ================================================ FILE: ara-impact-report.md ================================================ # ARA Impact Analysis Report ## 10-Day Analysis of Adaptive Resource Allocation (Dec 7-17, 2025) ### Executive Summary This report analyzes the impact of the ARA bug fix deployed on **December 16, 2025**. The fix changed ARA lookups from using `description` to `command_hash`, preventing incorrect resource allocation matches. **Key Findings:** - **350GB allocations** (baseline: 50MB): Continue at expected levels (legitimate OOM responses) - **forklift-deploy-model-v1 elevations** (baseline: 8GB): **Completely eliminated** after fix deployment - **Fix effectiveness**: 100% resolution for the forklift issue (21 elevated jobs/day → 0 elevated jobs/day) - **Root cause identified**: `command_hash` was NULL before fix despite having command text - The fix both (a) started calculating `command_hash` properly and (b) changed ARA lookup logic - Before: NULL `command_hash` + NULL `description` → incorrect ARA matches → 18-33GB allocations - After: Proper `command_hash` (19432e77...) 
→ correct lookups → 4-7GB allocations (at baseline) --- ## Query 1: Daily Count of 350GB Memory Jobs ### Query ```sql SELECT DATE(queued_at) as date, COUNT(*) as count_350gb_jobs FROM task WHERE memory = 350000 AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at) ORDER BY date LIMIT 15; ``` ### Results ``` date | count_350gb_jobs ------------+------------------ 2025-12-07 | 14 2025-12-08 | 14 2025-12-09 | 29 2025-12-10 | 53 2025-12-11 | 16 2025-12-12 | 30 2025-12-13 | 16 2025-12-14 | 14 2025-12-15 | 15 2025-12-16 | 52 ← Fix deployed 2025-12-17 | 14 ``` ### Analysis - **Average before fix (Dec 7-15)**: 21.2 jobs/day - **Day of fix (Dec 16)**: 52 jobs (spike likely due to deployment activity) - **After fix (Dec 17)**: 14 jobs (within normal range) - These jobs have a **baseline of only 50MB** but allocate **350GB** (7000x increase) --- ## Query 2: 350GB Jobs by Definition/Alias ### Query ```sql SELECT DATE(t.queued_at) as date, td.alias, COUNT(*) as job_count FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory = 350000 AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at), td.alias ORDER BY date, job_count DESC LIMIT 50; ``` ### Results (sample) ``` date | alias | job_count ------------+----------------------+----------- 2025-12-15 | python-3.11 | 10 2025-12-15 | pytorch2-24.05-py3_8 | 3 2025-12-15 | pytorch2-24.05-py3_1 | 2 2025-12-16 | python-3.11 | 30 2025-12-16 | pytorch2-24.05-py3_8 | 15 2025-12-16 | pytorch2-24.05-py3_1 | 7 2025-12-17 | python-3.11 | 5 2025-12-17 | pytorch2-24.05-py3_8 | 5 2025-12-17 | pytorch2-24.05-py3_1 | 4 ``` ### Analysis - Three definition aliases affected: `python-3.11`, `pytorch2-24.05-py3_8`, `pytorch2-24.05-py3_1` - All three definitions have baseline memory of **50MB** - Distribution across definitions remains consistent before and after fix - These appear to be **legitimate ARA responses** to actual OOM conditions --- ## Query 3: Other Elevated Memory Jobs (Non-350GB) ### Query ```sql SELECT DATE(t.queued_at) as date, COUNT(*) as elevated_jobs, COUNT(DISTINCT t.definition_id) as unique_defs FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory > td.memory * 1.5 AND td.adaptive_resource_allocation = true AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at) ORDER BY date LIMIT 15; ``` ### Results ``` date | elevated_jobs | unique_defs ------------+---------------+------------- 2025-12-07 | 16 | 1 2025-12-08 | 11 | 1 2025-12-09 | 14 | 1 2025-12-10 | 24 | 1 2025-12-11 | 4 | 1 2025-12-12 | 5 | 1 2025-12-13 | 10 | 1 2025-12-14 | 6 | 1 2025-12-15 | 21 | 1 2025-12-16 | 5 | 1 ← Fix deployed 2025-12-17 | 0 | 0 ← No elevated jobs! 
``` ### Analysis - **Average before fix (Dec 7-15)**: 12.3 elevated jobs/day - **After fix (Dec 17)**: **0 jobs** ✅ - All elevated jobs came from a **single definition** (forklift-deploy-model-v1) - **100% fix effectiveness** for this issue --- ## Query 4: Detailed Elevation Analysis (forklift-deploy-model-v1) ### Query ```sql SELECT DATE(t.queued_at) as date, td.alias, td.memory as baseline_mb, t.memory as allocated_mb, CAST((t.memory::float / td.memory) as numeric(10,2)) as multiplier, COUNT(*) as job_count FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory > td.memory * 1.5 AND td.adaptive_resource_allocation = true AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at), td.alias, td.memory, t.memory ORDER BY date, job_count DESC LIMIT 40; ``` ### Results (sample) ``` date | alias | baseline_mb | allocated_mb | multiplier | job_count ------------+--------------------------+-------------+--------------+------------+----------- 2025-12-14 | forklift-deploy-model-v1 | 8000 | 19000 | 2.38 | 4 2025-12-14 | forklift-deploy-model-v1 | 8000 | 33000 | 4.13 | 2 2025-12-15 | forklift-deploy-model-v1 | 8000 | 33000 | 4.13 | 17 2025-12-15 | forklift-deploy-model-v1 | 8000 | 19000 | 2.38 | 4 2025-12-16 | forklift-deploy-model-v1 | 8000 | 19000 | 2.38 | 4 2025-12-16 | forklift-deploy-model-v1 | 8000 | 33000 | 4.13 | 1 2025-12-17 | (no results) | N/A | N/A | N/A | 0 ``` ### Analysis - **Baseline**: 8GB (8000MB) - **Elevated allocations**: - 18GB (2.25x multiplier) - 19GB (2.38x multiplier) - 33GB (4.13x multiplier) - **Peak day**: Dec 15 with 21 total elevated jobs - **After fix**: Complete elimination on Dec 17 --- ## Query 5: Command Hash Diversity (350GB Jobs) ### Query ```sql SELECT DATE(t.queued_at) as date, td.alias, COUNT(*) as total_jobs, COUNT(DISTINCT t.command_hash) as unique_commands FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory = 350000 AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at), td.alias ORDER BY date, total_jobs DESC LIMIT 50; ``` ### Results (sample) ``` date | alias | total_jobs | unique_commands ------------+----------------------+------------+----------------- 2025-12-15 | python-3.11 | 10 | 5 2025-12-15 | pytorch2-24.05-py3_8 | 3 | 3 2025-12-15 | pytorch2-24.05-py3_1 | 2 | 2 2025-12-16 | python-3.11 | 30 | 8 2025-12-16 | pytorch2-24.05-py3_8 | 15 | 7 2025-12-16 | pytorch2-24.05-py3_1 | 7 | 5 2025-12-17 | python-3.11 | 5 | 5 2025-12-17 | pytorch2-24.05-py3_8 | 5 | 5 2025-12-17 | pytorch2-24.05-py3_1 | 4 | 4 ``` ### Analysis - **High command diversity**: Multiple unique command hashes per day - **Dec 15**: 15 jobs with 10 unique commands (67% unique) - **Dec 17**: 14 jobs with 14 unique commands (100% unique) - This diversity indicates **legitimate ARA responses** to different workloads with actual OOM history - The fix correctly uses `command_hash` for matching, not generic descriptions --- ## Query 6: Command Hash Analysis (forklift-deploy-model-v1) ### Query ```sql SELECT DATE(t.queued_at) as date, t.memory as allocated_mb, COUNT(*) as total_jobs, COUNT(t.command_hash) as non_null_hashes, COUNT(DISTINCT t.command_hash) as unique_commands FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE td.alias = 'forklift-deploy-model-v1' AND t.memory > td.memory * 1.5 AND t.queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(t.queued_at), t.memory ORDER BY date, allocated_mb LIMIT 50; ``` ### Results (sample) ``` date | allocated_mb | 
total_jobs | non_null_hashes | unique_commands ------------+--------------+------------+-----------------+----------------- 2025-12-14 | 19000 | 4 | 0 | 0 2025-12-14 | 33000 | 2 | 0 | 0 2025-12-15 | 19000 | 4 | 0 | 0 2025-12-15 | 33000 | 17 | 0 | 0 2025-12-16 | 19000 | 4 | 0 | 0 2025-12-16 | 33000 | 1 | 0 | 0 ``` ### Critical Finding: The command_hash Bug **Before Fix (Dec 7-16):** - **ALL forklift-deploy-model-v1 jobs had `command_hash = NULL`** (despite having a 206-char shell script) - The `description` field is also **always NULL** for forklift jobs - With both NULL, the old ARA code was incorrectly matching these jobs, causing false elevations **After Fix (Dec 17):** - `command_hash = 19432e77696deb6666bb12c67feb2b8d` (now properly calculated) - All forklift jobs get the same hash because they run the identical command - ARA now correctly looks up this hash and finds no OOM history - Result: No elevation (jobs run at or below the 8GB baseline) --- ## Query 7: Baseline vs Allocated Memory (350GB Jobs) ### Query ```sql SELECT t.definition_id, td.memory as baseline_memory, t.memory as allocated_memory, COUNT(*) as job_count FROM task t JOIN task_def td ON t.definition_id = td.definition_id WHERE t.memory = 350000 AND t.queued_at >= CURRENT_DATE - INTERVAL '3 days' GROUP BY t.definition_id, td.memory, t.memory ORDER BY job_count DESC LIMIT 20; ``` ### Results ``` definition_id | baseline_memory | allocated_memory | job_count ---------------------------------------------------------+-----------------+------------------+----------- sf-base_python-3_11-7449eda4-b8b3-4146-77c5-a47f8caac81b | 50 | 350000 | 52 sf-base_pytorch2-24__5-py3-505a283c-1e0a-43da-4c9b-071... | 50 | 350000 | 24 sf-base_pytorch2-24__5-py3-ceef4c9e-6ebc-41e5-6cef-a33... | 50 | 350000 | 16 ``` ### Analysis - **Massive increase**: 50MB → 350GB (7000x multiplier) - Indicates these are **ML training jobs** with significant memory requirements - The ARA system is correctly identifying commands that have historically run out of memory - These allocations continue appropriately after the fix --- ## Query 8: forklift-deploy-model-v1 Memory Allocation Timeline ### Query ```sql SELECT DATE(queued_at) as date, MIN(memory) as min_mem, MAX(memory) as max_mem, AVG(memory)::int as avg_mem, COUNT(*) as count FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'forklift-deploy-model-v1') AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at) ORDER BY date; ``` ### Results ``` date | min_mem | max_mem | avg_mem | count ------------+---------+---------+---------+------- 2025-12-07 | 4000 | 33000 | 13431 | 35 2025-12-08 | 4000 | 33000 | 10792 | 38 2025-12-09 | 4000 | 33000 | 13062 | 34 2025-12-10 | 4000 | 33000 | 13117 | 52 2025-12-11 | 4000 | 19000 | 9392 | 13 2025-12-12 | 4000 | 33000 | 11842 | 12 2025-12-13 | 4000 | 33000 | 9524 | 46 2025-12-14 | 4000 | 33000 | 8930 | 27 2025-12-15 | 4000 | 33000 | 18078 | 40 2025-12-16 | 4000 | 33000 | 10807 | 15 2025-12-17 | 4000 | 7000 | 5007 | 15 ← Fix deployed ``` ### Analysis - **Baseline**: 8GB (8000 MB) - **Before fix**: Jobs randomly allocated 4-33GB (some below baseline, many elevated) - **After fix**: Jobs allocated 4-7GB (all at or below baseline) ✅ ### The command Field Content Query to inspect the command field: ```sql SELECT DISTINCT command, command_hash FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'forklift-deploy-model-v1') AND queued_at >= CURRENT_DATE LIMIT 1; ``` Result shows forklift jobs run 
this **206-character shell script**: ```bash # # Use absolute latest forklift # mkdir -p /code/stitchfix cd /code/stitchfix git clone -b $GIT_BRANCH --single-branch git@github.com:stitchfix/forklift.git cd forklift/destinations/ml_model_deploy/ ./run ``` **Key Insight**: The command field is **NOT empty** - but `command_hash` was NULL before the fix, preventing proper ARA lookups. --- ## Query 9: command_hash Population Status by Date ### Query ```sql SELECT DATE(queued_at) as date, command_hash IS NULL as hash_null, COUNT(*) as count FROM task WHERE definition_id IN (SELECT definition_id FROM task_def WHERE alias = 'forklift-deploy-model-v1') AND queued_at >= CURRENT_DATE - INTERVAL '10 days' GROUP BY DATE(queued_at), command_hash IS NULL ORDER BY date, hash_null; ``` ### Results ``` date | hash_null | count ------------+-----------+------- 2025-12-07 | t | 35 2025-12-08 | t | 38 2025-12-09 | t | 34 2025-12-10 | t | 52 2025-12-11 | t | 13 2025-12-12 | t | 12 2025-12-13 | t | 46 2025-12-14 | t | 27 2025-12-15 | t | 40 2025-12-16 | t | 15 2025-12-17 | f | 15 ← command_hash now populated! ``` ### Analysis - **Dec 7-16**: 100% of forklift jobs had `command_hash = NULL` - **Dec 17**: 100% of forklift jobs have `command_hash = 19432e77696deb6666bb12c67feb2b8d` - The fix not only changed the lookup logic but also **started calculating command_hash** for new jobs --- ## Conclusions ### Fix Effectiveness: ✅ Confirmed 1. **forklift-deploy-model-v1 issue**: **100% resolved** - Before: 12.3 elevated jobs/day (average, elevated to 18-33GB) - After: 0 elevated jobs (all at or below 8GB baseline) - Root cause discovered: - The command field was populated (206-char shell script) but `command_hash` was **NULL** - The description field was also **NULL** - The fix both (a) started calculating `command_hash` and (b) changed lookup logic - Now all forklift jobs get the same `command_hash` and ARA finds no OOM history for it 2. **350GB allocations**: **Working as designed** - Jobs continue at expected levels - High command hash diversity (different workloads) - Baseline of 50MB suggests these are script runners with variable workloads - ARA correctly identifies specific commands with OOM history ### Before and After Comparison | Metric | Dec 15 (Before) | Dec 17 (After) | Change | |--------|----------------|----------------|---------| | 350GB jobs | 15 | 14 | -7% (normal variance) | | forklift elevated | 21 | 0 | -100% ✅ | | Total elevated | 36 | 14 | -61% | ### Recommendations 1. **Monitor next 7 days**: Verify forklift-deploy-model-v1 remains at baseline (8GB) ✅ 2. **350GB jobs**: These appear legitimate - monitor for OOM failures to validate 3. **Command hash calculation**: - Investigate why `command_hash` was NULL before Dec 17 - Verify all new jobs now properly calculate `command_hash` - Consider backfilling `command_hash` for historical records if needed for analytics 4. **ARA lookup logic**: Confirm the fix properly handles NULL `command_hash` cases (doesn't match) 5. 
**Documentation**: Update ARA docs to clarify:
   - `command_hash` is calculated from the `command` field (not `description`)
   - ARA requires a valid `command_hash` for proper operation
   - Behavior when `command_hash` is NULL

---

## Appendix: Container Information

- **Database Container**: `360a9dd48242` (postgres:16)
- **Database URL**: Available as `$FLOTILLA_DATABASE_URL` in container environment
- **Report Generated**: 2025-12-17 (updated with latest data)
- **Analysis Period**: 2025-12-07 to 2025-12-17 (10 days)
- **Fix Deployed**: 2025-12-16

### Update Log

- **Initial report**: Generated when the data showed 12 jobs on Dec 17
- **Updated**: Refreshed with latest data showing 14 jobs on Dec 17 (100% unique command hashes)

---

## Sample Query Template

To reproduce this analysis or run ad-hoc queries:

```bash
docker exec 360a9dd48242 bash -c 'psql $FLOTILLA_DATABASE_URL -c "YOUR_QUERY_HERE"'
```

Example:

```bash
docker exec 360a9dd48242 bash -c 'psql $FLOTILLA_DATABASE_URL -c "SELECT COUNT(*) FROM task WHERE memory = 350000 AND queued_at >= CURRENT_DATE - INTERVAL '\''1 day'\'';"'
```

================================================
FILE: clients/cluster/cluster.go
================================================
package cluster

import (
	"fmt"

	"github.com/pkg/errors"
	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/state"
)

//
// Client validates whether or not the given definition can be run
// on the specified cluster. This is to prevent infinite queue
// times - the case that the requested resources will -never- become
// available on the user's chosen cluster
//
type Client interface {
	Name() string
	Initialize(conf config.Config) error
	CanBeRun(clusterName string, executableResources state.ExecutableResources) (bool, error)
	ListClusters() ([]state.ClusterMetadata, error)
}

// NewClusterClient returns a cluster client
func NewClusterClient(conf config.Config, name string) (Client, error) {
	switch name {
	case "eks":
		eksc := &EKSClusterClient{}
		if err := eksc.Initialize(conf); err != nil {
			return nil, errors.Wrap(err, "problem initializing EKSClusterClient")
		}
		return eksc, nil
	default:
		return nil, fmt.Errorf("No Client named [%s] was found", name)
	}
}

================================================
FILE: clients/cluster/eks_cluster_client.go
================================================
package cluster

import (
	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/state"
)

// EKSClusterClient is the cluster client for EKS
// [NOTE] This client assumes the EKS cluster is capable of running a mixed variety of jobs.
type EKSClusterClient struct{}

// Name returns the name of this client; "eks" matches the key used by NewClusterClient
// (previously this returned an empty string).
func (EKSClusterClient) Name() string {
	return "eks"
}

func (EKSClusterClient) Initialize(conf config.Config) error {
	return nil
}

// CanBeRun for EKSCluster is always true
func (EKSClusterClient) CanBeRun(clusterName string, executableResources state.ExecutableResources) (bool, error) {
	return true, nil
}

// Since it is a single cluster environment for EKS, the slice of clusters is empty.
func (EKSClusterClient) ListClusters() ([]state.ClusterMetadata, error) {
	return []state.ClusterMetadata{}, nil
}

================================================
FILE: clients/httpclient/client.go
================================================
package httpclient

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"
)

type RetryableError interface {
	Err() string
}

type HttpRetryableError struct {
	e error
}

func (re HttpRetryableError) Error() string {
	return re.e.Error()
}

func (re HttpRetryableError) Err() string {
	return re.e.Error()
}

type RequestExecutor interface {
	Do(req *http.Request, timeout time.Duration, entity interface{}) error
}

type defaultExecutor struct{}

func (de *defaultExecutor) Do(req *http.Request, timeout time.Duration, entity interface{}) error {
	client := http.Client{Timeout: timeout}
	if client.Timeout == 0 {
		client.Timeout = time.Second * 10
	}

	r, err := client.Do(req)
	if r != nil {
		defer r.Body.Close()
	}
	if err != nil {
		return err
	}
	if r.StatusCode >= 200 && r.StatusCode < 400 {
		return json.NewDecoder(r.Body).Decode(entity)
	} else if r.StatusCode >= 500 {
		return HttpRetryableError{fmt.Errorf("Error response: %v", r.Status)}
	} else {
		return fmt.Errorf("Error response: %v", r.Status)
	}
}

// Generic http client to make http requests.
type Client struct {
	Host       string
	Timeout    time.Duration
	RetryCount int
	Executor   RequestExecutor
}

func (c *Client) Get(path string, headers map[string]string, entity interface{}) error {
	req, err := c.prepareRequestNoBody("GET", path, headers)
	if err != nil {
		return fmt.Errorf("httpclient GET: %v", err)
	}
	return c.doRequestWithRetry(req, entity)
}

func (c *Client) Delete(path string, headers map[string]string, entity interface{}) error {
	req, err := c.prepareRequestNoBody("DELETE", path, headers)
	if err != nil {
		return fmt.Errorf("httpclient DELETE: %v", err)
	}
	return c.doRequestWithRetry(req, entity)
}

func (c *Client) Put(path string, headers map[string]string, inEntity interface{}, outEntity interface{}) error {
	req, err := c.prepareRequestWithBody("PUT", path, headers, inEntity)
	if err != nil {
		return fmt.Errorf("httpclient PUT: %v", err)
	}
	return c.doRequestWithRetry(req, outEntity)
}

func (c *Client) Post(path string, headers map[string]string, inEntity interface{}, outEntity interface{}) error {
	req, err := c.prepareRequestWithBody("POST", path, headers, inEntity)
	if err != nil {
		return fmt.Errorf("httpclient POST: %v", err)
	}
	return c.doRequestWithRetry(req, outEntity)
}

func (c *Client) prepareRequestNoBody(method string, path string, headers map[string]string) (*http.Request, error) {
	return c.makeRequest(method, path, headers, nil)
}

func (c *Client) prepareRequestWithBody(method string, path string, headers map[string]string, entity interface{}) (*http.Request, error) {
	encoded, err := json.Marshal(entity)
	if err != nil {
		return nil, fmt.Errorf("httpclient marshal: %v", err)
	}
	return c.makeRequest(method, path, headers, bytes.NewBuffer(encoded))
}

func (c *Client) makeURL(path string) (string, error) {
	host := c.Host
	if !strings.HasPrefix(c.Host, "http") {
		host = strings.Join([]string{"http://", c.Host}, "")
	}
	u, err := url.Parse(host)
	if err != nil {
		return "", fmt.Errorf("Unable to parse hostname (%v): %v", c.Host, err)
	}
	parsedPath, err := url.Parse(path)
	if err != nil {
		return "", fmt.Errorf("Unable to parse path (%v): %v", path, err)
	}
	u.Path = parsedPath.Path
	u.RawQuery = parsedPath.RawQuery
	return u.String(), nil
}
func (c *Client) makeRequest(method, path string, headers map[string]string, body io.Reader) (*http.Request, error) {
	u, err := c.makeURL(path)
	if err != nil {
		// Previously this error was silently discarded; surface it instead.
		return nil, err
	}
	req, err := http.NewRequest(method, u, body)
	if err != nil {
		return nil, fmt.Errorf("could not create request: %v", err)
	}
	// Only set headers once we know the request was created successfully.
	for k, v := range headers {
		req.Header.Set(k, v)
	}
	return req, nil
}

func (c *Client) doRequestWithRetry(req *http.Request, entity interface{}) error {
	if c.Executor == nil {
		c.Executor = &defaultExecutor{}
	}
	err := c.retryRequest(3*time.Second, func() error {
		return c.Executor.Do(req, c.Timeout, entity)
	})
	return err
}

type httpreqfunc func() error

func (c *Client) retryRequest(sleepTime time.Duration, fn httpreqfunc) error {
	err := fn()
	if err != nil {
		if _, isRetryable := err.(RetryableError); !isRetryable {
			return err
		}
		toSleep := sleepTime
		for retries := 0; retries < c.RetryCount; retries++ {
			time.Sleep(toSleep)
			toSleep = toSleep * 2
			// Assign to the outer err so the most recent error is returned
			// when all retries are exhausted (it was previously shadowed).
			err = fn()
			if err == nil {
				return nil
			}
			if _, isRetryable := err.(RetryableError); !isRetryable {
				return err
			}
		}
	}
	return err
}

================================================
FILE: clients/httpclient/client_test.go
================================================
package httpclient

import (
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"
)

type Cupcake struct {
	Flavour   string
	Sprinkles bool
}

const cupcakeResponse = `{"flavour": "vomit", "sprinkles": true}`

type MockExecutor struct {
	TryCount int // keep track of how many times 'Do' got called
}

func (me *MockExecutor) Do(req *http.Request, timeout time.Duration, entity interface{}) error {
	me.TryCount += 1
	if req.URL.Path == "/" {
		return HttpRetryableError{errors.New("bork")}
	} else {
		return errors.New("not found yo")
	}
}

func TestClientRetry(t *testing.T) {
	me := &MockExecutor{}
	retryCount := 2
	client := &Client{
		Host:       "nope",
		Timeout:    1 * time.Second,
		RetryCount: retryCount,
		Executor:   me,
	}
	client.Get("/", nil, &Cupcake{})
	if me.TryCount != retryCount+1 {
		t.Errorf("Expected to try request [%v] times but got [%v]", retryCount+1, me.TryCount)
	}
	me.TryCount = 0
	client.Get("/404", nil, &Cupcake{})
	if me.TryCount != 1 {
		t.Errorf("Expected to try request [%v] times but got [%v]", 1, me.TryCount)
	}
}
func TestClientDo(t *testing.T) {
	testServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.Method {
		case "GET", "DELETE":
			if len(r.URL.RawQuery) > 0 {
				fmt.Fprintf(w, `{"flavour":"vague","sprinkles":false}`)
			} else {
				fmt.Fprintf(w, cupcakeResponse)
			}
		case "PUT", "POST":
			content := r.Header.Get("Content-Type")
			if content != "application/json" {
				t.Errorf("Expected Content-Type to eq %s got %s", "application/json", content)
			}
			c := Cupcake{}
			err := json.NewDecoder(r.Body).Decode(&c)
			if err != nil {
				t.Errorf("Expected body to deserialize but got error %s", err.Error())
			}
			fmt.Fprintf(w, cupcakeResponse)
		}
	}))

	cupcake := Cupcake{}
	client := &Client{
		Host:       testServer.URL,
		Timeout:    1 * time.Second,
		RetryCount: 1,
	}
	var err error
	var headers = map[string]string{
		"Content-Type": "application/json",
	}

	err = client.Get("/", nil, &cupcake)
	if err != nil {
		t.Errorf("Expected err to be nil got %s", err.Error())
	}
	if cupcake.Flavour != "vomit" {
		t.Errorf("Expected flavour to be 'vomit', got: %v", cupcake.Flavour)
	}
	if !cupcake.Sprinkles {
		t.Errorf("Expected sprinkles to be true, got: %v", cupcake.Sprinkles)
	}

	cupcake = Cupcake{}
	err = client.Get("/?some_rando_param=thing", nil, &cupcake)
	if err != nil {
		t.Errorf("Expected err to be nil got %s", err.Error())
	}
	if cupcake.Flavour != "vague" {
		t.Errorf("Expected flavour to be 'vague', got: %v", cupcake.Flavour)
	}
	if cupcake.Sprinkles {
		t.Errorf("Expected sprinkles to be false, got: %v", cupcake.Sprinkles)
	}

	cupcake = Cupcake{}
	err = client.Put("/", headers, &Cupcake{"vomit", true}, &cupcake)
	if err != nil {
		t.Errorf("Expected err to be nil got %s", err.Error())
	}
	if cupcake.Flavour != "vomit" {
		t.Errorf("Expected flavour to be 'vomit', got: %v", cupcake.Flavour)
	}
	if !cupcake.Sprinkles {
		t.Errorf("Expected sprinkles to be true, got: %v", cupcake.Sprinkles)
	}

	cupcake = Cupcake{}
	err = client.Post("/", headers, &Cupcake{"vomit", true}, &cupcake)
	if err != nil {
		t.Errorf("Expected err to be nil got %s", err.Error())
	}
	if cupcake.Flavour != "vomit" {
		t.Errorf("Expected flavour to be 'vomit', got: %v", cupcake.Flavour)
	}
	if !cupcake.Sprinkles {
		t.Errorf("Expected sprinkles to be true, got: %v", cupcake.Sprinkles)
	}

	cupcake = Cupcake{}
	err = client.Delete("/", nil, &cupcake)
	if err != nil {
		t.Errorf("Expected err to be nil got %s", err.Error())
	}
}

================================================
FILE: clients/logs/eks_cloudwatch_logs_client.go
================================================
package logs

import (
	"encoding/json"
	"fmt"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/aws/request"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
	"github.com/pkg/errors"
	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/exceptions"
	"github.com/stitchfix/flotilla-os/state"
	"log"
	"net/http"
	"os"
	"sort"
	"strings"

	awstrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/aws/aws-sdk-go/aws"
)
*string, facility *string) (string, *string, error) { startFromHead := true //Pod isn't there yet - dont return a 404 if run.PodName == nil { return "", nil, nil } handle := lc.toStreamName(run) args := &cloudwatchlogs.GetLogEventsInput{ LogGroupName: &lc.logNamespace, LogStreamName: &handle, StartFromHead: &startFromHead, } if lastSeen != nil && len(*lastSeen) > 0 { args.NextToken = lastSeen } result, err := lc.logsClient.GetLogEvents(args) if err != nil { if aerr, ok := err.(awserr.Error); ok { if aerr.Code() == cloudwatchlogs.ErrCodeResourceNotFoundException { return "", nil, exceptions.MissingResource{err.Error()} } else if request.IsErrorThrottle(err) { lc.logger.Printf( "thottled getting logs; executable_id: %v, run_id: %s, error: %+v\n", executable.GetExecutableID(), run.RunID, err) return "", lastSeen, nil } } return "", nil, errors.Wrap(err, "problem getting logs") } if len(result.Events) == 0 { return "", result.NextForwardToken, nil } message := lc.logsToMessage(result.Events) return message, result.NextForwardToken, nil } // This method doesn't return log string, it is a placeholder only. func (lc *EKSCloudWatchLogsClient) LogsText(executable state.Executable, run state.Run, w http.ResponseWriter) error { return errors.Errorf("EKSCloudWatchLogsClient does not support LogsText method.") } // Generate stream name func (lc *EKSCloudWatchLogsClient) toStreamName(run state.Run) string { return fmt.Sprintf("%s", *run.PodName) } // Convert Cloudwatch logs to strings func (lc *EKSCloudWatchLogsClient) logsToMessage(events []*cloudwatchlogs.OutputLogEvent) string { sort.Sort(byTimestamp(events)) messages := make([]string, len(events)) for i, event := range events { var l EKSCloudWatchLog err := json.Unmarshal([]byte(*event.Message), &l) if err != nil { messages[i] = *event.Message } messages[i] = l.Log } return strings.Join(messages, "") } func (lc *EKSCloudWatchLogsClient) createNamespaceIfNotExists() error { exists, err := lc.namespaceExists() if err != nil { return errors.Wrapf(err, "problem checking if log namespace [%s] exists", lc.logNamespace) } if !exists { return lc.createNamespace() } return nil } // Check for the existence of a namespace. func (lc *EKSCloudWatchLogsClient) namespaceExists() (bool, error) { result, err := lc.logsClient.DescribeLogGroups(&cloudwatchlogs.DescribeLogGroupsInput{ LogGroupNamePrefix: &lc.logNamespace, }) if err != nil { return false, errors.Wrapf(err, "problem describing log groups with prefix [%s]", lc.logNamespace) } if len(result.LogGroups) == 0 { return false, nil } for _, group := range result.LogGroups { if *group.LogGroupName == lc.logNamespace { return true, nil } } return false, nil } // Creates namespace is not present. 
// Creates the namespace if not present.
func (lc *EKSCloudWatchLogsClient) createNamespace() error {
	_, err := lc.logsClient.CreateLogGroup(&cloudwatchlogs.CreateLogGroupInput{
		LogGroupName: &lc.logNamespace,
	})
	if err != nil {
		return errors.Wrapf(err, "problem creating log group with log group name [%s]", lc.logNamespace)
	}

	_, err = lc.logsClient.PutRetentionPolicy(&cloudwatchlogs.PutRetentionPolicyInput{
		LogGroupName:    &lc.logNamespace,
		RetentionInDays: &lc.logRetentionInDays,
	})
	if err != nil {
		return errors.Wrapf(err, "problem setting log group retention policy for log group name [%s]", lc.logNamespace)
	}
	return nil
}

================================================
FILE: clients/logs/eks_s3_logs_client.go
================================================
package logs

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/request"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/pkg/errors"
	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/state"
	awstrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/aws/aws-sdk-go/aws"
	"io"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"
)

// EKSS3LogsClient corresponds with the AWS logs driver
// for EKS and returns logs for runs
type EKSS3LogsClient struct {
	logRetentionInDays int64
	logNamespace       string
	s3Client           *s3.S3
	s3Bucket           string
	s3BucketRootDir    string
	logger             *log.Logger
	emrS3LogsBucket    string
	emrS3LogsBasePath  string
}

type s3Log struct {
	Log    string    `json:"log"`
	Stream string    `json:"stream"`
	Time   time.Time `json:"time"`
}

// Name returns the name of the logs client
func (lc *EKSS3LogsClient) Name() string {
	return "eks-s3"
}

// Initialize sets up the EKSS3LogsClient
func (lc *EKSS3LogsClient) Initialize(conf config.Config) error {
	//confLogOptions := conf.GetStringMapString("eks_log_driver_options")
	awsRegion := conf.GetString("eks_log_driver_options_awslogs_region")
	if len(awsRegion) == 0 {
		awsRegion = conf.GetString("aws_default_region")
	}
	if len(awsRegion) == 0 {
		return errors.Errorf(
			"EKSS3LogsClient needs one of [eks.log.driver.options.awslogs-region] or [aws_default_region] set in config")
	}
	flotillaMode := conf.GetString("flotilla_mode")
	if flotillaMode != "test" {
		// Wrap the session once for Datadog tracing (it was previously wrapped twice).
		sess := awstrace.WrapSession(session.Must(session.NewSession(&aws.Config{
			Region: aws.String(awsRegion)})))
		lc.s3Client = s3.New(sess, aws.NewConfig().WithRegion(awsRegion))
	}
	lc.emrS3LogsBucket = conf.GetString("emr_log_bucket")
	lc.emrS3LogsBasePath = conf.GetString("emr_log_base_path")

	s3BucketName := conf.GetString("eks_log_driver_options_s3_bucket_name")
	if len(s3BucketName) == 0 {
		return errors.Errorf(
			"EKSS3LogsClient needs [eks_log_driver_options_s3_bucket_name] set in config")
	}
	lc.s3Bucket = s3BucketName

	s3BucketRootDir := conf.GetString("eks_log_driver_options_s3_bucket_root_dir")
	if len(s3BucketRootDir) == 0 {
		return errors.Errorf(
			"EKSS3LogsClient needs [eks.log.driver.options.s3_bucket_root_dir] set in config")
	}
	lc.s3BucketRootDir = s3BucketRootDir

	lc.logger = log.New(os.Stderr, "[s3logs] ", log.Ldate|log.Ltime|log.Lshortfile)
	return nil
}
func (lc *EKSS3LogsClient) emrLogsToMessageString(run state.Run, lastSeen *string, role *string, facility *string) (string, *string, error) {
	s3DirName, err := lc.emrDriverLogsPath(run)
	if err != nil {
		return "", aws.String(""), errors.Errorf("No logs")
	}
	params := &s3.ListObjectsV2Input{
		Bucket:  aws.String(lc.emrS3LogsBucket),
		Prefix:  aws.String(s3DirName),
		MaxKeys: aws.Int64(1000),
	}
	pageNum := 0
	lastModified := &time.Time{}
	var key *string
	err = lc.s3Client.ListObjectsV2Pages(params, func(result *s3.ListObjectsV2Output, lastPage bool) bool {
		pageNum++
		if result != nil {
			for _, content := range result.Contents {
				if strings.Contains(*content.Key, *role) && strings.Contains(*content.Key, *facility) && lastModified.Before(*content.LastModified) {
					if content != nil && *content.Size < int64(10000000) {
						key = content.Key
						lastModified = content.LastModified
					}
				}
			}
		}
		if lastPage {
			return false
		}
		return pageNum <= 10
	})
	if key == nil {
		lc.logger.Println(fmt.Sprintf("run=%s emr logging key not found for role=%s facility=%s", run.RunID, *role, *facility))
		return "", aws.String(""), errors.Errorf("No driver logs found")
	}

	startPosition := int64(0)
	if lastSeen != nil {
		parsed, err := strconv.ParseInt(*lastSeen, 10, 64)
		if err == nil {
			startPosition = parsed
		}
	}

	s3Obj, err := lc.s3Client.GetObjectWithContext(
		context.Background(),
		&s3.GetObjectInput{
			Bucket: aws.String(lc.emrS3LogsBucket),
			Key:    aws.String(*key),
		},
		func(r *request.Request) {
			// Otherwise we get an unzipped response.
			r.HTTPRequest.Header.Add("Accept-Encoding", "gzip")
		})
	if s3Obj != nil && err == nil {
		if s3Obj.ContentLength != nil && *s3Obj.ContentLength > int64(10000000) {
			return "", aws.String(""), errors.Errorf("Logs > 10MB, will not display.")
		}
		defer s3Obj.Body.Close()
		gr, err := gzip.NewReader(s3Obj.Body)
		if err != nil {
			return "", aws.String(""), err
		}
		defer gr.Close()
		reader := bufio.NewReader(gr)
		var b0 bytes.Buffer
		counter := int64(0)
		for {
			line, err := reader.ReadBytes('\n')
			if err != nil {
				if err == io.EOF {
					return b0.String(), aws.String(fmt.Sprintf("%d", counter)), nil
				}
				// A non-EOF read error previously looped forever; return it instead.
				return b0.String(), aws.String(fmt.Sprintf("%d", counter)), err
			}
			if counter >= startPosition {
				b0.Write(line)
			}
			counter = counter + 1
		}
	}
	return "", aws.String(""), errors.Errorf("No driver logs found")
}

func (lc *EKSS3LogsClient) emrDriverLogsPath(run state.Run) (string, error) {
	if run.SparkExtension.EMRJobId != nil && run.SparkExtension.VirtualClusterId != nil {
		return fmt.Sprintf("%s/%s/jobs/%s/",
			lc.emrS3LogsBasePath,
			*run.SparkExtension.VirtualClusterId,
			*run.SparkExtension.EMRJobId,
		), nil
	}
	return "", errors.New("couldn't construct s3 path.")
}

func (lc *EKSS3LogsClient) Logs(executable state.Executable, run state.Run, lastSeen *string, role *string, facility *string) (string, *string, error) {
	// Guard against a nil engine before dereferencing.
	if run.Engine != nil && *run.Engine == state.EKSSparkEngine {
		return lc.emrLogsToMessageString(run, lastSeen, role, facility)
	}
	result, err := lc.getS3Object(run)
	startPosition := int64(0)
	if lastSeen != nil {
		parsed, err := strconv.ParseInt(*lastSeen, 10, 64)
		if err == nil {
			startPosition = parsed
		}
	}
	if result != nil && err == nil {
		acc, position, err := lc.logsToMessageString(result, startPosition)
		newLastSeen := fmt.Sprintf("%d", position)
		return acc, &newLastSeen, err
	}
	return "", aws.String(""), errors.Errorf("No logs.")
}

// LogsText writes all logs for the run to w.
func (lc *EKSS3LogsClient) LogsText(executable state.Executable, run state.Run, w http.ResponseWriter) error {
	if run.Engine == nil || *run.Engine == state.EKSEngine {
		result, err := lc.getS3Object(run)
		if err != nil {
			return err
		} else if result != nil {
			return lc.logsToMessage(result, w)
		}
	}
	if run.Engine != nil && *run.Engine == state.EKSSparkEngine {
		return lc.logsEMR(w)
	}
	return nil
}
func (lc *EKSS3LogsClient) getS3Object(run state.Run) (*s3.GetObjectOutput, error) {
	// The pod may not exist yet; a missing pod should not surface as a 404.
	s3DirName := lc.toS3DirName(run)

	// Get the list of S3 objects in the run_id folder.
	result, err := lc.s3Client.ListObjects(&s3.ListObjectsInput{
		Bucket: aws.String(lc.s3Bucket),
		Prefix: aws.String(s3DirName),
	})
	if err != nil {
		return nil, errors.Wrap(err, "problem getting logs")
	}
	if result == nil || result.Contents == nil || len(result.Contents) == 0 {
		return nil, errors.New("no s3 files associated with the run.")
	}

	var key *string
	lastModified := &time.Time{}
	// Find the latest log file (there can be multiple log files per run due to pod retries).
	for _, content := range result.Contents {
		if content == nil {
			continue
		}
		if strings.Contains(*content.Key, run.RunID) &&
			lastModified.Before(*content.LastModified) &&
			*content.Size < int64(10000000) {
			key = content.Key
			lastModified = content.LastModified
		}
	}
	if key != nil {
		return lc.getS3Key(key)
	}
	return nil, errors.New("no s3 files associated with the run.")
}

func (lc *EKSS3LogsClient) getS3Key(s3Key *string) (*s3.GetObjectOutput, error) {
	result, err := lc.s3Client.GetObject(&s3.GetObjectInput{
		Bucket: aws.String(lc.s3Bucket),
		Key:    aws.String(*s3Key),
	})
	if err != nil {
		return nil, err
	}
	return result, nil
}

// Formulate the run's log dir name on S3.
func (lc *EKSS3LogsClient) toS3DirName(run state.Run) string {
	return fmt.Sprintf("%s/%s", lc.s3BucketRootDir, run.RunID)
}

// Converts log messages from S3 to strings and writes the contents of the entire file to w.
func (lc *EKSS3LogsClient) logsToMessage(result *s3.GetObjectOutput, w http.ResponseWriter) error {
	reader := bufio.NewReader(result.Body)
	for {
		line, err := reader.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
		parsedLine, err := parseLines(line)
		if err != nil {
			return err
		}
		if _, err = io.WriteString(w, parsedLine.Log); err != nil {
			return err
		}
	}
}

func (lc *EKSS3LogsClient) logsEMR(w http.ResponseWriter) error {
	_, _ = io.WriteString(w, "todo!!!")
	return nil
}

// Converts log messages from S3 to a string, starting from the given line offset.
func (lc *EKSS3LogsClient) logsToMessageString(result *s3.GetObjectOutput, startingPosition int64) (string, int64, error) {
	acc := ""
	currentPosition := int64(0)

	// If less than or equal to 0, read the entire log.
	if startingPosition <= 0 {
		startingPosition = currentPosition
	}

	// No S3 file or object.
	if result == nil {
		return acc, startingPosition, errors.New("s3 object not present.")
	}

	reader := bufio.NewReader(result.Body)
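	// Paging contract: lastSeen is a line offset into the file. Each call
	// discards lines up to startingPosition, returns at most
	// state.MaxLogLines lines, and reports the new offset so the caller can
	// resume where it left off.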
	// Read and discard lines until startingPosition is reached.
	for currentPosition < startingPosition {
		currentPosition = currentPosition + 1
		_, err := reader.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				return acc, startingPosition, nil
			}
			return acc, startingPosition, err
		}
	}

	// Read up to MaxLogLines.
	for currentPosition <= startingPosition+state.MaxLogLines {
		currentPosition = currentPosition + 1
		line, err := reader.ReadBytes('\n')
		if err != nil {
			if err == io.EOF {
				return acc, currentPosition, nil
			}
			return acc, currentPosition, err
		}
		parsedLine, err := parseLines(line)
		if err == nil {
			acc = fmt.Sprintf("%s%s", acc, parsedLine.Log)
		}
	}
	_ = result.Body.Close()
	return acc, currentPosition, nil
}

func parseLines(input []byte) (s3Log, error) {
	// Handles both the docker engine (JSON) and containerd (plain-text) log formats.
	// TODO: clean this up once the containerd migration is complete.
	var parsedInput s3Log
	err := json.Unmarshal(input, &parsedInput)
	if err != nil {
		// containerd format: "<timestamp> <stream> <P|F> <log...>"; index 2 is
		// the partial/full flag and is intentionally skipped.
		splitLines := strings.Split(string(input), " ")
		if len(splitLines) >= 4 {
			layout := "2006-01-02T15:04:05.999999999Z"
			timestamp, err := time.Parse(layout, splitLines[0])
			if err != nil {
				return parsedInput, err
			}
			parsedInput.Time = timestamp
			parsedInput.Stream = splitLines[1]
			parsedInput.Log = strings.Join(splitLines[3:], " ")
		}
	}
	return parsedInput, nil
}

================================================
FILE: clients/logs/logs.go
================================================
package logs

import (
	"fmt"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
	"github.com/pkg/errors"
	"github.com/stitchfix/flotilla-os/config"
	flotillaLog "github.com/stitchfix/flotilla-os/log"
	"github.com/stitchfix/flotilla-os/state"
	"net/http"
)

//
// Client returns logs for a Run
//
type Client interface {
	Name() string
	Initialize(config config.Config) error
	Logs(executable state.Executable, run state.Run, lastSeen *string, role *string, facility *string) (string, *string, error)
	LogsText(executable state.Executable, run state.Run, w http.ResponseWriter) error
}

type logsClient interface {
	DescribeLogGroups(input *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error)
	CreateLogGroup(input *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error)
	PutRetentionPolicy(input *cloudwatchlogs.PutRetentionPolicyInput) (*cloudwatchlogs.PutRetentionPolicyOutput, error)
	GetLogEvents(input *cloudwatchlogs.GetLogEventsInput) (*cloudwatchlogs.GetLogEventsOutput, error)
}

type byTimestamp []*cloudwatchlogs.OutputLogEvent

func (events byTimestamp) Len() int      { return len(events) }
func (events byTimestamp) Swap(i, j int) { events[i], events[j] = events[j], events[i] }
func (events byTimestamp) Less(i, j int) bool {
	return *(events[i].Timestamp) < *(events[j].Timestamp)
}

//
// NewLogsClient creates and initializes a run logs client
//
func NewLogsClient(conf config.Config, logger flotillaLog.Logger, name string) (Client, error) {
	_ = logger.Log("level", "info", "message", "Initializing logs client", "client", name)
	switch name {
	case "eks":
		// The EKS log driver ships container logs to S3.
		eksS3 := &EKSS3LogsClient{}
		if err := eksS3.Initialize(conf); err != nil {
			return nil, errors.Wrap(err, "problem initializing EKSS3LogsClient")
		}
		return eksS3, nil
	default:
		return nil, fmt.Errorf("No Client named [%s] was found", name)
	}
}

================================================
FILE: clients/metrics/datadog_metrics_client.go
================================================
package metrics

import (
	"fmt"
	"github.com/DataDog/datadog-go/v5/statsd"
	"github.com/stitchfix/flotilla-os/config"
	"os"
	"strings"
	"time"
)
// DatadogStatsdMetricsClient accepts statsd metrics and forwards them to Datadog.
type DatadogStatsdMetricsClient struct {
	client *statsd.Client
}

// Init initializes the client. The statsd address is derived from the
// DD_AGENT_HOST environment variable (port 8125); the metric prefix comes from
// the *metrics_dogstatsd_namespace* config key.
func (dd *DatadogStatsdMetricsClient) Init(conf config.Config) error {
	host := os.Getenv("DD_AGENT_HOST")
	var addr string
	// A host containing a colon but no square bracket is a bare IPv6 address
	// and must be bracketed before the port is appended.
	if strings.Contains(host, ":") && !strings.Contains(host, "[") {
		addr = fmt.Sprintf("[%s]:8125", host)
	} else {
		addr = fmt.Sprintf("%s:8125", host)
	}
	client, err := statsd.New(addr, statsd.WithNamespace(conf.GetString("metrics_dogstatsd_namespace")))
	if err != nil {
		return err
	}
	dd.client = client
	return nil
}

// Decrement decreases the metric by 1; tags and rate are passed through to statsd.
func (dd *DatadogStatsdMetricsClient) Decrement(name Metric, tags []string, rate float64) error {
	return dd.client.Decr(string(name), tags, rate)
}

// Increment increases the metric by 1; tags and rate are passed through to statsd.
func (dd *DatadogStatsdMetricsClient) Increment(name Metric, tags []string, rate float64) error {
	return dd.client.Incr(string(name), tags, rate)
}

// Histogram tracks the statistical distribution of a set of values
func (dd *DatadogStatsdMetricsClient) Histogram(name Metric, value float64, tags []string, rate float64) error {
	return dd.client.Histogram(string(name), value, tags, rate)
}

// Distribution tracks the statistical distribution of a set of values
func (dd *DatadogStatsdMetricsClient) Distribution(name Metric, value float64, tags []string, rate float64) error {
	return dd.client.Distribution(string(name), value, tags, rate)
}

// Timing sends timing information, it is an alias for TimeInMilliseconds
func (dd *DatadogStatsdMetricsClient) Timing(name Metric, value time.Duration, tags []string, rate float64) error {
	return dd.client.Timing(string(name), value, tags, rate)
}

// Set counts the number of unique elements in a group
func (dd *DatadogStatsdMetricsClient) Set(name Metric, value string, tags []string, rate float64) error {
	return dd.client.Set(string(name), value, tags, rate)
}

// Event sends an event with the given title, text, and tags.
func (dd *DatadogStatsdMetricsClient) Event(e event) error {
	se := statsd.NewEvent(e.Title, e.Text)
	se.Tags = e.Tags
	return dd.client.Event(se)
}

================================================
FILE: clients/metrics/metrics.go
================================================
package metrics

import (
	"fmt"
	"sync"
	"time"

	"github.com/pkg/errors"
	"github.com/stitchfix/flotilla-os/config"
)

type Metric string

const (
	// Metric associated to submission of jobs to EKS
	EngineEKSExecute Metric = "engine.eks.execute"
	// Metric associated to submission of jobs to SQS queue, before EKS submission.
	EngineEKSEnqueue Metric = "engine.eks.enqueue"
	// Metric associated to submission of jobs to EMR
	EngineEMRExecute Metric = "engine.emr.execute"
	// Metric associated to submission of jobs to SQS queue, before EMR submission.
	EngineEMREnqueue Metric = "engine.emr.enqueue"
	// Metric associated to termination of EKS jobs via the API.
	EngineEKSTerminate Metric = "engine.eks.terminate"
	// Metric associated to termination of EMR jobs via the API.
	EngineEMRTerminate Metric = "engine.emr.terminate"
	// Metric associated to termination of pods hopping between hosts.
EngineEKSRunPodnameChange Metric = "engine.eks.run_podname_changed" // Metric associated to pod events where there was a Cluster Autoscale event. EngineEKSNodeTriggeredScaledUp Metric = "engine.eks.triggered_scale_up" // Timing for status worker processEKSRun StatusWorkerProcessEKSRun Metric = "status_worker.timing.process_eks_run" // Timing for acquire lock StatusWorkerAcquireLock Metric = "status_worker.timing.acquire_lock" // Timing for fetch_pod_metrics StatusWorkerFetchPodMetrics Metric = "status_worker.timing.fetch_pod_metrics" // Timing for fetch_update_status StatusWorkerFetchUpdateStatus Metric = "status_worker.timing.fetch_update_status" // Metric for locked runs StatusWorkerLockedRuns Metric = "status_worker.locked_runs" // Timing for fetch metrics StatusWorkerFetchMetrics Metric = "status_worker.fetch_metrics" // Timing for get pod list StatusWorkerGetPodList Metric = "status_worker.get_pod_list" // Timing for get events StatusWorkerGetEvents Metric = "status_worker.get_events" // Timing for get job StatusWorkerGetJob Metric = "status_worker.get_job" // Engine update run EngineUpdateRun Metric = "engine.update_run" // ARA metrics - tracking Auto Resource Adjustment behavior EngineEKSARAEstimationAttempted Metric = "engine.eks.ara.estimation_attempted" EngineEKSARAEstimationSucceeded Metric = "engine.eks.ara.estimation_succeeded" EngineEKSARAEstimationFailed Metric = "engine.eks.ara.estimation_failed" EngineEKSARAResourceAdjustment Metric = "engine.eks.ara.resource_adjustment" EngineEKSARANoHistoricalData Metric = "engine.eks.ara.no_historical_data" EngineEKSARAHitMaxMemory Metric = "engine.eks.ara.hit_max_memory" EngineEKSARAHitMaxCPU Metric = "engine.eks.ara.hit_max_cpu" EngineEKSARAMemoryIncreaseRatio Metric = "engine.eks.ara.memory_increase_ratio" EngineEKSARACPUIncreaseRatio Metric = "engine.eks.ara.cpu_increase_ratio" EngineEKSARAFinalMemoryMB Metric = "engine.eks.ara.final_memory_mb" EngineEKSARAFinalCPUMillicores Metric = "engine.eks.ara.final_cpu_millicores" EngineEKSARADefaultMemory Metric = "engine.eks.ara.default_memory" EngineEKSARAARAMemory Metric = "engine.eks.ara.ara_memory" EngineEKSARADefaultCPU Metric = "engine.eks.ara.default_cpu" EngineEKSARAARACPU Metric = "engine.eks.ara.ara_cpu" EngineEKSARAMemoryIncrease Metric = "engine.eks.ara.memory_increase" EngineEKSARACPUIncrease Metric = "engine.eks.ara.cpu_increase" EngineEKSARANullCommandHash Metric = "engine.eks.ara.null_command_hash" ) type MetricTag string const ( // Metric tag for job success. StatusSuccess MetricTag = "status:success" // Metric tag for job failure. StatusFailure MetricTag = "status:failure" ) type Client interface { Init(conf config.Config) error Decrement(name Metric, tags []string, rate float64) error Increment(name Metric, tags []string, rate float64) error Histogram(name Metric, value float64, tags []string, rate float64) error Distribution(name Metric, value float64, tags []string, rate float64) error Set(name Metric, value string, tags []string, rate float64) error Event(evt event) error Timing(name Metric, value time.Duration, tags []string, rate float64) error } type event struct { Title string Text string Tags []string } var once sync.Once var instance Client // Instantiating the Metrics Client. func InstantiateClient(conf config.Config) error { // Return an error if `metrics_client` isn't set in config. 
	if !conf.IsSet("metrics_client") {
		return fmt.Errorf("`metrics_client` not set in config, unable to instantiate metrics client")
	}
	var err error
	name := conf.GetString("metrics_client")
	once.Do(func() {
		switch name {
		case "dogstatsd":
			instance = &DatadogStatsdMetricsClient{}
			if err = instance.Init(conf); err != nil {
				err = errors.Errorf("Unable to initialize dogstatsd client.")
				instance = nil
				break
			}
		default:
			err = fmt.Errorf("no client named [%s] was found", name)
		}
	})
	return err
}

// Decrement is a package-level Decr: a count of -1.
func Decrement(name Metric, tags []string, rate float64) error {
	if instance != nil {
		return instance.Decrement(name, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Decrement metric.")
}

// Increment is a package-level Incr: a count of +1.
func Increment(name Metric, tags []string, rate float64) error {
	if instance != nil {
		return instance.Increment(name, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Increment metric.")
}

// Histogram tracks the statistical distribution of a set of values
func Histogram(name Metric, value float64, tags []string, rate float64) error {
	if instance != nil {
		return instance.Histogram(name, value, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Histogram metric.")
}

// Distribution tracks the statistical distribution of a set of values
func Distribution(name Metric, value float64, tags []string, rate float64) error {
	if instance != nil {
		return instance.Distribution(name, value, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Distribution metric.")
}

// Set counts the number of unique elements in a group
func Set(name Metric, value string, tags []string, rate float64) error {
	if instance != nil {
		return instance.Set(name, value, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Set metric.")
}
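// Each package-level wrapper guards against a nil instance so that callers can
// emit metrics unconditionally: when no metrics client has been configured,
// the call returns an error instead of panicking.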
// Event sends an event with the given title, text, and tags.
func Event(title string, text string, tags []string) error {
	if instance != nil {
		return instance.Event(event{
			Title: title,
			Text:  text,
			Tags:  tags,
		})
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Event metric.")
}

// Timing sends timing information, it is an alias for TimeInMilliseconds
func Timing(name Metric, value time.Duration, tags []string, rate float64) error {
	if instance != nil {
		return instance.Timing(name, value, tags, rate)
	}
	return errors.Errorf("MetricsClient instance is nil, unable to send Timing metric.")
}

================================================
FILE: clients/middleware/client.go
================================================
package middleware

import (
	"github.com/stitchfix/flotilla-os/state"
	"net/http"
)

type Client interface {
	AnnotateLaunchRequest(headers *http.Header, lr *state.LaunchRequestV2) error
}

type middlewareClient struct{}

func NewClient() (Client, error) {
	return &middlewareClient{}, nil
}

func (mwC middlewareClient) AnnotateLaunchRequest(headers *http.Header, lr *state.LaunchRequestV2) error {
	return nil
}

================================================
FILE: conf/config.yml
================================================
aws_default_region: us-east-1
cluster_client: eks
create_database_schema: true
database_url: postgresql://flotilla:flotilla@localhost/flotilla?sslmode=disable
eks_clusters: 'clusta, cupcake'
eks_cluster_default: 'clusta'
eks_gpu_cluster_default: 'clusta'
eks_tier_default: '4'
eks_log_driver_name: awslogs
eks_log_driver_options_awslogs-group: flotilla-eks-namespace
eks_log_driver_options_awslogs-region: us-east-1
eks_log_namespace: flotilla-eks-namespace
eks_log_retention_days: 90
enabled_workers:
  - retry
  - submit
execution_engine: eks
flotilla_mode: test
http_server_cors_allowed_origins:
  - http://localhost:3001
http_server_listen_address: :3000
http_server_read_timeout_seconds: 5
http_server_write_timeout_seconds: 10
logs_client: cloudwatch
metrics_client: dogstatsd
metrics_dogstatsd_address: 127.0.0.1:8125
metrics_dogstatsd_namespace: my.flotilla.namespace
metrics_dogstatsd_tags:
  - test
owner_id_var: FLOTILLA_RUN_OWNER_ID
queue_manager: sqs
queue_namespace: dev-flotilla
queue_process_time: 45
queue_retention_seconds: 604800
queue_status: flotilla-status-updates-dev
queue_status_rule: flotilla-task-status
readonly_database_url: postgresql://flotilla:flotilla@localhost/flotilla?sslmode=disable

================================================
FILE: config/config.go
================================================
package config

import (
	"github.com/pkg/errors"
	"github.com/spf13/viper"
	"strings"
)

//
// Config interface to wrap external configuration object
//
type Config interface {
	GetString(key string) string
	GetStringSlice(key string) []string
	GetStringMapString(key string) map[string]string
	GetInt(key string) int
	GetBool(key string) bool
	GetFloat64(key string) float64
	IsSet(key string) bool
}

//
// NewConfig initializes a configuration object
// - if confDir is non-nil, searches there and loads a "config.yml"
// - sets configuration to read from environment variables automatically
//
func NewConfig(confDir *string) (Config, error) {
	v := viper.New()
	if v == nil {
		return &conf{}, errors.New("Error initializing internal config")
	}

	if confDir != nil {
		v.SetConfigName("config")
		v.SetConfigType("yaml")
		v.AddConfigPath(*confDir)
		if err := v.ReadInConfig(); err != nil {
			return &conf{}, errors.Wrapf(err, "problem reading config from [%s]", *confDir)
		}
	}
	v.AutomaticEnv()
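	// AutomaticEnv plus the "." -> "_" replacer below mean a key such as
	// metrics.dogstatsd.namespace can be overridden by setting the
	// METRICS_DOGSTATSD_NAMESPACE environment variable.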
	v.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
	return &conf{v}, nil
}

type conf struct {
	v *viper.Viper
}

// GetString returns the value associated with the key as a string.
func (c *conf) GetString(key string) string {
	return c.v.GetString(key)
}

// GetFloat64 returns the value associated with the key as a float64.
func (c *conf) GetFloat64(key string) float64 {
	return c.v.GetFloat64(key)
}

// GetInt returns the value associated with the key as an integer.
func (c *conf) GetInt(key string) int {
	return c.v.GetInt(key)
}

// GetBool returns the value associated with the key as a boolean.
func (c *conf) GetBool(key string) bool {
	return c.v.GetBool(key)
}

// GetStringMapString returns the value associated with the key as a map of strings.
func (c *conf) GetStringMapString(key string) map[string]string {
	return c.v.GetStringMapString(key)
}

// GetStringSlice returns the value associated with the key as a slice of strings.
func (c *conf) GetStringSlice(key string) []string {
	return c.v.GetStringSlice(key)
}

// IsSet checks to see if the key has been set in any of the data locations.
// IsSet is case-insensitive for a key.
func (c *conf) IsSet(key string) bool {
	return c.v.IsSet(key)
}

================================================
FILE: config/config_test.go
================================================
package config

import (
	"os"
	"testing"
)

func TestNewConfig(t *testing.T) {
	var c Config
	c, _ = NewConfig(nil)

	toSet := "sprinkles"
	os.Setenv("CUPCAKE", toSet)
	if c.GetString("cupcake") != toSet {
		t.Errorf("Environment variables not set - expected %s but was %s", toSet, c.GetString("cupcake"))
	}

	confDir := "../conf"
	c, _ = NewConfig(&confDir)
	if !c.IsSet("queue_namespace") || c.GetString("queue_namespace") != "dev-flotilla" {
		t.Errorf("Expected to read from conf dir [queue_namespace]:[dev-flotilla], was: %s", c.GetString("queue_namespace"))
	}
}

================================================
FILE: datadog-ara-dashboard-api.json
================================================
{ "title": "Flotilla ARA (Auto Resource Adjustment) Metrics", "description": "Dashboard tracking Auto Resource Adjustment behavior for EKS and Spark jobs. Monitors resource growth patterns, over-provisioning detection, and OOM-based memory adjustments.
Use the engine filter to view EKS (P99-based 1.75x/1.25x) vs Spark (OOM-based 1.25x/3.0x) jobs separately.", "widgets": [ { "id": 1, "layout": { "x": 0, "y": 0, "width": 47, "height": 15 }, "definition": { "title": "ARA Estimation Attempts vs Successes", "title_size": "16", "title_align": "left", "show_legend": true, "legend_layout": "auto", "legend_columns": [ "avg", "min", "max", "value", "sum" ], "type": "timeseries", "requests": [ { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.estimation_attempted{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "dog_classic", "line_type": "solid", "line_width": "normal" }, "display_type": "bars" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.estimation_succeeded{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "green", "line_type": "solid", "line_width": "normal" }, "display_type": "bars" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.estimation_failed{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "red", "line_type": "solid", "line_width": "normal" }, "display_type": "bars" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.no_historical_data{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "orange", "line_type": "solid", "line_width": "normal" }, "display_type": "bars" } ], "yaxis": { "label": "", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "markers": [] } }, { "id": 2, "layout": { "x": 48, "y": 0, "width": 47, "height": 15 }, "definition": { "title": "ARA Resource Adjustments", "title_size": "16", "title_align": "left", "show_legend": true, "legend_size": "0", "type": "timeseries", "requests": [ { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.resource_adjustment{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "blue", "line_type": "solid", "line_width": "normal" }, "display_type": "bars" } ], "yaxis": { "label": "", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "markers": [] } }, { "id": 3, "layout": { "x": 0, "y": 16, "width": 47, "height": 15 }, "definition": { "title": "Max Resource Limits Hit (Critical)", "title_size": "16", "title_align": "left", "show_legend": true, "legend_size": "0", "type": "timeseries", "requests": [ { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.hit_max_memory{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "red", "line_type": "solid", "line_width": "thick" }, "display_type": "line" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.hit_max_cpu{$cluster,$env,$engine}.as_count()" } ], "style": { "palette": "orange", "line_type": "solid", "line_width": "normal" }, "display_type": "line" } ], "yaxis": { "label": "", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "markers": [ { "label": "Alert Threshold", "value": "y = 0", "display_type": "error dashed" } ] } }, { "id": 4, "layout": { "x": 48, "y": 16, "width": 23, "height": 15 }, "definition": 
{ "title": "Success Rate", "title_size": "16", "title_align": "left", "type": "query_value", "requests": [ { "conditional_formats": [ { "comparator": ">=", "value": 95, "palette": "green_on_white" }, { "comparator": ">=", "value": 80, "palette": "yellow_on_white" }, { "comparator": "<", "value": 80, "palette": "red_on_white" } ], "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.estimation_succeeded{$cluster,$env,$engine}.as_count()", "aggregator": "sum" }, { "data_source": "metrics", "name": "query2", "query": "sum:algo.flotilla.engine.eks.ara.estimation_attempted{$cluster,$env,$engine}.as_count()", "aggregator": "sum" } ], "formulas": [ { "number_format": { "unit": { "label": "%", "type": "custom_unit_label" } }, "formula": "(query1 / query2) * 100" } ] } ], "autoscale": true, "precision": 2 } }, { "id": 5, "layout": { "x": 72, "y": 16, "width": 23, "height": 15 }, "definition": { "title": "Max Memory Hits (Last Hour)", "title_size": "16", "title_align": "left", "type": "query_value", "requests": [ { "conditional_formats": [ { "comparator": ">", "value": 0, "palette": "red_on_white" }, { "comparator": "=", "value": 0, "palette": "green_on_white" } ], "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.hit_max_memory{$cluster,$env,$engine}.as_count()", "aggregator": "sum" } ] } ], "autoscale": true, "custom_unit": "", "precision": 0 } }, { "id": 6, "layout": { "x": 0, "y": 32, "width": 31, "height": 15 }, "definition": { "title": "Memory Increase Ratio Distribution", "title_size": "16", "title_align": "left", "show_legend": false, "type": "heatmap", "yaxis": { "label": "", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "requests": [ { "style": { "palette": "YlOrRd" }, "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.memory_increase_ratio{$cluster,$env,$engine} by {cluster}" } ] } ] } }, { "id": 7, "layout": { "x": 32, "y": 32, "width": 31, "height": 15 }, "definition": { "title": "CPU Increase Ratio Distribution", "title_size": "16", "title_align": "left", "show_legend": false, "type": "heatmap", "yaxis": { "label": "", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "requests": [ { "style": { "palette": "YlOrRd" }, "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.cpu_increase_ratio{$cluster,$env,$engine} by {cluster}" } ] } ] } }, { "id": 8, "layout": { "x": 64, "y": 32, "width": 31, "height": 15 }, "definition": { "title": "Top Clusters by Max Memory Hits", "title_size": "16", "title_align": "left", "type": "toplist", "requests": [ { "style": { "palette": "red" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "sum:algo.flotilla.engine.eks.ara.hit_max_memory{$cluster,$env,$engine}.as_count()", "aggregator": "avg" } ], "formulas": [ { "formula": "query1" } ], "sort": { "order_by": [ { "type": "formula", "index": 0, "order": "desc" } ] } } ] } }, { "id": 9, "layout": { "x": 0, "y": 48, "width": 23, "height": 15 }, "definition": { "title": "Default Memory Distribution (Before ARA)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "blue" }, "response_format": "scalar", "queries": [ { 
"data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.default_memory{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 10, "layout": { "x": 24, "y": 48, "width": 23, "height": 15 }, "definition": { "title": "ARA Memory Distribution (After ARA)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "orange" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.ara_memory{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 11, "layout": { "x": 48, "y": 48, "width": 23, "height": 15 }, "definition": { "title": "Final Memory Distribution (After Bounds)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "red" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.final_memory_mb{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 12, "layout": { "x": 72, "y": 48, "width": 23, "height": 15 }, "definition": { "title": "Memory Increase (Absolute MB)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "purple" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.memory_increase{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 13, "layout": { "x": 0, "y": 64, "width": 23, "height": 15 }, "definition": { "title": "Default CPU Distribution (Before ARA)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "blue" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.default_cpu{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 14, "layout": { "x": 24, "y": 64, "width": 23, "height": 15 }, "definition": { "title": "ARA CPU Distribution (After ARA)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "orange" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.ara_cpu{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 15, "layout": { "x": 48, "y": 64, "width": 23, "height": 15 }, "definition": { "title": "Final CPU Distribution (After Bounds)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "red" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.final_cpu_millicores{$cluster,$env,$engine} by {cluster}", "aggregator": "avg" } ] } ] } }, { "id": 16, "layout": { "x": 72, "y": 64, "width": 23, "height": 15 }, "definition": { "title": "CPU Increase (Absolute Millicores)", "title_size": "16", "title_align": "left", "show_legend": false, "type": "distribution", "requests": [ { "style": { "palette": "purple" }, "response_format": "scalar", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.cpu_increase{$cluster,$env,$engine} by 
{cluster}", "aggregator": "avg" } ] } ] } }, { "id": 17, "layout": { "x": 0, "y": 80, "width": 47, "height": 15 }, "definition": { "title": "Resource Growth Over Time", "title_size": "16", "title_align": "left", "show_legend": true, "legend_size": "0", "type": "timeseries", "requests": [ { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.default_memory{$cluster,$env,$engine}" } ], "style": { "palette": "blue", "line_type": "solid", "line_width": "normal" }, "display_type": "line" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.ara_memory{$cluster,$env,$engine}" } ], "style": { "palette": "orange", "line_type": "solid", "line_width": "normal" }, "display_type": "line" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.final_memory_mb{$cluster,$env,$engine}" } ], "style": { "palette": "red", "line_type": "solid", "line_width": "thick" }, "display_type": "line" } ], "yaxis": { "label": "Memory (MB)", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "markers": [ { "label": "248GB Limit (Non-GPU EKS)", "value": "y = 248000", "display_type": "error dashed" } ] } }, { "id": 18, "layout": { "x": 48, "y": 80, "width": 47, "height": 15 }, "definition": { "title": "CPU Growth Over Time", "title_size": "16", "title_align": "left", "show_legend": true, "legend_size": "0", "type": "timeseries", "requests": [ { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.default_cpu{$cluster,$env,$engine}" } ], "style": { "palette": "blue", "line_type": "solid", "line_width": "normal" }, "display_type": "line" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.ara_cpu{$cluster,$env,$engine}" } ], "style": { "palette": "orange", "line_type": "solid", "line_width": "normal" }, "display_type": "line" }, { "response_format": "timeseries", "queries": [ { "data_source": "metrics", "name": "query1", "query": "avg:algo.flotilla.engine.eks.ara.final_cpu_millicores{$cluster,$env,$engine}" } ], "style": { "palette": "red", "line_type": "solid", "line_width": "thick" }, "display_type": "line" } ], "yaxis": { "label": "CPU (millicores)", "scale": "linear", "include_zero": true, "min": "auto", "max": "auto" }, "markers": [ { "label": "60K Limit", "value": "y = 60000", "display_type": "error dashed" } ] } }, { "id": 19, "layout": { "x": 0, "y": 96, "width": 47, "height": 30 }, "definition": { "title": "ARA Logs - Resource Adjustments & Max Limits", "title_size": "16", "title_align": "left", "requests": [ { "response_format": "event_list", "query": { "data_source": "logs_stream", "query_string": "source:flotilla (\"ARA adjusted resources\" OR \"Spark ARA adjusted executor memory\" OR \"Spark ARA adjusted driver memory\" OR \"ARA resource allocation hit maximum limit\" OR \"ARA memory allocation hit maximum limit\" OR \"ARA CPU allocation hit maximum limit\")", "indexes": [], "storage": "hot", "sort": { "order": "desc", "column": "timestamp" } }, "columns": [ { "field": "status_line", "width": "auto" }, { "field": "timestamp", "width": "auto" }, { "field": "host", "width": "auto" }, { "field": "service", "width": "auto" }, { "field": "source", "width": "auto" }, { "field": 
"@status", "width": "auto" }, { "field": "content", "width": "compact" } ] } ], "type": "list_stream" } }, { "id": 20, "layout": { "x": 48, "y": 96, "width": 47, "height": 30 }, "definition": { "title": "ARA Logs - Historical Data Lookups", "title_size": "16", "title_align": "left", "requests": [ { "response_format": "event_list", "query": { "data_source": "logs_stream", "query_string": "source:flotilla (\"ARA: Historical resource data found\" OR \"ARA: No historical resource data found\" OR \"ARA: Error querying historical resource data\")", "indexes": [], "storage": "hot", "sort": { "order": "desc", "column": "timestamp" } }, "columns": [ { "field": "status_line", "width": "auto" }, { "field": "timestamp", "width": "auto" }, { "field": "host", "width": "auto" }, { "field": "service", "width": "auto" }, { "field": "source", "width": "auto" }, { "field": "@status", "width": "auto" }, { "field": "content", "width": "compact" } ] } ], "type": "list_stream" } } ], "template_variables": [ { "name": "cluster", "prefix": "cluster", "available_values": [], "default": "*" }, { "name": "env", "prefix": "env", "available_values": [], "default": "*" }, { "name": "engine", "prefix": "engine", "available_values": [ "eks", "eks-spark" ], "default": "*" } ], "layout_type": "free", "notify_list": [], "pause_auto_refresh": false } ================================================ FILE: docker-compose.yml ================================================ version: '3' services: ui: build: context: ./ui args: FLOTILLA_API: http://localhost:3000/api/v1 DEFAULT_CLUSTER: default environment: FLOTILLA_API: http://localhost:3000/api/v1 DEFAULT_CLUSTER: default ports: - 5000:5000 flotilla: build: . environment: DATABASE_URL: postgresql://flotilla:flotilla@db/flotilla?sslmode=disable FLOTILLA_MODE: dev HTTP_SERVER_CORS_ALLOWED_ORIGINS: http://localhost:5000 AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} ports: - 3000:3000 db: image: postgres environment: POSTGRES_USER: flotilla POSTGRES_DB: flotilla POSTGRES_PASSWORD: flotilla ports: - 5432:5432 ================================================ FILE: docs/ara-command-hash-bug-report.md ================================================ # ARA command_hash Bug Report ## Executive Summary The Auto Resource Adjustment (ARA) feature has a **critical bug** where `command_hash` is calculated from the **description** field instead of the actual command, causing: 1. **21,357 runs** (23 definitions) with NULL command_hash receive **no ARA benefit** 2. **Hundreds of thousands of runs** share ARA data across **completely different commands** that happen to have the same description This means jobs can inherit resource allocations from unrelated workloads, leading to incorrect over- or under-provisioning. ## The Bug ### How command_hash Should Work `command_hash` is used by ARA to match similar jobs and apply historical OOM data. The intent is to group jobs running the **same command**. ### How It Actually Works **Location:** `flotilla/endpoints.go:451-453, 514-516, 592-593` ```go if lr.CommandHash == nil && lr.Description != nil { lr.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*lr.Description)))) } ``` **Problems:** 1. Hash is MD5 of **Description**, not Command 2. If Description is NULL, command_hash stays NULL 3. 
NULL command_hash never matches anything in SQL (`command_hash = NULL` evaluates to NULL, never TRUE)

## Impact by the Numbers

### Bug #1: NULL command_hash (No ARA)

```sql
SELECT COUNT(*) as total_runs, COUNT(DISTINCT definition_id) as definitions_affected
FROM task
WHERE command_hash IS NULL;
```

**Result:**
- **21,357 runs** have NULL command_hash
- **23 definitions** affected
- These jobs **never benefit from ARA** despite it being enabled

**Example:** Definition `sf-base_python-3_11-...` has **55 different commands**, all with NULL command_hash, none sharing ARA data.

### Bug #2: Description-based Hash (Incorrect ARA Sharing)

```sql
-- Find command_hash values with multiple different commands
SELECT definition_id, command_hash,
       COUNT(DISTINCT command) as distinct_commands,
       COUNT(*) FILTER (WHERE exit_code = 137) as oom_count,
       COUNT(*) as total_runs
FROM task
WHERE command_hash IS NOT NULL AND command IS NOT NULL
GROUP BY definition_id, command_hash
HAVING COUNT(DISTINCT command) > 1
ORDER BY oom_count DESC, total_runs DESC
LIMIT 1;
```

**Result:**
- **Worst case:** `command_hash = 407f6885beaec163a742e8c3c8a50d3e`
- **176 different commands** share the same hash
- **115 OOMs** across these different commands
- **287 total runs**
- All share description: "Calibrate Psale Prod / Calibrate Psale"

**Other severe cases:**
- `a0798e54ea76fb8dc1e743fe37f761e0`: 2 commands, **87,142 runs** affected
- `1eeb37af6d7e0e4bb2a73a0f61ac7a79`: 2 commands, **52,844 runs** affected
- `123fad187daf3847583761f5495e3ce8`: 2 commands, **39,181 runs** affected

## Concrete Example: The Smoking Gun

### Timeline

**November 22-24, 2025** - Daily data processing job with description "Calibrate Psale Prod / Calibrate Psale"

#### OOMs in 3-Day Window (Contributing to ARA):

| Date | Run ID | Memory | Command Differs By |
|------|--------|--------|--------------------|
| Nov 22 | `eks-c662-2a1e-44f7...` | 1024 MB | `--as_of 20251121` |
| Nov 22 | `eks-a9fd-92f6-4fe1...` | 1792 MB | `--as_of 20251121` |
| Nov 23 | `eks-055c-c578-4951...` | 1024 MB | `--as_of 20251122` |

**ARA Calculation:**
- P99([1024, 1792, 1024]) = 1792 MB
- 1792 MB × 1.75 = **3136 MB**

#### Next Day Run (Inherits OOM Data):

| Date | Run ID | Memory | Command Differs By | Exit Code |
|------|--------|--------|--------------------|-----------|
| Nov 24 | `eks-0d33-a443-43b9...` | **3136 MB** | `--as_of 20251123` | 0 (Success) |

### The Commands Are Different!

**Nov 23 OOM Command:**
```bash
python3 /dsn-algo-adhoc/damien/projects/fy25q4_psale_calibration/calibrate.py --as_of 20251122
```

**Nov 24 Command (Got ARA from above):**
```bash
python3 /dsn-algo-adhoc/damien/projects/fy25q4_psale_calibration/calibrate.py --as_of 20251123
```

**Only difference:** The date parameter (`20251122` vs `20251123`)

**Why this matters:** These are daily data processing jobs. Each date's data could have completely different characteristics and memory requirements, but they share ARA data because they have the same description.
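To see the collision mechanically, here is a small, self-contained sketch (not part of the codebase; the strings are taken from the example above) showing that the description-based hash is identical for both runs, while a command-based hash would keep them separate:

```go
package main

import (
	"crypto/md5"
	"fmt"
)

func main() {
	// Both runs share this description, so the buggy hash collides.
	description := "Calibrate Psale Prod / Calibrate Psale"

	// The actual commands differ only by the --as_of date.
	nov23 := "python3 /dsn-algo-adhoc/damien/projects/fy25q4_psale_calibration/calibrate.py --as_of 20251122"
	nov24 := "python3 /dsn-algo-adhoc/damien/projects/fy25q4_psale_calibration/calibrate.py --as_of 20251123"

	fmt.Printf("hash(description):   %x\n", md5.Sum([]byte(description))) // identical for both runs
	fmt.Printf("hash(nov23 command): %x\n", md5.Sum([]byte(nov23)))       // distinct
	fmt.Printf("hash(nov24 command): %x\n", md5.Sum([]byte(nov24)))       // distinct
}
```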
### Verification The exact ARA query for the Nov 24 run returns: ```sql SELECT cast((percentile_disc(0.99) within GROUP (ORDER BY A.max_memory_used)) * 1.75 as int) as memory FROM (SELECT memory as max_memory_used FROM TASK WHERE queued_at >= '2025-11-21 15:10:01' AND queued_at < '2025-11-24 15:10:01' AND (exit_code = 137 or exit_reason = 'OOMKilled') AND definition_id = 'sf-base_python-3_9-59ab1a32-cdda-4eb8-5824-49d17d96b1fd' AND command_hash = '407f6885beaec163a742e8c3c8a50d3e' LIMIT 30) A; ``` **Result:** 3136 MB ← **Exactly what the Nov 24 run received** ## Concrete Example #2: Catastrophic Case at 350GB Maximum ### The Worst-Case Scenario: ML Training at the Limit **Definition:** `sf-base_pytorch2-24__5-py3-698fef2e-4bad-4e45-624c-c57fec2f2aa7` **Command Hash:** `b4c7adde0a3dc7dd13a8da282f1693c1` **Shared Description:** "CTSM PF ATRF Metrics SubSeqRefactor 12-2 Train Staging / Model Training" This case demonstrates the bug at its most destructive: **12 completely different machine learning training configurations** all sharing one command_hash and **starting at the 350GB maximum memory limit from day one**. ### The Three Training Configurations All run PyTorch model training (`client_time_series_model/train.py`) but with **completely different parameters**: #### Configuration A: March 2 Data, Full Dataset ```bash python3 train.py --as_of 20250302 --max_epochs 4 --pct_client_subset_dev 100 ``` - **Runs:** 24 - **OOMs:** 22 (92% OOM rate!) - **Training:** Full dataset (100% of clients), 4 epochs - **Memory:** 350GB (maximum limit) #### Configuration B: June 28 Data, 10% Subset ```bash python3 train.py --as_of 20250628 --max_epochs 10 --pct_client_subset_dev 10 ``` - **Runs:** 24 - **OOMs:** 8 (33% OOM rate) - **Training:** 10% of data, 10 epochs - **Memory:** 350GB (maximum limit) #### Configuration C: May 17 Data, 1% Subset ```bash python3 train.py --as_of 20250517 --max_epochs 10 --pct_client_subset_dev 1 ``` - **Runs:** 18 - **OOMs:** 2 (11% OOM rate) - **Training:** Only 1% of data, 10 epochs - **Memory:** 350GB (maximum limit) ### The Cross-Contamination Timeline **August 14-September 4, 2025** - All runs execute at 350GB from the start: ``` Aug 14: Config C (1% data) → OOM at 350GB Aug 14: Config A (100% data) → 18 OOMs at 350GB over 6 days Aug 19: Config A continues → More OOMs at ceiling Aug 28: Config B (10% data) → 8 OOMs at 350GB Aug 28: Configs A, B, C mix → All hit 350GB ceiling Sep 1-4: Various configs → Continue OOM'ing at maximum ``` ### Why This is Catastrophic 1. **No room to grow:** ARA wants to increase memory after OOMs, but all runs are already at the 350GB maximum limit 2. **Massive over-provisioning for small jobs:** Configuration C trains on **1% of the data** but gets **350GB** because Configurations A and B OOM'd with full datasets 3. **Trapped at the ceiling:** Once at max memory, ARA becomes useless: - Jobs that need >350GB: Keep OOM'ing, can't grow further - Jobs that need <<350GB: Massively over-allocated, wasting resources 4. 
**Cross-training contamination:** Three completely different ML experiments share OOM history: - Different months of training data (March, May, June) - Different model hyperparameters (4 vs 10 epochs) - Different data sizes (100% vs 10% vs 1% of clients) ### The Numbers **Total Impact:** - **83 runs** across **12 different commands** - **32 OOMs** (39% OOM rate **at maximum memory**) - **All 83 runs allocated 350GB** regardless of actual needs **Configuration C alone** (1% subset): - Likely needs <50GB based on data size - Receives 350GB due to cross-contamination - **700% over-provisioned** (7x more memory than needed) ### Root Cause All 12 commands share the same description: ``` "CTSM PF ATRF Metrics SubSeqRefactor 12-2 Train Staging / Model Training" ``` Therefore: `command_hash = MD5(description) = b4c7adde0a3dc7dd13a8da282f1693c1` ARA cannot distinguish between: - Training on March data vs June data (4 months apart) - 4 epochs vs 10 epochs (2.5x difference) - 100% data vs 10% vs 1% (100x difference!) ### What Should Happen If `command_hash` were calculated from the actual command: - **Config A hash:** MD5("...as_of 20250302...max_epochs 4...pct_client_subset_dev 100...") - **Config B hash:** MD5("...as_of 20250628...max_epochs 10...pct_client_subset_dev 10...") - **Config C hash:** MD5("...as_of 20250517...max_epochs 10...pct_client_subset_dev 1...") Each would have **independent ARA history** based on its actual resource needs: - Config A might legitimately need 350GB (full dataset) - Config B might need ~50GB (10% subset) - Config C might need ~10GB (1% subset) Instead, all three get 350GB because they share a description. ## Why This Causes Over-Provisioning 1. **Cross-contamination:** Jobs inherit OOM data from unrelated workloads 2. **Compounding growth:** The 1.75x multiplier compounds across different jobs 3. **Never stabilizes:** Each day's job can trigger growth for the next day's job 4. 
**Reaches maximum:** Eventually hits the 350GB limit, explaining the "jobs growing to 300GB" issue ## Scale of the Problem ### Definitions with Most Cross-Command OOMs ```sql SELECT definition_id, command_hash, COUNT(DISTINCT command) as distinct_commands, COUNT(*) FILTER (WHERE exit_code = 137 OR exit_reason = 'OOMKilled') as oom_count, COUNT(*) as total_runs FROM task WHERE command_hash IS NOT NULL AND engine = 'eks' AND command IS NOT NULL GROUP BY definition_id, command_hash HAVING COUNT(DISTINCT command) > 1 AND COUNT(*) FILTER (WHERE exit_code = 137 OR exit_reason = 'OOMKilled') > 0 ORDER BY oom_count DESC LIMIT 10; ``` | Rank | command_hash | Distinct Commands | OOMs | Total Runs | |------|--------------|-------------------|------|------------| | 1 | `407f6885beaec163...` | 176 | 115 | 287 | | 2 | `a5bdb8f3302110219...` | 164 | 87 | 304 | | 3 | `2344c10bd7229...` | 184 | 83 | 564 | | 4 | `7803d8faa568610...` | 97 | 82 | 261 | | 5 | `90ceb0cabff4958...` | 135 | 82 | 230 | All from the same definition: `sf-base_python-3_9-59ab1a32-cdda-4eb8-5824-49d17d96b1fd` ### Definitions with NULL command_hash (No ARA) ```sql SELECT definition_id, COUNT(DISTINCT command) as distinct_commands, COUNT(*) as total_runs FROM task WHERE command_hash IS NULL AND command IS NOT NULL GROUP BY definition_id HAVING COUNT(DISTINCT command) > 1 ORDER BY total_runs DESC LIMIT 5; ``` | Definition ID | Distinct Commands | Total Runs | |---------------|-------------------|------------| | `sf-base_python-3_11-7449eda4-b8b3-4146-77c5-a47f8caac81b` | 55 | 91 | | `sf-base_python-3_9-59ab1a32-cdda-4eb8-5824-49d17d96b1fd` | 40 | 49 | | `data-platform-d834291f-d984-408e-5da4-8646f7e2f5b7` | 4 | 31 | | `platform-8a651dbe-1794-485b-6ba4-ba58b4a10212` | 5 | 21 | | `sf-base_pytorch2-24__5-py3-ceef4c9e-6ebc-41e5-6cef-a334aed6e829` | 6 | 17 | ## Root Cause Analysis ### Design Intent vs Implementation **Intended behavior:** - Jobs running the **same command** should share ARA data - Different commands should have separate ARA histories **Actual behavior:** - Jobs with the **same description** share ARA data - Command can be completely different ### Why Description Was Used Looking at the code flow: 1. API receives execution request with optional `description` field 2. If `command_hash` not provided by client, generate from description 3. **Problem:** Command isn't available yet at this point in the flow 4. Command is constructed later during job submission **The Disconnect:** - `command_hash` is set in `flotilla/endpoints.go` (API layer) - Actual `command` is finalized in `execution/adapter/eks_adapter.go` (execution layer) - By the time the command is known, the hash is already set ## The Fix ### Recommended Solution Calculate `command_hash` from the **actual command** that will run: **Location to fix:** Where the Run object gets its final command, likely in the execution service before calling `EstimateRunResources()`. **Pseudocode:** ```go // After command is finalized, before ARA lookup if run.Command != nil && len(*run.Command) > 0 { run.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*run.Command)))) } else { // Fallback: use description if no command (shouldn't happen for EKS jobs) if run.Description != nil { run.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*run.Description)))) } } ``` ### Migration Strategy **Challenge:** Changing command_hash breaks ARA history **Options:** 1. 
**Clean break (Recommended):** - Fix the hash calculation - Accept that ARA starts fresh for all jobs - Monitor via new instrumentation to ensure it works correctly 2. **Dual-hash lookup:** - Try command-based hash first - Fall back to description-based hash for historical data - Gradually phase out old hashes 3. **Per-definition rollout:** - Fix hash for definitions most affected by the bug - Leave others on old behavior temporarily - Migrate gradually ### Testing Plan 1. **Verify hash calculation:** - Unit tests ensuring hash comes from command, not description - Integration tests with various command/description combinations 2. **Verify ARA still works:** - Test that identical commands share ARA data - Test that different commands DON'T share data 3. **Monitor after deployment:** - Use new `ara.*` metrics to track behavior - Watch for unexpected resource changes - Check logs for `ara.no_historical_data` - should increase initially ## Impact on Current Investigation This bug significantly impacts the "jobs growing to 300GB" investigation: 1. **Over-provisioning is worse than thought:** - Jobs inherit OOMs from unrelated workloads - The 1.75x multiplier compounds across different jobs - Growth isn't just from retrying the same job, but cross-contamination 2. **Instrumentation still valuable:** - The new ARA metrics will help measure the bug's impact - After fixing, metrics will show if ARA works correctly 3. **Fix priority:** - This bug should be fixed **before** tuning ARA multipliers - Otherwise, you're tuning a broken system ## Queries for Further Investigation ### Find your most affected definitions ```sql -- Definitions with most OOM cross-contamination SELECT definition_id, command_hash, COUNT(DISTINCT MD5(command)) as distinct_commands, COUNT(*) FILTER (WHERE exit_code = 137 OR exit_reason = 'OOMKilled') as oom_count, COUNT(*) as total_runs, MAX(memory) as max_memory_allocated FROM task WHERE command_hash IS NOT NULL AND engine = 'eks' AND command IS NOT NULL AND queued_at >= CURRENT_TIMESTAMP - INTERVAL '30 days' GROUP BY definition_id, command_hash HAVING COUNT(DISTINCT MD5(command)) > 1 AND COUNT(*) FILTER (WHERE exit_code = 137 OR exit_reason = 'OOMKilled') > 0 ORDER BY oom_count * distinct_commands DESC LIMIT 20; ``` ### Find jobs hitting memory limits with cross-command contamination ```sql -- Jobs at max memory (350GB) that share command_hash with different commands SELECT DISTINCT t1.definition_id, t1.command_hash FROM task t1 JOIN task t2 ON t1.definition_id = t2.definition_id AND t1.command_hash = t2.command_hash AND MD5(t1.command) != MD5(t2.command) WHERE t1.memory >= 300000 -- Close to or at max AND t1.queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' GROUP BY t1.definition_id, t1.command_hash HAVING COUNT(DISTINCT MD5(t1.command)) > 1; ``` ## Recommendations 1. **Immediate:** - Review the examples in this report with the team - Decide on fix approach (clean break vs dual-hash) - Prioritize this fix before tuning ARA parameters 2. **Short-term:** - Implement command-based hash calculation - Deploy with new instrumentation - Monitor via `ara.*` metrics 3. 
**Long-term:** - Consider whether description should exist separately from command - Review if ARA should use command hash at all, or something more semantic - Add validation to prevent command_hash from being NULL ## Related Files - **Bug location:** `flotilla/endpoints.go:451-453, 514-516, 592-593` - **ARA query:** `state/pg_queries.go:54-66` (TaskResourcesSelectCommandSQL) - **ARA lookup:** `state/pg_state_manager.go:118-162` (EstimateRunResources) - **Resource adjustment:** `execution/adapter/eks_adapter.go:352-421` (adaptiveResources) - **New instrumentation:** `docs/ara-instrumentation.md` ## Database Evidence All evidence in this report is from production database queries run on 2025-11-24. Key run IDs for reproduction: - OOM: `eks-055c-c578-4951-75d8-3f5a0bb15b37` (Nov 23, 1024 MB, OOM) - Inherited: `eks-0d33-a443-43b9-45f9-04b780868880` (Nov 24, 3136 MB, Success) - Command hash: `407f6885beaec163a742e8c3c8a50d3e` - Definition: `sf-base_python-3_9-59ab1a32-cdda-4eb8-5824-49d17d96b1fd` ================================================ FILE: docs/ara-command-hash-fix-locations.md ================================================ # ARA command_hash Fix: Implementation Locations ## ✅ STATUS: IMPLEMENTED **All code changes have been completed.** This document now serves as a record of what was changed. **Changes made:** 1. ✅ Added command_hash calculation from command in `services/execution.go` 2. ✅ Removed description-based hash calculation from `flotilla/endpoints.go` (3 locations) 3. ✅ Optimized SQL query in `state/pg_queries.go` to use direct parameter 4. ✅ Updated call site in `execution/adapter/eks_adapter.go` with NULL check **Remaining work:** - ⏳ Add unit tests (see Testing Plan section) - ⏳ Deploy and monitor (see Success Criteria section) --- ## Executive Summary The `command_hash` bug required moving hash calculation from the API layer (where only description is available) to the execution service layer (where the actual command is finalized). ## Current Broken Flow ``` 1. API Layer (flotilla/endpoints.go:451-453, 514-516, 592-593) ├─ Receives execution request ├─ Sets: lr.CommandHash = MD5(description) ❌ WRONG └─ Passes to execution service 2. Execution Service (services/execution.go:320-327) ├─ Constructs final command from template/request ├─ Command is now finalized ✓ └─ But hash was already set from description ❌ 3. Database (state/pg_state_manager.go:1168) └─ Stores the wrong hash from step 1 ❌ 4. EKS Adapter (execution/adapter/eks_adapter.go:109) ├─ Final command formatting └─ Hash still wrong ❌ 5. ARA Lookup (execution/adapter/eks_adapter.go:369) └─ Uses wrong hash to query historical data ❌ ``` ## Fixed Flow ``` 1. API Layer (flotilla/endpoints.go) ├─ Receives execution request └─ Does NOT set command_hash (remove this code) ✓ 2. Execution Service (services/execution.go:359) ├─ Constructs final command ├─ Calculates: fields.CommandHash = MD5(command) ✓ NEW └─ Passes to CreateRun 3. Database (state/pg_state_manager.go:1168) └─ Stores correct hash ✓ 4. EKS Adapter (execution/adapter/eks_adapter.go:109) └─ Command already hashed correctly ✓ 5. ARA Lookup (execution/adapter/eks_adapter.go:369) └─ Uses correct hash ✓ ``` ## Code Changes Required ### 1. 
PRIMARY FIX: Add hash calculation in services/execution.go **Location:** `services/execution.go:359` (right before constructing the Run object) **Current code (lines 319-381):** ```go if *fields.Engine == state.EKSEngine { executableCmd, err := executable.GetExecutableCommand(req) if err != nil { return run, err } if (fields.Command == nil || len(*fields.Command) == 0) && (len(executableCmd) > 0) { fields.Command = aws.String(executableCmd) } executableID := executable.GetExecutableID() // ... spot/ondemand logic ... } if *fields.Engine == state.EKSSparkEngine { // ... spark setup ... } if fields.NodeLifecycle == nil { fields.NodeLifecycle = &state.SpotLifecycle } run = state.Run{ RunID: runID, // ... Command: fields.Command, CommandHash: fields.CommandHash, // ❌ Uses wrong hash from API layer // ... } ``` **New code (insert at line ~359, before `run = state.Run{...}`):** ```go if fields.NodeLifecycle == nil { fields.NodeLifecycle = &state.SpotLifecycle } // Calculate command_hash from actual command (FIX for ARA bug) // This ensures jobs with different commands have different hashes, // even if they share the same description. if fields.Command != nil && len(*fields.Command) > 0 { fields.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*fields.Command)))) } // If command is NULL/empty, command_hash remains NULL (malformed job) // Do NOT fall back to description - that was the bug we're fixing run = state.Run{ RunID: runID, // ... Command: fields.Command, CommandHash: fields.CommandHash, // ✓ Now has correct hash // ... } ``` **Why this location:** - Command is finalized (line 326 for EKS, or from request) - Before `CreateRun` is called (line 653) - Works for both EKS standard and Spark engines - No database update needed (hash set correctly from start) **Imports needed:** ```go import ( "crypto/md5" // ... existing imports ... ) ``` ### 2. CLEANUP: Remove broken hash calculation from endpoints.go **Locations to modify:** - `flotilla/endpoints.go:451-453` (CreateRunV2) - `flotilla/endpoints.go:514-516` (CreateRunV4) - `flotilla/endpoints.go:592-594` (CreateRunByAlias) **Current code (appears in 3 places):** ```go if lr.CommandHash == nil && lr.Description != nil { lr.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*lr.Description)))) } ``` **Action:** **REMOVED these 3 blocks entirely** ✅ COMPLETED **Rationale:** - This was the source of the bug (hashing description instead of command) - Hash will now be calculated correctly in execution service - API clients already don't pass command_hash, so removal has no client impact - No fallback to description - that perpetuates the bug ### 3. OPTIMIZATION: Update SQL query to use direct parameter ✅ COMPLETED **File:** `state/pg_queries.go` **Location:** Line 64 **Changed from:** ```sql AND command_hash = (SELECT command_hash FROM task WHERE run_id = $2) ``` **Changed to:** ```sql AND command_hash = $2 ``` **Benefit:** Eliminates unnecessary subquery, improves performance ### 4. OPTIMIZATION: Update call site to pass command_hash ✅ COMPLETED **File:** `execution/adapter/eks_adapter.go` **Location:** Lines 368-422 (in `adaptiveResources` function) **Changed from:** ```go if !isGPUJob { estimatedResources, err := manager.EstimateRunResources(ctx, *executable.GetExecutableID(), run.RunID) if err == nil { // ARA found historical data... 
} else { // No historical data available _ = metrics.Increment(metrics.EngineEKSARANoHistoricalData, metricTags, 1) } } ``` **Changed to:** ```go if !isGPUJob { // Only attempt ARA if we have a command hash if run.CommandHash == nil { // Command hash is NULL - job has no command (malformed job definition) _ = metrics.Increment(metrics.EngineEKSARANullCommandHash, metricTags, 1) _ = a.logger.Log( "level", "warn", "message", "Skipping ARA - NULL command_hash", "reason", "Job has no command (malformed definition)", "run_id", run.RunID, "definition_id", *executable.GetExecutableID(), ) } else { estimatedResources, err := manager.EstimateRunResources(ctx, *executable.GetExecutableID(), *run.CommandHash) if err == nil { // ARA found historical data... } else { // No historical data available _ = metrics.Increment(metrics.EngineEKSARANoHistoricalData, metricTags, 1) } } } ``` **Changes:** - Added NULL check for `run.CommandHash` - Pass `*run.CommandHash` instead of `run.RunID` - Added specific metric and logging for NULL case **Note:** The metric `metrics.EngineEKSARANullCommandHash` may need to be added to the metrics package. ### 5. OPTIONAL: Add validation/logging **Location:** `state/pg_state_manager.go:1168` (CreateRun, where command_hash is stored) **Add validation before insert:** ```go // Validate that command_hash matches command (helps catch bugs) if r.Command != nil && r.CommandHash != nil { expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(*r.Command))) if expectedHash != *r.CommandHash { // Log mismatch but don't fail (for observability) flotillaLog.Log( "message", "WARNING: command_hash mismatch", "run_id", r.RunID, "expected_hash", expectedHash, "actual_hash", *r.CommandHash, ) } } ``` ## Migration Considerations ### Clean Break (Recommended) Since current command_hash values are incorrect, the best approach is: 1. **Deploy the fix** - All new runs get correct hash 2. **Accept loss of history** - New hashes won't match old hashes 3. **Monitor ARA metrics** - Use instrumentation to verify behavior 4. **Expect initial spike** - `ara.no_historical_data` metric will increase temporarily **Why this is OK:** - Current ARA data is contaminated anyway - Better to start fresh with correct data - New instrumentation will help monitor the recovery ### Alternative: Dual-Hash Lookup (NOT IMPLEMENTED) **Decision:** We chose the clean break approach. No dual-hash lookup was implemented. **Reason:** The historical data is contaminated and would perpetuate the bug. Starting fresh with correct hashing is the right approach. 
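To see what the clean break buys, here is a minimal, standalone Go sketch (not part of the codebase) of the hashing behavior the fix introduces: the hash is derived from the final command alone, so runs that share a description but differ in command no longer collide.

```go
package main

import (
	"crypto/md5"
	"fmt"
)

// commandHash mirrors the fix in services/execution.go:
// MD5 of the final command, never of the description.
func commandHash(command string) string {
	return fmt.Sprintf("%x", md5.Sum([]byte(command)))
}

func main() {
	// Same description, different commands (the contaminating case).
	a := commandHash("python3 calibrate.py --as_of 20251122")
	b := commandHash("python3 calibrate.py --as_of 20251123")
	fmt.Println(a == b) // false: each date now gets its own ARA history
}
```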
## Testing Plan ### Unit Tests **Location:** `services/execution_test.go` ```go func TestCommandHashCalculatedFromCommand(t *testing.T) { // Test that command_hash is MD5 of command, not description req := &state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: aws.String("python script.py --arg value"), Description: aws.String("Different description"), }, } run, err := executionService.constructBaseRunFromExecutable(ctx, definition, req) expectedHash := fmt.Sprintf("%x", md5.Sum([]byte("python script.py --arg value"))) assert.Equal(t, expectedHash, *run.CommandHash) assert.NotEqual(t, fmt.Sprintf("%x", md5.Sum([]byte("Different description"))), *run.CommandHash) } func TestCommandHashWithSameDescriptionDifferentCommands(t *testing.T) { // Test that different commands get different hashes even with same description description := "Daily processing job" req1 := &state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: aws.String("python process.py --date 2025-01-01"), Description: aws.String(description), }, } req2 := &state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: aws.String("python process.py --date 2025-01-02"), Description: aws.String(description), }, } run1, _ := executionService.constructBaseRunFromExecutable(ctx, definition, req1) run2, _ := executionService.constructBaseRunFromExecutable(ctx, definition, req2) assert.NotEqual(t, run1.CommandHash, run2.CommandHash, "Different commands should have different hashes even with same description") } ``` ### Integration Tests **Verify end-to-end:** 1. Submit two runs with: - Same description - Different commands (e.g., different dates) 2. Check database: ```sql SELECT command, command_hash, description FROM task WHERE run_id IN ('run1', 'run2'); ``` 3. Verify: - Different commands → different hashes ✓ - Same description ✓ - Hashes are MD5 of commands ✓ ### Production Verification **After deployment, monitor:** 1. **New runs have non-NULL hash:** ```sql SELECT COUNT(*) FROM task WHERE queued_at > NOW() - INTERVAL '1 hour' AND command_hash IS NULL AND command IS NOT NULL; ``` Should be 0. 2. **Hash matches command:** ```sql SELECT run_id, command, command_hash, MD5(command) as expected_hash FROM task WHERE queued_at > NOW() - INTERVAL '1 hour' LIMIT 100; ``` Verify `command_hash = expected_hash`. 3. **ARA metrics (from instrumentation):** - `ara.no_historical_data` - will spike initially (expected) - `ara.resource_adjustment` - should stabilize over 3-7 days - `ara.hit_max_memory` - should decrease for over-provisioned jobs ## Rollback Plan If the fix causes issues: 1. **Quick rollback:** Revert the code changes and redeploy 2. **Data is safe:** Database schema unchanged, no migrations needed 3. 
**Monitoring:** New instrumentation continues to work regardless ## Summary of Changes Made | File | Lines | Action | Status | |------|-------|--------|--------| | `services/execution.go` | 5 | **ADD** crypto/md5 import | ✅ COMPLETED | | `services/execution.go` | 361-368 | **ADD** command_hash calculation | ✅ COMPLETED | | `flotilla/endpoints.go` | 451-453 | **REMOVE** description-based hash | ✅ COMPLETED | | `flotilla/endpoints.go` | 510-512 | **REMOVE** description-based hash | ✅ COMPLETED | | `flotilla/endpoints.go` | 584-586 | **REMOVE** description-based hash | ✅ COMPLETED | | `state/pg_queries.go` | 64 | **MODIFY** Remove subquery, use $2 directly | ✅ COMPLETED | | `execution/adapter/eks_adapter.go` | 369-422 | **ADD** NULL check and pass *run.CommandHash | ✅ COMPLETED | | `services/execution_test.go` | New | **ADD** unit tests (TODO) | ⏳ PENDING | ## Timeline Estimate - Code changes: 30 minutes - Unit tests: 1 hour - Integration testing: 2 hours - Deployment: Standard release process - Monitoring period: 3-7 days for ARA to stabilize ## Success Criteria 1. ✓ All new runs have `command_hash = MD5(command)` 2. ✓ Different commands have different hashes 3. ✓ Zero NULL command_hash for new runs (except truly NULL commands) 4. ✓ ARA metrics stabilize within 7 days 5. ✓ OOM rates decrease for previously over-provisioned jobs ================================================ FILE: docs/ara-command-hash-history.md ================================================ # History of command_hash Implementation ## Timeline of Changes ### January 17, 2020 - Original Design (Commit a5d7e0f) **Author:** Ujjwal Sarin **PR:** #269 **Title:** "Adding command hash to task" **What was added:** 1. `command_hash` column added to `task` table 2. Changed ARA query from matching exact `command` text to `command_hash` 3. **Database automatically calculated hash:** `MD5($17)` where `$17` is the command parameter **Original CreateRun SQL:** ```sql INSERT INTO task ( ..., command, ..., command_hash ) VALUES ( ..., $17, ..., MD5($17) ); ``` **Original UpdateRun SQL:** ```sql UPDATE task SET command = $17, ..., command_hash = MD5($17) WHERE run_id = $1; ``` **Intent:** Hash was calculated FROM THE COMMAND to group similar jobs for ARA resource estimation. **Original Query Change:** ```sql -- BEFORE: Match exact command text WHERE command = (SELECT command FROM TASK WHERE run_id = $2) -- AFTER: Match command hash WHERE command_hash = (SELECT command_hash FROM task WHERE run_id = $2) ``` ### January 22, 2020 - Removed Auto-Hashing from UpdateRun (Commit fbe8409) **Author:** Ujjwal Sarin **Title:** "removing adding command_hash on updates" **What changed:** - Removed `command_hash = MD5($17)` from UpdateRun SQL - Left CreateRun unchanged (still had MD5 calculation) **Why this matters:** This suggests the design started shifting toward setting command_hash earlier in the flow, not in the database. ### December 31, 2021 - API Layer Auto-Generation from Description (Commit 7802cfe) **Author:** Ujjwal Sarin **Commit message:** "encode lr" **What was added:** ```go // In flotilla/endpoints.go - CreateRunV2, CreateRunV4, CreateRunByAlias if lr.CommandHash == nil && lr.Description != nil { lr.CommandHash = aws.String(hex.EncodeToString([]byte(*lr.Description))) } ``` **THE BUG INTRODUCED:** Changed from hashing the command to hashing the description. **Why description was used:** At the API layer (endpoints.go), the final command isn't constructed yet. 
The command gets finalized later during job submission in the execution layer. **Context:** This commit was for Spark executor estimation feature (see below). ### December 31, 2021 - Same Day: Changed to MD5 (Commit 7e84338) **Author:** Ujjwal Sarin **Title:** "adding support for predicting executor" **What changed:** ```go // Changed from hex encoding to MD5 (same day, 2 hours later) if lr.CommandHash == nil && lr.Description != nil { lr.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*lr.Description)))) } ``` **What was added:** Spark executor count estimation using command_hash: ```go // execution/engine/emr_engine.go func (emr *EMRExecutionEngine) estimateExecutorCount(run state.Run, manager state.Manager) *int64 { if run.Engine != nil && *run.Engine == state.EKSSparkEngine { count, err := manager.EstimateExecutorCount(run.DefinitionID, *run.CommandHash) if err == nil { return aws.Int64(count) } } return aws.Int64(100) } ``` **New Query Added:** ```sql const TaskResourcesExecutorCountSQL = ` SELECT COALESCE(cast((percentile_disc(0.99) within GROUP (ORDER BY A.executor_count)) * 1.75 as int), 100) FROM (SELECT CASE WHEN (exit_reason like '%Exception%') THEN spark_extension->'num_executors' END FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' AND engine = 'eks-spark' AND definition_id = $1 AND command_hash = $2 AND (exit_code != 0) LIMIT 30) A ` ``` **Significance:** This shows command_hash was being used for TWO features: 1. ARA memory/CPU estimation (original, Jan 2020) 2. Spark executor count estimation (new, Dec 2021) Both rely on grouping similar jobs, but the Dec 2021 implementation broke this by hashing description instead of command. ## Current State (2025) ### API Layer (flotilla/endpoints.go) ```go // Lines 451-453, 514-516, 592-593 if lr.CommandHash == nil && lr.Description != nil { lr.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*lr.Description)))) } ``` **Problem:** Hashes description, not command. ### Database Layer (state/pg_state_manager.go) ```go // CreateRun - Line 1168 r.CommandHash // Just uses whatever was passed in, no calculation ``` **Problem:** No fallback calculation. If API layer provides wrong hash, database accepts it. ### API Schema (state/models.go) ```go // LaunchRequestV2 - Line 1235 type LaunchRequestV2 struct { Command *string `json:"command,omitempty"` Description *string `json:"description,omitempty"` CommandHash *string `json:"command_hash,omitempty"` // ... } ``` **Observation:** `command_hash` IS exposed as an optional API field, but: 1. Clients rarely/never pass it explicitly 2. API layer auto-generates from description as fallback 3. This means nearly all command_hash values in production are MD5(description) ## Root Cause Analysis ### The Design Disconnect **Layer 1 - API (endpoints.go):** - Receives execution request - Command might not be finalized yet - Needs to set command_hash for downstream use - Only has description available - **Decision:** Hash description as proxy for command **Layer 2 - Execution (execution/adapter/eks_adapter.go):** - Constructs final command from template + parameters - Command is now known - But command_hash was already set in Layer 1 - **Missing:** No code to recalculate hash from actual command **Layer 3 - Database (state/pg_state_manager.go):** - Just stores whatever command_hash was provided - No validation that hash matches command - **Assumption:** Hash was calculated correctly upstream ### Why This Wasn't Caught 1. 
**Description often stable:** Many jobs use the same description repeatedly 2. **Worked for simple cases:** Jobs with truly identical descriptions often have identical commands 3. **Gradual degradation:** As users started parameterizing commands (dates, configs), descriptions stayed the same but commands diverged 4. **No monitoring:** Until the recent instrumentation patches, there was no visibility into ARA behavior ## Evidence from Production ### NULL command_hash - **21,357 runs** with NULL command_hash (description also NULL) - These runs get NO ARA benefit despite feature being enabled ### Cross-Command Contamination - **Worst case:** 176 different commands sharing one command_hash - **High-volume case:** 87,142 runs across 2 different commands - **ML Training catastrophe:** 12 different training configs all sharing 350GB allocation ### The Smoking Gun From docs/ara-command-hash-bug-report.md: **Daily jobs differing only by date:** ```bash # Nov 23 OOM python3 calibrate.py --as_of 20251122 # Nov 24 (inherited ARA from above) python3 calibrate.py --as_of 20251123 ``` Both have description "Calibrate Psale Prod / Calibrate Psale" → Same command_hash → Share ARA data → Nov 24 job gets 3136 MB from Nov 23 OOM **The data being processed is completely different** (different dates), but they share resource allocation history. ## The Original Intent vs Reality ### Original Intent (Jan 2020) - Jobs running the **same command** share ARA data - Different commands have separate ARA histories - Performance optimization: hash instead of full text comparison ### Current Reality (Dec 2021 - Present) - Jobs with the **same description** share ARA data - Commands can be completely different - Leads to incorrect resource allocation ## Why Description Was Chosen Looking at the code flow: 1. API receives execution request (`flotilla/endpoints.go`) - Has: description (optional), command template - Needs: command_hash for ARA lookup 2. Command construction happens later (`execution/adapter/eks_adapter.go`) - Combines template + env vars + parameters - Final command not available at API layer 3. Timing problem: - `command_hash` needed before `adaptiveResources()` call - `command` not finalized until job construction - Description available early, command available late **The Compromise:** Use description as a "proxy" for command. **Why it seemed reasonable:** - Description often correlates with command - Better than nothing for grouping similar jobs - Performance: avoid expensive string operations on long commands **Why it fails:** - Parameterized commands (dates, configs, data subsets) - Description captures "what" but not "how" - Catastrophic cross-contamination at scale ## Related Queries ### Original ARA Query (pre-Jan 2020) ```sql -- Before command_hash WHERE command = (SELECT command FROM TASK WHERE run_id = $2) ``` ### Current ARA Query (Jan 2020 - Present) ```sql -- Using command_hash WHERE command_hash = (SELECT command_hash FROM task WHERE run_id = $2) ``` **Irony:** The query change was meant to make ARA more efficient, but combined with description-based hashing, it made it incorrect. ## Conclusion The bug wasn't a single mistake but an **architectural mismatch**: 1. **2020:** Designed command_hash to group identical commands 2. **2021:** Needed to set hash early in request flow 3. **2021:** Command not available early, description chosen as proxy 4.
**2021-2025:** Production usage reveals proxy doesn't work at scale The fix requires moving command_hash calculation to **after** command is finalized, or making command available earlier in the flow. ## References - **Original feature:** Commit a5d7e0f (Jan 17, 2020) - **Auto-hash removal:** Commit fbe8409 (Jan 22, 2020) - **Bug introduction:** Commit 7802cfe (Dec 31, 2021) - **MD5 change:** Commit 7e84338 (Dec 31, 2021) - **ARA enablement:** Commit 4c0ffc8 (Feb 23, 2022) - **Bug documentation:** docs/ara-command-hash-bug-report.md (Nov 25, 2025) ================================================ FILE: docs/ara-instrumentation.md ================================================ # ARA Instrumentation Guide ## Overview This document describes the instrumentation added to measure Auto Resource Adjustment (ARA) behavior in Flotilla. The goal is to understand how often ARA causes resource growth and identify potential over-provisioning, particularly when jobs repeatedly hit maximum resource limits (~300GB memory). ## Background: How ARA Works ### What is ARA? Auto Resource Adjustment (ARA) is a feature that automatically adjusts CPU and memory resources for Kubernetes jobs based on historical usage data from previous runs that experienced Out-Of-Memory (OOM) failures. ### Historical Context 1. **Initial Implementation (~2020):** ARA was introduced as an optional feature controlled by the `adaptive_resource_allocation` field on task definitions 2. **Global Override (Jan 2020):** Added `eks.ara_enabled` config parameter for global control 3. **Always Enabled (Mar 2022, commit 6eb44086):** ARA was hardcoded to always be enabled in `execution/engine/eks_engine.go:70` - All jobs now run with ARA regardless of configuration - The toggle was removed ### ARA Algorithm **Location:** `execution/adapter/eks_adapter.go:adaptiveResources()` **Process:** 1. Job starts with default resources from task definition 2. ARA queries historical data via `EstimateRunResources()` in `state/pg_state_manager.go` 3. SQL query (`state/pg_queries.go:TaskResourcesSelectCommandSQL`) looks for: - Jobs from the same definition with matching command hash - That OOM'd (exit_code=137 or exit_reason='OOMKilled') - Within the last 3 days - Up to 30 most recent runs 4. Calculates P99 (99th percentile) of resource usage and applies multipliers: - **Memory:** P99 max memory × **1.75** - **CPU:** P99 max CPU × **1.25** 5. Ensures request ≤ limit, applies bounds checking **Resource Limits:** - Min CPU: 256 millicores - Max CPU: 60,000 millicores (94,000 for GPU jobs) - Min Memory: 512 MB - Max Memory: **350,000 MB** (~341 GB) for standard jobs (376,000 MB for GPU) ### Why Jobs Grow to ~300GB The 1.75x multiplier compounds with each OOM: 1. Job runs with 10GB → OOMs 2. Next run: 10GB × 1.75 = 17.5GB → OOMs 3. Next run: 17.5GB × 1.75 = 30.6GB → OOMs 4. Pattern continues: 30.6GB → 53.5GB → 93.6GB → 163GB → 285GB → **350GB limit hit** Each OOM triggers exponential growth until the maximum limit is reached. ## Instrumentation Added ### Metrics (DataDog) All metrics use low-cardinality tags (`cluster` only) to avoid excessive volume. 
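For reference, a sketch of how that tag set is built, adapted from the `adaptiveResources` code in `execution/adapter/eks_adapter.go` (presented here as a standalone helper for illustration):

```go
// metricTags returns the low-cardinality tag set attached to every ARA metric.
// Only engine and cluster are tagged; run_id or definition_id tags would
// explode metric cardinality in DataDog.
func metricTags(clusterName string) []string {
	tags := []string{"engine:eks"}
	if clusterName != "" {
		tags = append(tags, fmt.Sprintf("cluster:%s", clusterName))
	}
	return tags
}
```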
#### Counters | Metric | Description | When to Alert | |--------|-------------|---------------| | `engine.eks.ara.resource_adjustment` | Incremented when ARA triggers resource changes | Track frequency of ARA usage | | `engine.eks.ara.estimation_attempted` | Total ARA estimation attempts | Baseline metric | | `engine.eks.ara.estimation_succeeded` | Successful ARA estimations | Success rate tracking | | `engine.eks.ara.estimation_failed` | Failed ARA estimations (errors) | Error tracking | | `engine.eks.ara.no_historical_data` | Jobs with no ARA historical data (using defaults) | Monitor new job patterns | | `engine.eks.ara.hit_max_memory` | **Jobs hitting 350GB memory limit** | **Critical: indicates over-provisioning** | | `engine.eks.ara.hit_max_cpu` | Jobs hitting CPU limit | Monitor CPU exhaustion | #### Histograms/Distributions | Metric | Description | Use Case | |--------|-------------|----------| | `engine.eks.ara.memory_increase_ratio` | Ratio of adjusted/original memory | Understand typical growth (e.g., 1.75 = 75% increase) | | `engine.eks.ara.cpu_increase_ratio` | Ratio of adjusted/original CPU | Understand CPU scaling patterns | | `engine.eks.ara.final_memory_mb` | Final memory allocated (after ARA + bounds) | Distribution of actual allocations | | `engine.eks.ara.final_cpu_millicores` | Final CPU allocated (after ARA + bounds) | Distribution of CPU allocations | | `engine.eks.ara.default_memory` | Default memory before ARA | Baseline memory distribution | | `engine.eks.ara.ara_memory` | ARA-adjusted memory | ARA memory distribution | | `engine.eks.ara.default_cpu` | Default CPU before ARA | Baseline CPU distribution | | `engine.eks.ara.ara_cpu` | ARA-adjusted CPU | ARA CPU distribution | | `engine.eks.ara.memory_increase` | Absolute memory increase (MB) | Track growth amounts | | `engine.eks.ara.cpu_increase` | Absolute CPU increase (millicores) | Track CPU growth amounts | ### Structured Logging All logs use key-value pairs compatible with standard log aggregation tools. 
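For example, the adapter emits these through the repo's `flotillaLog.Logger` interface as alternating key/value arguments (sketch based on the calls in `eks_adapter.go`; assumes `logger`, `run`, and `overage` are in scope):

```go
// Alternating key/value pairs; log aggregators index each field separately.
_ = logger.Log(
	"level", "warn",
	"message", "ARA memory allocation hit maximum limit - potential over-provisioning",
	"run_id", run.RunID,
	"cluster", run.ClusterName,
	"memory_overage_mb", overage,
)
```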
#### ARA Adjustment Logs (Info Level) **Location:** `execution/adapter/eks_adapter.go:adaptiveResources()` **When:** ARA triggers resource changes based on historical data **Fields:** ``` message: "ARA adjusted resources" definition_id: run_id: cluster: default_cpu_millicores: adjusted_cpu_millicores: cpu_ratio: default_memory_mb: adjusted_memory_mb: memory_ratio: ``` #### Limit Hit Logs (Warning Level) - CRITICAL **Location:** `execution/adapter/eks_adapter.go:checkResourceBounds()` **When:** Jobs hit maximum memory or CPU limits **Memory Limit Example:** ``` level: "warn" message: "ARA memory allocation hit maximum limit - potential over-provisioning" definition_id: run_id: cluster: default_memory_mb: requested_memory_mb: final_memory_mb: 350000 memory_overage_mb: ara_triggered: true/false ``` **CPU Limit Example:** ``` level: "warn" message: "ARA CPU allocation hit maximum limit" definition_id: run_id: cluster: default_cpu_millicores: requested_cpu_millicores: final_cpu_millicores: 60000 cpu_overage_millicores: ara_triggered: true/false ``` #### Historical Data Lookup Logs **Location:** `state/pg_state_manager.go:EstimateRunResources()` **Success:** ``` message: "ARA: Historical resource data found" definition_id: command_hash: estimated_memory_mb: estimated_cpu_millicores: ``` **No Data (Expected):** ``` message: "ARA: No historical resource data found" definition_id: command_hash: ``` **Error:** ``` level: "error" message: "ARA: Error querying historical resource data" definition_id: command_hash: error: ``` ## Using the Instrumentation ### Key Questions You Can Answer #### 1. How often does ARA trigger resource increases? **DataDog Query:** ``` sum:engine.eks.ara.resource_adjustment{*}.as_count() ``` Compare to total job submissions to get percentage. #### 2. How many jobs are hitting the ~300GB limit? ⭐ MOST IMPORTANT **DataDog Query:** ``` sum:engine.eks.ara.hit_max_memory{*}.as_count() ``` **Log Query (to identify specific jobs):** ``` message:"ARA memory allocation hit maximum limit - potential over-provisioning" ``` Group by `definition_id` to find which task definitions are affected. #### 3. What's the typical resource growth ratio? **DataDog Query:** ``` avg:engine.eks.ara.memory_increase_ratio{*} p50:engine.eks.ara.memory_increase_ratio{*} p90:engine.eks.ara.memory_increase_ratio{*} p99:engine.eks.ara.memory_increase_ratio{*} ``` A ratio of 1.75 means 75% increase, 3.0 means 200% increase, etc. #### 4. Distribution of final memory allocations **DataDog Query:** ``` avg:engine.eks.ara.final_memory_mb{*} p50:engine.eks.ara.final_memory_mb{*} p90:engine.eks.ara.final_memory_mb{*} p95:engine.eks.ara.final_memory_mb{*} p99:engine.eks.ara.final_memory_mb{*} ``` Shows the actual memory being allocated across all jobs. #### 5. Which specific definitions are over-provisioning? **Log Filter:** ``` message:"potential over-provisioning" ``` Extract `definition_id` and `memory_overage_mb` to prioritize which jobs need attention. ### Recommended Alerts #### Critical: Excessive Memory Limit Hits **Metric:** `engine.eks.ara.hit_max_memory` **Threshold:** Alert if > 10 hits per hour **Why:** Indicates jobs are repeatedly hitting the 350GB limit, suggesting either: - Jobs genuinely need more than 350GB (need larger instances) - ARA is over-provisioning (need to adjust multipliers) #### High CPU Limit Hits **Metric:** `engine.eks.ara.hit_max_cpu` **Threshold:** Alert if > 5 hits per hour **Why:** CPU exhaustion can cause job failures or slowdowns. 
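When tuning these thresholds, keep in mind how quickly the 1.75x multiplier compounds (see "Why Jobs Grow to ~300GB" above). A tiny standalone sketch that counts the OOM cycles from a 10 GB default to the 350 GB cap:

```go
package main

import "fmt"

func main() {
	const maxMemMB = 350000.0 // state.MaxMem for standard jobs
	mem := 10000.0            // 10 GB starting allocation
	cycles := 0
	for mem*1.75 < maxMemMB {
		mem *= 1.75 // each OOM triggers a 1.75x memory bump
		cycles++
	}
	// Prints: 6 cycles to reach 287229 MB; the next bump hits the cap.
	fmt.Printf("%d cycles to reach %.0f MB; the next bump hits the cap.\n", cycles, mem)
}
```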
### Investigation Workflow When you see high `engine.eks.ara.hit_max_memory` counts: 1. **Identify affected definitions:** ``` Log filter: message:"potential over-provisioning" Group by: definition_id Sort by: count ``` 2. **Analyze a specific definition:** ``` Filter: definition_id:"" AND message:"ARA" Look for patterns: - How much overage? (memory_overage_mb) - What was the original default? (default_memory_mb) - Growth ratio? (memory_ratio) ``` 3. **Check job success rate:** - Are these jobs actually succeeding despite hitting the limit? - Or are they still OOM'ing even at max resources? 4. **Decide on action:** - If jobs succeed at max limit: Likely over-provisioning, consider: - Reducing ARA multiplier from 1.75x to 1.5x or 1.25x - Making ARA configurable per-definition again - Setting reasonable max limits per definition type - If jobs fail even at max limit: Jobs legitimately need more resources: - Increase max memory limit - Use larger instance types - Optimize job code to use less memory ## Code Locations ### Metrics Constants - File: `clients/metrics/metrics.go` - Lines: 51-59 ### Main Instrumentation - File: `execution/adapter/eks_adapter.go` - Functions: `adaptiveResources()`, `checkResourceBounds()` - Lines: 352-492 ### Historical Data Logging - File: `state/pg_state_manager.go` - Function: `EstimateRunResources()` - Lines: 118-162 ### ARA SQL Query - File: `state/pg_queries.go` - Constant: `TaskResourcesSelectCommandSQL` - Lines: 54-66 ## Future Improvements Based on instrumentation data, consider: 1. **Make ARA configurable again** - Restore per-definition or global toggles for A/B testing 2. **Adjust multipliers** - If 1.75x is too aggressive, reduce to 1.5x or 1.25x 3. **Per-definition limits** - Set different max memory based on job type 4. **Graduated multipliers** - Use smaller multipliers as resources grow (e.g., 1.75x up to 50GB, then 1.25x) 5. **Decay historical data** - Weight recent OOMs more than old ones 6. **Track actual usage vs allocation** - Compare requested resources to what jobs actually use ## Related Documentation - ARA Feature Documentation: `docs/ara.md` - State Models: `state/models.go` - Resource Queries: `state/pg_queries.go` - Main CLAUDE.md: Project overview and development guide ================================================ FILE: docs/ara.md ================================================ *Adaptive Resource Allocation for Kubernetes Pods* At StitchFix we empower our data scientists to deploy their models and applications end to end without needing engineering skills. To facilitate batch processing we use Flotilla, a task execution service. Flotilla can run jobs on top of Kubernetes or AWS ECS. One of the problems we faced was deciding how much CPU and memory to assign to the container pods. The workloads vary widely in their demands. If we give too few resources, jobs may run slower or, in the pathological case, run out of memory. If we give too much, we waste resources and starve other jobs that could otherwise be scheduled alongside. Solution The first step was to accurately record the utilization of the resources per pod. We looked at a few different monitoring solutions (kube-state-metrics, Prometheus, and metrics-server). We decided to use the metrics-server since it provided a simple API and tracked the state of the pods in memory.
``` helm install --name=metrics-server --namespace=kube-system --set args={'--metric-resolution=1s'} stable/metrics-server ``` To instrument fetching the pod metrics, we used the metrics ClientSet. While the job is running, Flotilla fetches the metrics every 2-5 seconds. If the previously recorded memory and CPU values are lower than what the Metrics Server reports, the higher of the two is recorded back with the job metadata. Also, an MD5 checksum of the command and its arguments is stored in the database. This becomes a signature of the job and its resources. The core [query for ARA](https://github.com/stitchfix/flotilla-os/blob/master/state/pg_queries.go#L53-L66) and the associated [adapter code](https://github.com/stitchfix/flotilla-os/blob/master/execution/adapter/eks_adapter.go#L269-L301) implement this lookup. ================================================ FILE: exceptions/errors.go ================================================ package exceptions // // MalformedInput describes malformed or otherwise incorrect input // type MalformedInput struct { ErrorString string } func (e MalformedInput) Error() string { return e.ErrorString } // // ConflictingResource describes a conflict case: // eg. definition already exists, reserved fields // type ConflictingResource struct { ErrorString string } func (e ConflictingResource) Error() string { return e.ErrorString } // // MissingResource describes the case where a resource does not exist // eg. missing definition or run or no image found // type MissingResource struct { ErrorString string } func (e MissingResource) Error() string { return e.ErrorString } ================================================ FILE: execution/adapter/eks_adapter.go ================================================ package adapter import ( "context" "errors" "fmt" "os" "regexp" "strings" "time" "github.com/aws/aws-sdk-go/aws" "github.com/stitchfix/flotilla-os/clients/metrics" "github.com/stitchfix/flotilla-os/exceptions" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/state" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) type EKSAdapter interface { AdaptJobToFlotillaRun(job *batchv1.Job, run state.Run, pod *corev1.Pod) (state.Run, error) AdaptFlotillaDefinitionAndRunToJob(ctx context.Context, executable state.Executable, run state.Run, schedulerName string, manager state.Manager, araEnabled bool) (batchv1.Job, error) } type eksAdapter struct { logger flotillaLog.Logger } // NewEKSAdapter configures and returns an eks adapter for translating // from EKS api specific objects to our representation func NewEKSAdapter(logger flotillaLog.Logger) (EKSAdapter, error) { adapter := eksAdapter{logger: logger} return &adapter, nil } // Adapting Kubernetes batch/v1 job to a Flotilla run object. // This method maps the exit code & timestamps from Kubernetes to Flotilla's Run object.
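// On failure the exit code and reason are taken from the last container status when available, defaulting to exit code 1; on success with no pod information, the exit code defaults to 0.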
func (a *eksAdapter) AdaptJobToFlotillaRun(job *batchv1.Job, run state.Run, pod *corev1.Pod) (state.Run, error) { updated := run if job.Status.Active == 1 && job.Status.CompletionTime == nil { updated.Status = state.StatusRunning } else if job.Status.Succeeded == 1 { if pod != nil { if pod.Status.Phase == corev1.PodSucceeded { var exitCode int64 = 0 var exitReason = fmt.Sprintf("Pod %s Exited Successfully", pod.Name) updated.ExitReason = &exitReason updated.Status = state.StatusStopped updated.ExitCode = &exitCode } } else { var exitCode int64 = 0 updated.Status = state.StatusStopped updated.ExitCode = &exitCode } } else if job.Status.Failed == 1 { var exitCode int64 = 1 updated.Status = state.StatusStopped if pod != nil { if pod.Status.ContainerStatuses != nil && len(pod.Status.ContainerStatuses) > 0 { containerStatus := pod.Status.ContainerStatuses[len(pod.Status.ContainerStatuses)-1] if containerStatus.State.Terminated != nil { updated.ExitReason = &containerStatus.State.Terminated.Reason exitCode = int64(containerStatus.State.Terminated.ExitCode) } } } updated.ExitCode = &exitCode } if pod != nil && len(pod.Spec.Containers) > 0 { container := pod.Spec.Containers[0] //First three lines are injected by Flotilla, strip those out. if len(container.Command) > 3 { cmd := strings.Join(container.Command[3:], "\n") updated.Command = &cmd } } if job != nil && job.Status.StartTime != nil { updated.StartedAt = &job.Status.StartTime.Time } if updated.Status == state.StatusStopped { if job != nil && job.Status.CompletionTime != nil { updated.FinishedAt = &job.Status.CompletionTime.Time } else { finishedAt := time.Now() updated.FinishedAt = &finishedAt } } return updated, nil } // Adapting Flotilla run object to Kubernetes batch/v1 job. // 1. Construction of the cmd that will be run. // 2. Resources associated to a pod (includes Adaptive Resource Allocation) // 3. Environment variables to be setup. // 4. Port mappings. // 5. Node lifecycle. // 6. 
Node affinity and anti-affinity func (a *eksAdapter) AdaptFlotillaDefinitionAndRunToJob(ctx context.Context, executable state.Executable, run state.Run, schedulerName string, manager state.Manager, araEnabled bool) (batchv1.Job, error) { cmd := "" if run.Command != nil && len(*run.Command) > 0 { cmd = *run.Command } cmdSlice := a.constructCmdSlice(cmd) cmd = strings.Join(cmdSlice[3:], "\n") run.Command = &cmd resourceRequirements, run := a.constructResourceRequirements(ctx, executable, run, manager, araEnabled) volumeMounts, volumes := a.constructVolumeMounts(ctx, executable, run, manager, araEnabled) container := corev1.Container{ Name: run.RunID, Image: run.Image, Command: cmdSlice, Resources: resourceRequirements, Env: a.envOverrides(executable, run), Ports: a.constructContainerPorts(executable), ImagePullPolicy: corev1.PullAlways, } if volumeMounts != nil { container.VolumeMounts = volumeMounts } affinity := a.constructAffinity(ctx, executable, run, manager) tolerations := a.constructTolerations(executable, run) annotations := map[string]string{} annotations["prometheus.io/port"] = "9090" annotations["prometheus.io/scrape"] = "true" labels := state.GetLabels(run) jobSpec := batchv1.JobSpec{ TTLSecondsAfterFinished: &state.TTLSecondsAfterFinished, ActiveDeadlineSeconds: run.ActiveDeadlineSeconds, BackoffLimit: &state.EKSBackoffLimit, Template: corev1.PodTemplateSpec{ ObjectMeta: v1.ObjectMeta{ Annotations: annotations, Labels: labels, }, Spec: corev1.PodSpec{ SchedulerName: schedulerName, Containers: []corev1.Container{container}, RestartPolicy: corev1.RestartPolicyNever, ServiceAccountName: *run.ServiceAccount, Affinity: affinity, Tolerations: tolerations, }, }, } if volumes != nil { jobSpec.Template.Spec.Volumes = volumes } eksJob := batchv1.Job{ Spec: jobSpec, ObjectMeta: v1.ObjectMeta{ Name: run.RunID, }, } return eksJob, nil } func (a *eksAdapter) constructEviction(ctx context.Context, run state.Run, manager state.Manager) string { if run.Gpu != nil && *run.Gpu > 0 { return "false" } if run.NodeLifecycle != nil && *run.NodeLifecycle == state.OndemandLifecycle { return "false" } if run.CommandHash != nil { nodeType, err := manager.GetNodeLifecycle(ctx, run.DefinitionID, *run.CommandHash) if err == nil && nodeType == state.OndemandLifecycle { return "false" } } return "true" } func (a *eksAdapter) constructContainerPorts(executable state.Executable) []corev1.ContainerPort { var containerPorts []corev1.ContainerPort executableResources := executable.GetExecutableResources() if executableResources.Ports != nil && len(*executableResources.Ports) > 0 { for _, port := range *executableResources.Ports { containerPorts = append(containerPorts, corev1.ContainerPort{ ContainerPort: int32(port), }) } } return containerPorts } func (a *eksAdapter) constructTolerations(executable state.Executable, run state.Run) []corev1.Toleration { executableResources := executable.GetExecutableResources() tolerations := []corev1.Toleration{} isGPU := (executableResources.Gpu != nil && *executableResources.Gpu > 0) || (run.Gpu != nil && *run.Gpu > 0) if isGPU { tolerations = append(tolerations, corev1.Toleration{ Key: "nvidia.com/gpu", Operator: "Equal", Value: "true", Effect: "NoSchedule", }) } isWaitForData := run.Labels["kube_task_type"] == "wait_for_data" if team, ok := run.Labels["team"]; ok && team != "" && !isGPU && !isWaitForData { tolerations = append(tolerations, corev1.Toleration{ Key: team, Operator: "Equal", Value: "true", Effect: "NoSchedule", }) } return tolerations } func (a *eksAdapter) 
constructAffinity(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) *corev1.Affinity { affinity := &corev1.Affinity{} var requiredMatch []corev1.NodeSelectorRequirement var preferredMatches []corev1.PreferredSchedulingTerm //todo move to config nodeLifecycleKey := "karpenter.sh/capacity-type" nodeArchKey := "kubernetes.io/arch" var nodeLifecycle []string if run.NodeLifecycle != nil && *run.NodeLifecycle == state.OndemandLifecycle { nodeLifecycle = append(nodeLifecycle, "on-demand") } else { nodeLifecycle = append(nodeLifecycle, "spot", "on-demand") } //todo move to config arch := []string{"amd64"} if run.Arch != nil && *run.Arch == "arm64" { arch = []string{"arm64"} } requiredMatch = append(requiredMatch, corev1.NodeSelectorRequirement{ Key: nodeLifecycleKey, Operator: corev1.NodeSelectorOpIn, Values: nodeLifecycle, }) requiredMatch = append(requiredMatch, corev1.NodeSelectorRequirement{ Key: nodeArchKey, Operator: corev1.NodeSelectorOpIn, Values: arch, }) executableResources := executable.GetExecutableResources() isGPU := (run.Gpu != nil && *run.Gpu > 0) || (executableResources.Gpu != nil && *executableResources.Gpu > 0) isWaitForData := run.Labels["kube_task_type"] == "wait_for_data" if team, ok := run.Labels["team"]; ok && team != "" && !isGPU && !isWaitForData { requiredMatch = append(requiredMatch, corev1.NodeSelectorRequirement{ Key: "team", Operator: corev1.NodeSelectorOpIn, Values: []string{team}, }) if env := os.Getenv("FLOTILLA_MODE"); env != "" { requiredMatch = append(requiredMatch, corev1.NodeSelectorRequirement{ Key: "environment", Operator: corev1.NodeSelectorOpIn, Values: []string{env}, }) } } affinity = &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ NodeSelectorTerms: []corev1.NodeSelectorTerm{ { MatchExpressions: requiredMatch, }, }, }, PreferredDuringSchedulingIgnoredDuringExecution: preferredMatches, }, } return affinity } func (a *eksAdapter) constructResourceRequirements(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager, araEnabled bool) (corev1.ResourceRequirements, state.Run) { var ephemeralStorageRequestQuantity resource.Quantity maxEphemeralStorage := state.MaxEphemeralStorage limits := make(corev1.ResourceList) requests := make(corev1.ResourceList) cpuLimit, memLimit, cpuRequest, memRequest := a.adaptiveResources(ctx, executable, run, manager, araEnabled) // Round CPU values to avoid systemd cgroup rounding issues. 
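// Both limit and request are snapped to the nearest quarter core (250m), so the resulting cgroup CPU percentage is always a whole multiple of 25%.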
cpuLimit = a.roundCPUMillicores(cpuLimit) cpuRequest = a.roundCPUMillicores(cpuRequest) cpuLimitQuantity := resource.MustParse(fmt.Sprintf("%dm", cpuLimit)) cpuRequestQuantity := resource.MustParse(fmt.Sprintf("%dm", cpuRequest)) memLimitQuantity := resource.MustParse(fmt.Sprintf("%dM", memLimit)) memRequestQuantity := resource.MustParse(fmt.Sprintf("%dM", memRequest)) limits[corev1.ResourceCPU] = cpuLimitQuantity limits[corev1.ResourceMemory] = memLimitQuantity requests[corev1.ResourceCPU] = cpuRequestQuantity requests[corev1.ResourceMemory] = memRequestQuantity executableResources := executable.GetExecutableResources() if run.Gpu != nil && *run.Gpu > 0 { limits["nvidia.com/gpu"] = resource.MustParse(fmt.Sprintf("%d", *run.Gpu)) requests["nvidia.com/gpu"] = resource.MustParse(fmt.Sprintf("%d", *run.Gpu)) run.NodeLifecycle = &state.OndemandLifecycle } else if executableResources.Gpu != nil && *executableResources.Gpu > 0 { limits["nvidia.com/gpu"] = resource.MustParse(fmt.Sprintf("%d", *executableResources.Gpu)) requests["nvidia.com/gpu"] = resource.MustParse(fmt.Sprintf("%d", *executableResources.Gpu)) run.NodeLifecycle = &state.OndemandLifecycle } run.Memory = aws.Int64(memRequestQuantity.ScaledValue(resource.Mega)) run.Cpu = aws.Int64(cpuRequestQuantity.ScaledValue(resource.Milli)) run.MemoryLimit = aws.Int64(memLimitQuantity.ScaledValue(resource.Mega)) run.CpuLimit = aws.Int64(cpuLimitQuantity.ScaledValue(resource.Milli)) if run.EphemeralStorage != nil { ephemeralStorageRequest := *run.EphemeralStorage if ephemeralStorageRequest > maxEphemeralStorage { ephemeralStorageRequest = maxEphemeralStorage } ephemeralStorageRequestQuantity = resource.MustParse(fmt.Sprintf("%dM", ephemeralStorageRequest)) requests[corev1.ResourceEphemeralStorage] = ephemeralStorageRequestQuantity run.EphemeralStorage = aws.Int64(ephemeralStorageRequestQuantity.ScaledValue(resource.Mega)) } resourceRequirements := corev1.ResourceRequirements{ Limits: limits, Requests: requests, } return resourceRequirements, run } func (a *eksAdapter) constructVolumeMounts(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager, araEnabled bool) ([]corev1.VolumeMount, []corev1.Volume) { var mounts []corev1.VolumeMount = nil var volumes []corev1.Volume = nil if run.Gpu != nil && *run.Gpu > 0 { mounts = make([]corev1.VolumeMount, 1) mounts[0] = corev1.VolumeMount{Name: "shared-memory", MountPath: "/dev/shm"} volumes = make([]corev1.Volume, 1) sharedLimit := resource.MustParse(fmt.Sprintf("%dGi", *run.Gpu*int64(8))) emptyDir := corev1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sharedLimit} volumes[0] = corev1.Volume{Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &emptyDir}} } if run.RequiresDocker { volumes = append(volumes, corev1.Volume{ Name: "dockersock", VolumeSource: corev1.VolumeSource{ HostPath: &corev1.HostPathVolumeSource{ Path: "/var/run/docker.sock", Type: nil, }, }, }) mounts = append(mounts, corev1.VolumeMount{ Name: "dockersock", MountPath: "/var/run/docker.sock", }) } return mounts, volumes } func (a *eksAdapter) adaptiveResources(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager, araEnabled bool) (int64, int64, int64, int64) { executableResources := executable.GetExecutableResources() // Check both run.Gpu (from execution request) and executableResources.Gpu (from definition) // This matches the GPU allocation logic in constructResourceRequirements (lines 300-308) isGPUJob := (run.Gpu != nil && *run.Gpu > 0) || 
(executableResources.Gpu != nil && *executableResources.Gpu > 0) cpuLimit, memLimit := a.getResourceDefaults(run, executable) cpuRequest, memRequest := a.getResourceDefaults(run, executable) // Track default resources before ARA defaultCPU := cpuRequest defaultMem := memRequest // Create tags for metrics (engine + cluster to avoid high cardinality) metricTags := []string{"engine:eks"} if run.ClusterName != "" { metricTags = append(metricTags, fmt.Sprintf("cluster:%s", run.ClusterName)) } if !isGPUJob && araEnabled { // Check if command_hash is NULL (malformed job with no command) if run.CommandHash == nil { // Command hash is NULL - skip ARA for malformed jobs _ = metrics.Increment(metrics.EngineEKSARANullCommandHash, metricTags, 1) if a.logger != nil { _ = a.logger.Log( "level", "warn", "message", "Skipping ARA - NULL command_hash", "reason", "Job has no command (malformed definition)", "run_id", run.RunID, "definition_id", *executable.GetExecutableID(), ) } } else { // Track ARA estimation attempt _ = metrics.Increment(metrics.EngineEKSARAEstimationAttempted, metricTags, 1) // Pass command_hash directly instead of run_id (optimization) estimatedResources, err := manager.EstimateRunResources(ctx, *executable.GetExecutableID(), *run.CommandHash) if err == nil { // Track successful estimation _ = metrics.Increment(metrics.EngineEKSARAEstimationSucceeded, metricTags, 1) // Extract int64 values from NullInt64 (we know they're valid because err == nil) estimatedCPU := estimatedResources.Cpu.Int64 estimatedMemory := estimatedResources.Memory.Int64 // Detect if ARA actually triggered resource changes araTriggered := (estimatedCPU != cpuRequest || estimatedMemory != memRequest) if araTriggered { // Track that ARA triggered resource adjustment _ = metrics.Increment(metrics.EngineEKSARAResourceAdjustment, metricTags, 1) // Track the magnitude of adjustment as ratios (better for understanding relative growth) if defaultMem > 0 { memoryRatio := float64(estimatedMemory) / float64(defaultMem) _ = metrics.Histogram(metrics.EngineEKSARAMemoryIncreaseRatio, memoryRatio, metricTags, 1) } if defaultCPU > 0 { cpuRatio := float64(estimatedCPU) / float64(defaultCPU) _ = metrics.Histogram(metrics.EngineEKSARACPUIncreaseRatio, cpuRatio, metricTags, 1) } // Log detailed information when ARA triggers (INFO level) if a.logger != nil { _ = a.logger.Log( "level", "info", "message", "ARA adjusted resources", "definition_id", *executable.GetExecutableID(), "run_id", run.RunID, "cluster", run.ClusterName, "default_cpu_millicores", defaultCPU, "adjusted_cpu_millicores", estimatedCPU, "cpu_ratio", float64(estimatedCPU)/float64(defaultCPU), "default_memory_mb", defaultMem, "adjusted_memory_mb", estimatedMemory, "memory_ratio", float64(estimatedMemory)/float64(defaultMem), ) } } cpuRequest = estimatedCPU memRequest = estimatedMemory // Calculate resource increases for absolute tracking cpuIncrease := cpuRequest - defaultCPU memIncrease := memRequest - defaultMem // Emit default and ARA resource distributions _ = metrics.Distribution(metrics.EngineEKSARADefaultCPU, float64(defaultCPU), metricTags, 1) _ = metrics.Distribution(metrics.EngineEKSARAARACPU, float64(cpuRequest), metricTags, 1) _ = metrics.Distribution(metrics.EngineEKSARADefaultMemory, float64(defaultMem), metricTags, 1) _ = metrics.Distribution(metrics.EngineEKSARAARAMemory, float64(memRequest), metricTags, 1) // Emit increase amounts if cpuIncrease > 0 { _ = metrics.Distribution(metrics.EngineEKSARACPUIncrease, float64(cpuIncrease), metricTags, 1) } if 
memIncrease > 0 { _ = metrics.Distribution(metrics.EngineEKSARAMemoryIncrease, float64(memIncrease), metricTags, 1) } } else { // Check if this is a missing resource error (expected for new jobs) vs a real error var missingResource exceptions.MissingResource if errors.As(err, &missingResource) { // No historical data available - this is expected for new jobs or jobs that haven't OOM'd _ = metrics.Increment(metrics.EngineEKSARANoHistoricalData, metricTags, 1) } else { // Track failed estimation (actual error) _ = metrics.Increment(metrics.EngineEKSARAEstimationFailed, metricTags, 1) } } if cpuRequest > cpuLimit { cpuLimit = cpuRequest } if memRequest > memLimit { memLimit = memRequest } } } // Check bounds - this will also emit metrics/logs for max hits cpuRequestBeforeBounds := cpuRequest memRequestBeforeBounds := memRequest cpuRequest, memRequest, maxCPUHit, maxMemHit := a.checkResourceBounds(cpuRequest, memRequest, isGPUJob, run, executable, defaultCPU, defaultMem) cpuLimit, memLimit, _, _ = a.checkResourceBounds(cpuLimit, memLimit, isGPUJob, run, executable, defaultCPU, defaultMem) // Emit final resource distributions _ = metrics.Histogram(metrics.EngineEKSARAFinalMemoryMB, float64(memRequest), metricTags, 1) _ = metrics.Histogram(metrics.EngineEKSARAFinalCPUMillicores, float64(cpuRequest), metricTags, 1) // Emit structured log when max resources hit if maxMemHit || maxCPUHit { a.emitARAMetrics(run, defaultCPU, defaultMem, cpuRequest, memRequest, cpuRequestBeforeBounds, memRequestBeforeBounds, maxCPUHit, maxMemHit) } return cpuLimit, memLimit, cpuRequest, memRequest } // emitARAMetrics logs structured information when ARA hits max resource bounds func (a *eksAdapter) emitARAMetrics(run state.Run, defaultCPU int64, defaultMem int64, finalCPU int64, finalMem int64, requestedCPU int64, requestedMem int64, maxCPUHit bool, maxMemHit bool) { if a.logger == nil { return } logFields := []interface{}{ "level", "warn", "message", "ARA resource allocation hit maximum limit", "run_id", run.RunID, "cluster", run.ClusterName, "default_cpu_millicores", defaultCPU, "default_memory_mb", defaultMem, "requested_cpu_millicores", requestedCPU, "requested_memory_mb", requestedMem, "final_cpu_millicores", finalCPU, "final_memory_mb", finalMem, "max_cpu_hit", maxCPUHit, "max_memory_hit", maxMemHit, } if run.DefinitionID != "" { logFields = append(logFields, "definition_id", run.DefinitionID) } if run.ExecutableID != nil { logFields = append(logFields, "executable_id", *run.ExecutableID) } if run.Command != nil { logFields = append(logFields, "command", *run.Command) } // Add overage information for memory (critical for 300GB issue) if maxMemHit { overage := requestedMem - finalMem logFields = append(logFields, "memory_overage_mb", overage) // Critical message for memory over-provisioning logFields[3] = "ARA memory allocation hit maximum limit - potential over-provisioning" } if maxCPUHit { overage := requestedCPU - finalCPU logFields = append(logFields, "cpu_overage_millicores", overage) } _ = a.logger.Log(logFields...) 
} // checkResourceBounds enforces resource limits and emits metrics/logs when limits are hit // Returns: adjusted CPU, adjusted memory, whether max CPU was hit, whether max memory was hit func (a *eksAdapter) checkResourceBounds(cpu int64, mem int64, isGPUJob bool, run state.Run, executable state.Executable, defaultCPU int64, defaultMem int64) (int64, int64, bool, bool) { maxMem := state.MaxMem maxCPU := state.MaxCPU if isGPUJob { maxMem = state.MaxGPUMem maxCPU = state.MaxGPUCPU } // Create tags for metrics (engine + cluster to avoid high cardinality) metricTags := []string{"engine:eks"} if run.ClusterName != "" { metricTags = append(metricTags, fmt.Sprintf("cluster:%s", run.ClusterName)) } maxCPUHit := false maxMemHit := false if cpu < state.MinCPU { cpu = state.MinCPU } if cpu > maxCPU { maxCPUHit = true // Track hitting max CPU limit _ = metrics.Increment(metrics.EngineEKSARAHitMaxCPU, metricTags, 1) cpu = maxCPU } if mem < state.MinMem { mem = state.MinMem } if mem > maxMem { maxMemHit = true // Track hitting max memory limit - THIS IS THE KEY METRIC _ = metrics.Increment(metrics.EngineEKSARAHitMaxMemory, metricTags, 1) mem = maxMem } return cpu, mem, maxCPUHit, maxMemHit } func (a *eksAdapter) getResourceDefaults(run state.Run, executable state.Executable) (int64, int64) { // 1. Init with the global defaults cpu := state.MinCPU mem := state.MinMem executableResources := executable.GetExecutableResources() // 2. Look up Run level // 3. If not at Run level check Definitions if run.Cpu != nil && *run.Cpu != 0 { cpu = *run.Cpu } else { if executableResources.Cpu != nil && *executableResources.Cpu != 0 { cpu = *executableResources.Cpu } } if run.Memory != nil && *run.Memory != 0 { mem = *run.Memory } else { if executableResources.Memory != nil && *executableResources.Memory != 0 { mem = *executableResources.Memory } } // 4. Override for very large memory requests. // Remove after migration. 
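// For non-GPU jobs requesting roughly 36-128 GB of memory, raise CPU to mem/8 millicores (the ~8:1 memory-to-CPU ratio of r5-class instances) when that exceeds the requested CPU, e.g. 65536 MB / 8 = 8192 millicores.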
if mem >= 36864 && mem < 131072 && (executableResources.Gpu == nil || *executableResources.Gpu == 0) { // using the 8x ratios between cpu and memory ~ r5 class of instances cpuOverride := mem / 8 if cpuOverride > cpu { cpu = cpuOverride } } return cpu, mem } func (a *eksAdapter) getLastRun(ctx context.Context, manager state.Manager, run state.Run) state.Run { var lastRun state.Run runList, err := manager.ListRuns(ctx, 1, 0, "started_at", "desc", map[string][]string{ "queued_at_since": { time.Now().AddDate(0, 0, -7).Format(time.RFC3339), }, "status": {state.StatusStopped}, "command": {strings.Replace(*run.Command, "'", "''", -1)}, "executable_id": {*run.ExecutableID}, }, nil, []string{state.EKSEngine}) if err == nil && len(runList.Runs) > 0 { lastRun = runList.Runs[0] } return lastRun } func (a *eksAdapter) constructCmdSlice(cmdString string) []string { bashCmd := "bash" optLogin := "-l" optStr := "-cex" return []string{bashCmd, optLogin, optStr, cmdString} } func (a *eksAdapter) envOverrides(executable state.Executable, run state.Run) []corev1.EnvVar { pairs := make(map[string]string) resources := executable.GetExecutableResources() if resources.Env != nil && len(*resources.Env) > 0 { for _, ev := range *resources.Env { name := a.sanitizeEnvVar(ev.Name) value := ev.Value pairs[name] = value } } if run.Env != nil && len(*run.Env) > 0 { for _, ev := range *run.Env { name := a.sanitizeEnvVar(ev.Name) value := ev.Value pairs[name] = value } } var res []corev1.EnvVar for key := range pairs { if len(key) > 0 { res = append(res, corev1.EnvVar{ Name: key, Value: pairs[key], }) } } return res } func (a *eksAdapter) sanitizeEnvVar(key string) string { // Environment variable can't start with a $ if strings.HasPrefix(key, "$") { key = strings.Replace(key, "$", "", 1) } // Environment variable names can't contain spaces. key = strings.Replace(key, " ", "", -1) return key } func (a *eksAdapter) sanitizeLabel(key string) string { key = strings.TrimSpace(key) key = regexp.MustCompile(`[^-a-z0-9A-Z_.]+`).ReplaceAllString(key, "_") key = strings.TrimPrefix(key, "_") key = strings.ToLower(key) if len(key) > 63 { key = key[:63] } return key } // roundCPUMillicores rounds CPU millicores to the nearest 250m (quarter core) to avoid systemd cgroup rounding issues. 
When CPU limits produce non-integer percentages, systemd rounds them up. func (a *eksAdapter) roundCPUMillicores(millicores int64) int64 { return ((millicores + 125) / 250) * 250 } ================================================ FILE: execution/adapter/eks_adapter_test.go ================================================ package adapter import ( "context" "database/sql" "errors" "testing" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/state" ) func TestRoundCPUMillicores(t *testing.T) { adapter := &eksAdapter{} tests := []struct { name string input int64 expected int64 }{ // The problematic case that triggered this fix {"1024m rounds to 1000m", 1024, 1000}, // Edge cases around quarters {"1000m stays 1000m", 1000, 1000}, {"1125m rounds to 1250m", 1125, 1250}, {"1150m rounds to 1250m", 1150, 1250}, {"1250m stays 1250m", 1250, 1250}, // Test rounding up and down {"100m rounds to 0m", 100, 0}, {"125m rounds to 250m", 125, 250}, {"137m rounds to 250m", 137, 250}, {"250m stays 250m", 250, 250}, {"374m rounds to 250m", 374, 250}, {"375m rounds to 500m", 375, 500}, {"500m stays 500m", 500, 500}, {"624m rounds to 500m", 624, 500}, {"625m rounds to 750m", 625, 750}, {"750m stays 750m", 750, 750}, // Higher values - test both rounding up and down {"2048m rounds to 2000m", 2048, 2000}, {"2100m rounds to 2000m", 2100, 2000}, {"2126m rounds UP to 2250m", 2126, 2250}, {"3000m stays 3000m", 3000, 3000}, {"3001m rounds to 3000m", 3001, 3000}, {"3126m rounds UP to 3250m", 3126, 3250}, {"3200m rounds UP to 3250m", 3200, 3250}, // Large values {"60000m stays 60000m", 60000, 60000}, {"60024m rounds to 60000m", 60024, 60000}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := adapter.roundCPUMillicores(tt.input) if result != tt.expected { t.Errorf("roundCPUMillicores(%d) = %d, want %d", tt.input, result, tt.expected) } }) } } // TestRoundCPUAvoidsCgroupIssue verifies that rounded values avoid the systemd // cgroup rounding issue where non-integer percentages get rounded up by systemd func TestRoundCPUAvoidsCgroupIssue(t *testing.T) { adapter := &eksAdapter{} // Test values that would cause systemd rounding issues problematicValues := []int64{ 1024, // 102.4% -> systemd rounds to 103% 1025, // 102.5% -> systemd rounds to 103% 1026, // 102.6% -> systemd rounds to 103% 2048, // 204.8% -> systemd rounds to 205% 3072, // 307.2% -> systemd rounds to 308% } for _, input := range problematicValues { result := adapter.roundCPUMillicores(input) // Verify result is a multiple of 250 (quarter core) if result%250 != 0 { t.Errorf("roundCPUMillicores(%d) = %d, which is not a multiple of 250m", input, result) } // Verify result produces an integer percentage (whole or quarter) // Valid: 0%, 25%, 50%, 75%, 100%, 125%, etc.
// 1000m = 100%, 250m = 25% percentage := (result * 100) / 1000 // percentage with 1 decimal place if percentage%25 != 0 { t.Errorf("roundCPUMillicores(%d) = %d, which produces non-quarter percentage (%d)", input, result, percentage) } } } // mockLogger implements flotillaLog.Logger for testing type mockLogger struct { logCalls [][]interface{} eventCalls [][]interface{} } func (m *mockLogger) Log(keyvals ...interface{}) error { m.logCalls = append(m.logCalls, keyvals) return nil } func (m *mockLogger) Event(keyvals ...interface{}) error { m.eventCalls = append(m.eventCalls, keyvals) return nil } func (m *mockLogger) reset() { m.logCalls = nil m.eventCalls = nil } // mockStateManager implements state.Manager for testing type mockStateManager struct { estimateResourcesResult state.TaskResources estimateResourcesError error } func (m *mockStateManager) EstimateRunResources(ctx context.Context, executableID string, commandHash string) (state.TaskResources, error) { return m.estimateResourcesResult, m.estimateResourcesError } // Stub implementations for required interface methods func (m *mockStateManager) Name() string { return "mock" } func (m *mockStateManager) Initialize(conf config.Config) error { return nil } func (m *mockStateManager) Cleanup() error { return nil } func (m *mockStateManager) ListDefinitions(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (state.DefinitionList, error) { return state.DefinitionList{}, nil } func (m *mockStateManager) GetDefinition(ctx context.Context, definitionID string) (state.Definition, error) { return state.Definition{}, nil } func (m *mockStateManager) GetDefinitionByAlias(ctx context.Context, alias string) (state.Definition, error) { return state.Definition{}, nil } func (m *mockStateManager) UpdateDefinition(ctx context.Context, definitionID string, updates state.Definition) (state.Definition, error) { return state.Definition{}, nil } func (m *mockStateManager) CreateDefinition(ctx context.Context, d state.Definition) error { return nil } func (m *mockStateManager) DeleteDefinition(ctx context.Context, definitionID string) error { return nil } func (m *mockStateManager) ListRuns(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string, engines []string) (state.RunList, error) { return state.RunList{}, nil } func (m *mockStateManager) EstimateExecutorCount(ctx context.Context, executableID string, commandHash string) (int64, error) { return 0, nil } func (m *mockStateManager) ExecutorOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { return false, nil } func (m *mockStateManager) DriverOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { return false, nil } func (m *mockStateManager) GetRun(ctx context.Context, runID string) (state.Run, error) { return state.Run{}, nil } func (m *mockStateManager) CreateRun(ctx context.Context, r state.Run) error { return nil } func (m *mockStateManager) UpdateRun(ctx context.Context, runID string, updates state.Run) (state.Run, error) { return state.Run{}, nil } func (m *mockStateManager) ListGroups(ctx context.Context, limit int, offset int, name *string) (state.GroupsList, error) { return state.GroupsList{}, nil } func (m *mockStateManager) ListTags(ctx context.Context, limit int, offset int, name *string) (state.TagsList, error) { return state.TagsList{}, nil } func (m *mockStateManager) 
ListWorkers(ctx context.Context, engine string) (state.WorkersList, error) { return state.WorkersList{}, nil } func (m *mockStateManager) BatchUpdateWorkers(ctx context.Context, updates []state.Worker) (state.WorkersList, error) { return state.WorkersList{}, nil } func (m *mockStateManager) GetWorker(ctx context.Context, workerType string, engine string) (state.Worker, error) { return state.Worker{}, nil } func (m *mockStateManager) UpdateWorker(ctx context.Context, workerType string, updates state.Worker) (state.Worker, error) { return state.Worker{}, nil } func (m *mockStateManager) GetExecutableByTypeAndID(ctx context.Context, executableType state.ExecutableType, executableID string) (state.Executable, error) { return state.Definition{}, nil } func (m *mockStateManager) GetTemplateByID(ctx context.Context, templateID string) (state.Template, error) { return state.Template{}, nil } func (m *mockStateManager) GetLatestTemplateByTemplateName(ctx context.Context, templateName string) (bool, state.Template, error) { return false, state.Template{}, nil } func (m *mockStateManager) GetTemplateByVersion(ctx context.Context, templateName string, templateVersion int64) (bool, state.Template, error) { return false, state.Template{}, nil } func (m *mockStateManager) ListTemplates(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) { return state.TemplateList{}, nil } func (m *mockStateManager) ListTemplatesLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) { return state.TemplateList{}, nil } func (m *mockStateManager) CreateTemplate(ctx context.Context, t state.Template) error { return nil } func (m *mockStateManager) ListFailingNodes(ctx context.Context) (state.NodeList, error) { return state.NodeList{}, nil } func (m *mockStateManager) GetPodReAttemptRate(ctx context.Context) (float32, error) { return 0, nil } func (m *mockStateManager) GetNodeLifecycle(ctx context.Context, executableID string, commandHash string) (string, error) { return "", nil } func (m *mockStateManager) GetTaskHistoricalRuntime(ctx context.Context, executableID string, runId string) (float32, error) { return 0, nil } func (m *mockStateManager) CheckIdempotenceKey(ctx context.Context, idempotenceKey string) (string, error) { return "", nil } func (m *mockStateManager) GetRunByEMRJobId(ctx context.Context, emrJobId string) (state.Run, error) { return state.Run{}, nil } func (m *mockStateManager) GetResources(ctx context.Context, runID string) (state.Run, error) { return state.Run{}, nil } func (m *mockStateManager) ListClusterStates(ctx context.Context) ([]state.ClusterMetadata, error) { return nil, nil } func (m *mockStateManager) UpdateClusterMetadata(ctx context.Context, cluster state.ClusterMetadata) error { return nil } func (m *mockStateManager) DeleteClusterMetadata(ctx context.Context, clusterID string) error { return nil } func (m *mockStateManager) GetClusterByID(ctx context.Context, clusterID string) (state.ClusterMetadata, error) { return state.ClusterMetadata{}, nil } func (m *mockStateManager) GetRunStatus(ctx context.Context, runID string) (state.RunStatus, error) { return state.RunStatus{}, nil } // mockExecutable implements state.Executable for testing type mockExecutable struct { executableID string resources *state.ExecutableResources } func (m *mockExecutable) GetExecutableID() *string { return &m.executableID } func (m *mockExecutable) GetExecutableType() *state.ExecutableType { t := 
state.ExecutableTypeDefinition return &t } func (m *mockExecutable) GetExecutableResources() *state.ExecutableResources { return m.resources } func (m *mockExecutable) GetExecutableCommand(req state.ExecutionRequest) (string, error) { return "", nil } func (m *mockExecutable) GetExecutableResourceName() string { return m.executableID } func TestAdaptiveResources_NonGPUJob_ARAEnabled_Success(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" executable := &mockExecutable{ executableID: executableID, resources: &state.ExecutableResources{ Memory: int64Ptr(1000), Cpu: int64Ptr(500), }, } commandHash := "test-command-hash" run := state.Run{ RunID: "test-run", ExecutableID: &executableID, CommandHash: &commandHash, } manager := &mockStateManager{ estimateResourcesResult: state.TaskResources{ Cpu: sql.NullInt64{Int64: 2000, Valid: true}, Memory: sql.NullInt64{Int64: 3000, Valid: true}, }, estimateResourcesError: nil, } // Note: We can't easily test metrics emission since they're package-level functions, // but we can verify the logic works correctly cpuLimit, memLimit, cpuRequest, memRequest := adapter.(*eksAdapter).adaptiveResources( context.Background(), executable, run, manager, true, // araEnabled ) // Verify ARA increased resources if cpuRequest != 2000 { t.Errorf("Expected CPU request 2000, got %d", cpuRequest) } if memRequest != 3000 { t.Errorf("Expected memory request 3000, got %d", memRequest) } if cpuLimit != 2000 { t.Errorf("Expected CPU limit 2000, got %d", cpuLimit) } if memLimit != 3000 { t.Errorf("Expected memory limit 3000, got %d", memLimit) } } func TestAdaptiveResources_GPUJob_SkipsARA(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" gpu := int64(1) executable := &mockExecutable{ executableID: executableID, resources: &state.ExecutableResources{ Memory: int64Ptr(1000), Cpu: int64Ptr(500), }, } run := state.Run{ RunID: "test-run", ExecutableID: &executableID, Gpu: &gpu, } manager := &mockStateManager{} _, _, cpuRequest, memRequest := adapter.(*eksAdapter).adaptiveResources( context.Background(), executable, run, manager, true, // araEnabled ) // Verify GPU jobs use defaults (no ARA) defaultCPU := int64(500) defaultMem := int64(1000) if cpuRequest != defaultCPU { t.Errorf("Expected CPU request %d (default), got %d", defaultCPU, cpuRequest) } if memRequest != defaultMem { t.Errorf("Expected memory request %d (default), got %d", defaultMem, memRequest) } } func TestAdaptiveResources_EstimationFailed(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" executable := &mockExecutable{ executableID: executableID, resources: &state.ExecutableResources{ Memory: int64Ptr(1000), Cpu: int64Ptr(500), }, } run := state.Run{ RunID: "test-run", ExecutableID: &executableID, } manager := &mockStateManager{ estimateResourcesError: errors.New("estimation failed"), } _, _, cpuRequest, memRequest := adapter.(*eksAdapter).adaptiveResources( context.Background(), executable, run, manager, true, // araEnabled ) // Verify defaults are used when estimation fails defaultCPU := int64(500) defaultMem := int64(1000) if cpuRequest != defaultCPU { t.Errorf("Expected CPU request %d (default), got %d", defaultCPU, cpuRequest) } if memRequest != 
defaultMem { t.Errorf("Expected memory request %d (default), got %d", defaultMem, memRequest) } } func TestAdaptiveResources_MaxResourceBoundsHit(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" definitionID := "test-definition" command := "test-command" commandHash := "test-command-hash" executable := &mockExecutable{ executableID: executableID, resources: &state.ExecutableResources{ Memory: int64Ptr(1000), Cpu: int64Ptr(500), }, } run := state.Run{ RunID: "test-run", ExecutableID: &executableID, DefinitionID: definitionID, Command: &command, CommandHash: &commandHash, ClusterName: "test-cluster", } // Return resources that exceed max bounds manager := &mockStateManager{ estimateResourcesResult: state.TaskResources{ Cpu: sql.NullInt64{Int64: state.MaxCPU + 10000, Valid: true}, // Exceeds max Memory: sql.NullInt64{Int64: state.MaxMem + 50000, Valid: true}, // Exceeds max }, estimateResourcesError: nil, } cpuLimit, memLimit, cpuRequest, memRequest := adapter.(*eksAdapter).adaptiveResources( context.Background(), executable, run, manager, true, // araEnabled ) // Verify resources are capped at max bounds if cpuRequest != state.MaxCPU { t.Errorf("Expected CPU request capped at %d, got %d", state.MaxCPU, cpuRequest) } if memRequest != state.MaxMem { t.Errorf("Expected memory request capped at %d, got %d", state.MaxMem, memRequest) } if cpuLimit != state.MaxCPU { t.Errorf("Expected CPU limit capped at %d, got %d", state.MaxCPU, cpuLimit) } if memLimit != state.MaxMem { t.Errorf("Expected memory limit capped at %d, got %d", state.MaxMem, memLimit) } // Verify logger was called for max resource hit // There should be two logs: one for ARA adjustment, one for max bounds hit if len(logger.logCalls) < 2 { t.Errorf("Expected at least 2 logger.Log calls (ARA adjustment + max bounds hit), got %d", len(logger.logCalls)) return } // Find the max bounds hit log (should have level:warn) var maxBoundsLog []interface{} for _, logCall := range logger.logCalls { for i := 0; i < len(logCall); i += 2 { if i+1 < len(logCall) && logCall[i] == "level" && logCall[i+1] == "warn" { maxBoundsLog = logCall break } } if maxBoundsLog != nil { break } } if maxBoundsLog == nil { t.Errorf("Expected log with level:warn for max bounds hit, got logCalls: %v", logger.logCalls) return } // Verify log contains expected fields foundMessage := false foundRunID := false for i := 0; i < len(maxBoundsLog); i += 2 { if i+1 < len(maxBoundsLog) { key := maxBoundsLog[i] value := maxBoundsLog[i+1] if key == "message" { msg := value.(string) if msg == "ARA resource allocation hit maximum limit" || msg == "ARA memory allocation hit maximum limit - potential over-provisioning" { foundMessage = true } } if key == "run_id" && value == "test-run" { foundRunID = true } } } if !foundMessage { t.Errorf("Expected log to contain message about max resource hit") } if !foundRunID { t.Error("Expected log to contain 'run_id: test-run'") } } func TestAdaptiveResources_ARADisabled(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" executable := &mockExecutable{ executableID: executableID, resources: &state.ExecutableResources{ Memory: int64Ptr(1000), Cpu: int64Ptr(500), }, } run := state.Run{ RunID: "test-run", ExecutableID: &executableID, } manager := &mockStateManager{} _, _, cpuRequest, memRequest := 
adapter.(*eksAdapter).adaptiveResources( context.Background(), executable, run, manager, false, // araEnabled = false ) // Verify defaults are used when ARA is disabled defaultCPU := int64(500) defaultMem := int64(1000) if cpuRequest != defaultCPU { t.Errorf("Expected CPU request %d (default), got %d", defaultCPU, cpuRequest) } if memRequest != defaultMem { t.Errorf("Expected memory request %d (default), got %d", defaultMem, memRequest) } } func TestEmitARAMetrics_StructuredLog(t *testing.T) { logger := &mockLogger{} adapter, err := NewEKSAdapter(logger) if err != nil { t.Fatalf("Failed to create adapter: %v", err) } executableID := "test-executable" definitionID := "test-definition" command := "test-command" run := state.Run{ RunID: "test-run", ExecutableID: &executableID, DefinitionID: definitionID, Command: &command, ClusterName: "test-cluster", } adapter.(*eksAdapter).emitARAMetrics(run, 1000, 2000, 3000, 4000, 5000, 6000, true, true) // Verify logger was called if len(logger.logCalls) == 0 { t.Error("Expected logger.Log to be called") return } logCall := logger.logCalls[0] expectedFields := map[string]interface{}{ "level": "warn", "message": "ARA memory allocation hit maximum limit - potential over-provisioning", "run_id": "test-run", "cluster": "test-cluster", "default_cpu_millicores": int64(1000), "default_memory_mb": int64(2000), "requested_cpu_millicores": int64(5000), "requested_memory_mb": int64(6000), "final_cpu_millicores": int64(3000), "final_memory_mb": int64(4000), "max_cpu_hit": true, "max_memory_hit": true, "definition_id": "test-definition", "executable_id": "test-executable", "command": "test-command", "memory_overage_mb": int64(2000), // 6000 - 4000 "cpu_overage_millicores": int64(2000), // 5000 - 3000 } // Verify all expected fields are present logMap := make(map[interface{}]interface{}) for i := 0; i < len(logCall); i += 2 { if i+1 < len(logCall) { logMap[logCall[i]] = logCall[i+1] } } for key, expectedValue := range expectedFields { if actualValue, ok := logMap[key]; !ok { t.Errorf("Expected log to contain field '%s'", key) } else if actualValue != expectedValue { t.Errorf("Expected log field '%s' to be %v, got %v", key, expectedValue, actualValue) } } } func TestEmitARAMetrics_NilLogger(t *testing.T) { // Create adapter with nil logger (shouldn't panic) adapter := &eksAdapter{logger: nil} run := state.Run{ RunID: "test-run", } // Should not panic adapter.emitARAMetrics(run, 1000, 2000, 3000, 4000, 5000, 6000, true, true) } // Helper function func int64Ptr(i int64) *int64 { return &i } ================================================ FILE: execution/engine/dcm.go ================================================ package engine import ( "context" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/eks" "github.com/pkg/errors" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/state" kubernetestrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" metricsv "k8s.io/metrics/pkg/client/clientset/versioned" "os" "os/exec" "path/filepath" "sync" ) // DynamicClusterManager handles dynamic loading of K8s clients type DynamicClusterManager struct { mutex sync.RWMutex log flotillaLog.Logger eksClient *eks.EKS awsRegion string manager state.Manager awsSession *session.Session } // getKubeconfigBaseDir returns the base directory for kubeconfig files func getKubeconfigBaseDir() string 
{ dir := os.Getenv("EKS_KUBECONFIG_BASEPATH") if dir == "" { dir, _ = os.Getwd() } return dir } // NewDynamicClusterManager creates a cluster manager that loads clusters from the state manager func NewDynamicClusterManager(awsRegion string, log flotillaLog.Logger, manager state.Manager) (*DynamicClusterManager, error) { sess := session.Must(session.NewSession(&aws.Config{ Region: aws.String(awsRegion), })) eksClient := eks.New(sess) return &DynamicClusterManager{ log: log, eksClient: eksClient, awsRegion: awsRegion, manager: manager, awsSession: sess, }, nil } // getOrCreateKubeconfig ensures a valid kubeconfig exists for the given cluster func (dcm *DynamicClusterManager) getOrCreateKubeconfig(clusterName string) (string, error) { kubeconfigBaseDir := getKubeconfigBaseDir() kubeconfigPath := filepath.Join(kubeconfigBaseDir, clusterName) if _, err := os.Stat(kubeconfigBaseDir); os.IsNotExist(err) { if err := os.MkdirAll(kubeconfigBaseDir, 0755); err != nil { return "", errors.Wrap(err, "failed to create directory for kubeconfigs") } } needsGeneration := false if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { needsGeneration = true } else { _, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) if err != nil { needsGeneration = true } } if needsGeneration { if err := dcm.generateKubeconfig(clusterName, kubeconfigPath); err != nil { return "", err } } return kubeconfigPath, nil } // generateKubeconfig creates a kubeconfig file for the specified cluster func (dcm *DynamicClusterManager) generateKubeconfig(clusterName, kubeconfigPath string) error { cmd := exec.Command("aws", "eks", "update-kubeconfig", "--name", clusterName, "--region", dcm.awsRegion, "--kubeconfig", kubeconfigPath) if output, err := cmd.CombinedOutput(); err != nil { dcm.log.Log("level", "error", "message", "Failed to generate kubeconfig", "cluster", clusterName, "error", err.Error(), "output", string(output)) return errors.Wrapf(err, "failed to generate kubeconfig: %s", string(output)) } dcm.log.Log("level", "info", "message", "Successfully generated kubeconfig", "cluster", clusterName, "path", kubeconfigPath) return nil } // createRestConfig builds a rest.Config from a kubeconfig path func (dcm *DynamicClusterManager) createRestConfig(kubeconfigPath string) (*rest.Config, error) { config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) if err != nil { return nil, errors.Wrap(err, "failed to load kubeconfig") } config.WrapTransport = kubernetestrace.WrapRoundTripper return config, nil } // GetKubernetesClient returns a k8s client for the requested cluster func (dcm *DynamicClusterManager) GetKubernetesClient(clusterName string) (kubernetes.Clientset, error) { kubeconfigPath, err := dcm.getOrCreateKubeconfig(clusterName) if err != nil { return kubernetes.Clientset{}, err } config, err := dcm.createRestConfig(kubeconfigPath) if err != nil { return kubernetes.Clientset{}, err } kClient, err := kubernetes.NewForConfig(config) if err != nil { return kubernetes.Clientset{}, errors.Wrap(err, "failed to create kubernetes client") } return *kClient, nil } // GetMetricsClient returns a metrics client for the requested cluster func (dcm *DynamicClusterManager) GetMetricsClient(clusterName string) (metricsv.Clientset, error) { kubeconfigPath, err := dcm.getOrCreateKubeconfig(clusterName) if err != nil { return metricsv.Clientset{}, err } config, err := dcm.createRestConfig(kubeconfigPath) if err != nil { return metricsv.Clientset{}, err } metricsClient, err := metricsv.NewForConfig(config) if err != nil {
return metricsv.Clientset{}, errors.Wrap(err, "failed to create metrics client") } return *metricsClient, nil } // InitializeClusters handles both static and dynamic cluster configurations func (dcm *DynamicClusterManager) InitializeClusters(ctx context.Context, staticClusters []string) error { kubeconfigBaseDir := getKubeconfigBaseDir() if err := os.MkdirAll(kubeconfigBaseDir, 0755); err != nil { return errors.Wrap(err, "failed to create directory for kubeconfigs") } // Initialize static clusters for _, clusterName := range staticClusters { kubeconfigPath := filepath.Join(kubeconfigBaseDir, clusterName) if err := dcm.generateKubeconfig(clusterName, kubeconfigPath); err != nil { dcm.log.Log("level", "error", "message", "Failed to initialize static cluster", "cluster", clusterName, "error", err.Error()) } } // Initialize dynamic clusters from state manager clusters, err := dcm.manager.ListClusterStates(ctx) if err != nil { return errors.Wrap(err, "failed to list clusters") } for _, cluster := range clusters { if cluster.Status == state.StatusActive { kubeconfigPath := filepath.Join(kubeconfigBaseDir, cluster.Name) if err := dcm.generateKubeconfig(cluster.Name, kubeconfigPath); err != nil { dcm.log.Log("level", "error", "message", "Failed to initialize dynamic cluster", "cluster", cluster.Name, "error", err.Error()) } } } return nil } ================================================ FILE: execution/engine/eks_engine.go ================================================ package engine import ( "bytes" "context" "fmt" "github.com/go-redis/redis" "github.com/stitchfix/flotilla-os/utils" "strings" "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/s3" "github.com/pkg/errors" "github.com/stitchfix/flotilla-os/clients/metrics" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/execution/adapter" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/queue" "github.com/stitchfix/flotilla-os/state" awstrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/aws/aws-sdk-go/aws" "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sJson "k8s.io/apimachinery/pkg/runtime/serializer/json" "k8s.io/client-go/kubernetes" metricsv "k8s.io/metrics/pkg/client/clientset/versioned" ) // EKSExecutionEngine submits runs to EKS. 
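// A minimal wiring sketch (illustrative only; the names conf, logger,
// queueManager, sm, ctx, executable, and run are assumptions, not part of this file):
//
//	engine := &EKSExecutionEngine{log: logger, qm: queueManager, stateManager: sm}
//	if err := engine.Initialize(conf); err != nil {
//		// configuration or cluster initialization failed
//	}
//	updatedRun, retryable, err := engine.Execute(ctx, executable, run, sm)
//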
type EKSExecutionEngine struct { kClients map[string]kubernetes.Clientset metricsClients map[string]metricsv.Clientset adapter adapter.EKSAdapter qm queue.Manager log flotillaLog.Logger jobQueue string jobNamespace string jobTtl int jobSA string jobARAEnabled bool schedulerName string serializer *k8sJson.Serializer s3Client *s3.S3 s3Bucket string s3BucketRootDir string statusQueue string clusters []string clusterManager *DynamicClusterManager stateManager state.Manager redisClient *redis.Client } // Initialize configures the EKSExecutionEngine and initializes internal clients func (ee *EKSExecutionEngine) Initialize(conf config.Config) error { ee.jobQueue = conf.GetString("eks_job_queue") ee.schedulerName = "default-scheduler" if conf.IsSet("eks_scheduler_name") { ee.schedulerName = conf.GetString("eks_scheduler_name") } if conf.IsSet("eks_status_queue") { ee.statusQueue = conf.GetString("eks_status_queue") } ee.jobNamespace = conf.GetString("eks_job_namespace") ee.jobTtl = conf.GetInt("eks_job_ttl") ee.jobSA = conf.GetString("eks_default_service_account") ee.jobARAEnabled = true clusterManager, err := NewDynamicClusterManager( conf.GetString("aws_default_region"), ee.log, ee.stateManager, ) if err != nil { return errors.Wrap(err, "failed to create dynamic cluster manager") } ee.clusterManager = clusterManager // Get static clusters if configured var staticClusters []string if conf.IsSet("eks_clusters") { clusters := strings.Split(conf.GetString("eks_clusters"), ",") for i := range clusters { staticClusters = append(staticClusters, strings.TrimSpace(clusters[i])) } } // Initialize all clusters (both static and dynamic) if err := clusterManager.InitializeClusters(context.Background(), staticClusters); err != nil { ee.log.Log("level", "error", "message", "failed to initialize clusters", "error", err.Error()) } adapt, err := adapter.NewEKSAdapter(ee.log) if err != nil { return err } ee.serializer = k8sJson.NewSerializerWithOptions( k8sJson.DefaultMetaFactory, nil, nil, k8sJson.SerializerOptions{ Yaml: true, Pretty: true, Strict: true, }, ) awsRegion := conf.GetString("eks_manifest_storage_options_region") awsConfig := &aws.Config{Region: aws.String(awsRegion)} sess := awstrace.WrapSession(session.Must(session.NewSessionWithOptions(session.Options{Config: *awsConfig}))) ee.s3Client = s3.New(sess, aws.NewConfig().WithRegion(awsRegion)) ee.s3Bucket = conf.GetString("eks_manifest_storage_options_s3_bucket_name") ee.s3BucketRootDir = conf.GetString("eks_manifest_storage_options_s3_bucket_root_dir") ee.adapter = adapt return nil } func (ee *EKSExecutionEngine) Execute(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) (state.Run, bool, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.execute", "") span.SetTag("job.run_id", run.RunID) span.SetTag("job.tier", run.Tier) defer span.Finish() utils.TagJobRun(span, run) if run.Namespace == nil || *run.Namespace == "" { clusters, err := manager.ListClusterStates(ctx) if err == nil { for _, cluster := range clusters { if cluster.Name == run.ClusterName && cluster.Namespace != "" { run.Namespace = &cluster.Namespace break } } } } if run.ServiceAccount == nil { run.ServiceAccount = aws.String(ee.jobSA) } tierTag := fmt.Sprintf("tier:%s", run.Tier) job, err := ee.adapter.AdaptFlotillaDefinitionAndRunToJob(ctx, executable, run, ee.schedulerName, manager, ee.jobARAEnabled) if err != nil { exitReason := fmt.Sprintf("Error 
creating k8s manifest - %s", err.Error()) run.ExitReason = &exitReason return run, false, err } kClient, err := ee.getKClient(run) if err != nil { exitReason := fmt.Sprintf("Invalid cluster name - %s", run.ClusterName) run.ExitReason = &exitReason return run, false, err } result, err := kClient.BatchV1().Jobs(ee.jobNamespace).Create(ctx, &job, metav1.CreateOptions{}) if err != nil { // Job is already submitted, don't retry if strings.Contains(strings.ToLower(err.Error()), "already exists") { return run, false, nil } // Job spec is invalid, don't retry. if strings.Contains(strings.ToLower(err.Error()), "is invalid") { exitReason := err.Error() run.ExitReason = &exitReason return run, false, err } // Legitimate submit error, retryable. _ = metrics.Increment(metrics.EngineEKSExecute, []string{string(metrics.StatusFailure), tierTag}, 1) return run, true, err } var b0 bytes.Buffer err = ee.serializer.Encode(result, &b0) if err == nil { putObject := s3.PutObjectInput{ Bucket: aws.String(ee.s3Bucket), Body: bytes.NewReader(b0.Bytes()), Key: aws.String(fmt.Sprintf("%s/%s/%s.yaml", ee.s3BucketRootDir, run.RunID, run.RunID)), ContentType: aws.String("text/yaml"), } _, err = ee.s3Client.PutObject(&putObject) if err != nil { _ = ee.log.Log("level", "error", "message", "s3_upload_error", "error", err.Error()) } } _ = metrics.Increment(metrics.EngineEKSExecute, []string{string(metrics.StatusSuccess), tierTag}, 1) run, _ = ee.getPodName(run) adaptedRun, err := ee.adapter.AdaptJobToFlotillaRun(result, run, nil) if err != nil { return adaptedRun, false, err } // Set status to running. adaptedRun.Status = state.StatusRunning span.SetTag("job.submitted", true) utils.TagJobRun(span, adaptedRun) return adaptedRun, false, nil } func (ee *EKSExecutionEngine) getPodName(run state.Run) (state.Run, error) { podList, err := ee.getPodList(run) if err != nil { return run, err } if podList != nil && podList.Items != nil && len(podList.Items) > 0 { pod := podList.Items[len(podList.Items)-1] run.PodName = &pod.Name run.Namespace = &pod.Namespace if pod.Spec.Containers != nil && len(pod.Spec.Containers) > 0 { container := pod.Spec.Containers[len(pod.Spec.Containers)-1] cpu := container.Resources.Requests.Cpu().ScaledValue(resource.Milli) cpuLimit := container.Resources.Limits.Cpu().ScaledValue(resource.Milli) run.Cpu = &cpu run.CpuLimit = &cpuLimit run = ee.getInstanceDetails(pod, run) mem := container.Resources.Requests.Memory().ScaledValue(resource.Mega) run.Memory = &mem memLimit := container.Resources.Limits.Memory().ScaledValue(resource.Mega) run.MemoryLimit = &memLimit } } return run, nil } func (ee *EKSExecutionEngine) getInstanceDetails(pod v1.Pod, run state.Run) state.Run { if len(pod.Spec.NodeName) > 0 { run.InstanceDNSName = pod.Spec.NodeName } return run } func (ee *EKSExecutionEngine) getPodList(run state.Run) (*v1.PodList, error) { ctx := context.Background() kClient, err := ee.getKClient(run) if err != nil { return &v1.PodList{}, err } if run.PodName != nil { pod, err := kClient.CoreV1().Pods(ee.jobNamespace).Get(ctx, *run.PodName, metav1.GetOptions{}) if pod != nil { return &v1.PodList{Items: []v1.Pod{*pod}}, err } } else { if run.QueuedAt == nil { return &v1.PodList{}, err } queuedAt := *run.QueuedAt if time.Now().After(queuedAt.Add(time.Minute * time.Duration(5))) { podList, err := kClient.CoreV1().Pods(ee.jobNamespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("job-name=%s", run.RunID), }) return 
podList, err } } return &v1.PodList{}, err } func (ee *EKSExecutionEngine) getKClient(run state.Run) (kubernetes.Clientset, error) { ctx := context.Background() ctx, span := utils.TraceJob(ctx, "flotilla.job.get_k8s_client", run.RunID) defer span.Finish() startTime := time.Now() kClient, err := ee.clusterManager.GetKubernetesClient(run.ClusterName) span.SetTag("k8s.client_init_ms", time.Since(startTime).Milliseconds()) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) span.SetTag("error.type", "k8s_client_init") return kubernetes.Clientset{}, errors.Wrapf(err, "failed to get Kubernetes client for cluster %s", run.ClusterName) } return kClient, nil } func (ee *EKSExecutionEngine) Terminate(ctx context.Context, run state.Run) error { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.eks_terminate", run.RunID) defer span.Finish() utils.TagJobRun(span, run) gracePeriod := int64(300) deletionPropagation := metav1.DeletePropagationBackground _ = ee.log.Log("level", "info", "message", "terminating run", "run_id", run.RunID) deleteOptions := &metav1.DeleteOptions{ GracePeriodSeconds: &gracePeriod, PropagationPolicy: &deletionPropagation, } kClient, err := ee.getKClient(run) if err != nil { exitReason := err.Error() run.ExitReason = &exitReason return err } _ = kClient.BatchV1().Jobs(ee.jobNamespace).Delete(ctx, run.RunID, *deleteOptions) if run.PodName != nil { _ = kClient.CoreV1().Pods(ee.jobNamespace).Delete(ctx, *run.PodName, *deleteOptions) } tierTag := fmt.Sprintf("tier:%s", run.Tier) _ = metrics.Increment(metrics.EngineEKSTerminate, []string{string(metrics.StatusSuccess), tierTag}, 1) return nil } func (ee *EKSExecutionEngine) Enqueue(ctx context.Context, run state.Run) error { var span tracer.Span ctx, span = utils.TraceJob(ctx, "flotilla.job.eks_enqueue", "") defer span.Finish() span.SetTag("job.run_id", run.RunID) utils.TagJobRun(span, run) tierTag := fmt.Sprintf("tier:%s", run.Tier) // Get qurl qurl, err := ee.qm.QurlFor(ee.jobQueue, false) if err != nil { _ = metrics.Increment(metrics.EngineEKSEnqueue, []string{string(metrics.StatusFailure), tierTag}, 1) return errors.Wrapf(err, "problem getting queue url for [%s]", run.ClusterName) } // Queue run if err = ee.qm.Enqueue(ctx, qurl, run); err != nil { _ = metrics.Increment(metrics.EngineEKSEnqueue, []string{string(metrics.StatusFailure), tierTag}, 1) return errors.Wrapf(err, "problem enqueueing run [%s] to queue [%s]", run.RunID, qurl) } _ = metrics.Increment(metrics.EngineEKSEnqueue, []string{string(metrics.StatusSuccess), tierTag}, 1) return nil } func (ee *EKSExecutionEngine) PollRuns(ctx context.Context) ([]RunReceipt, error) { qurl, err := ee.qm.QurlFor(ee.jobQueue, false) if err != nil { return nil, errors.Wrap(err, "problem listing queues to poll") } queues := []string{qurl} var runs []RunReceipt for _, qurl := range queues { // // Get new queued Run // runReceipt, err := ee.qm.ReceiveRun(ctx, qurl) if err != nil { return runs, errors.Wrapf(err, "problem receiving run from queue url [%s]", qurl) } if runReceipt.Run == nil { continue } if runReceipt.TraceID != 0 && runReceipt.ParentID != 0 { ee.log.Log("level", "info", "message", "Received run with trace context", "run_id", runReceipt.Run.RunID, "trace_id", runReceipt.TraceID, "parent_id", runReceipt.ParentID) } runs = append(runs, RunReceipt{ RunReceipt: runReceipt, TraceID: runReceipt.TraceID, ParentID: runReceipt.ParentID, SamplingPriority: runReceipt.SamplingPriority, }) } 
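// Each receipt carries the trace context from the SQS message (TraceID, ParentID,
// SamplingPriority) so downstream workers can continue the submitter's trace
// rather than starting a new one.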
return runs, nil } // PollStatus is a dummy function as EKS does not emit task status // change events. func (ee *EKSExecutionEngine) PollStatus(ctx context.Context) (RunReceipt, error) { return RunReceipt{}, nil } // PollRunStatus is a no-op for EKS; run status is refreshed via FetchUpdateStatus instead. func (ee *EKSExecutionEngine) PollRunStatus(ctx context.Context) (state.Run, error) { return state.Run{}, nil } // Define returns a blank task definition and an error for the EKS engine. func (ee *EKSExecutionEngine) Define(ctx context.Context, td state.Definition) (state.Definition, error) { return td, errors.New("Definition of tasks is only supported for ECS.") } // Deregister returns an error for the EKS engine. func (ee *EKSExecutionEngine) Deregister(ctx context.Context, definition state.Definition) error { return errors.Errorf("EKSExecutionEngine does not allow for deregistering of task definitions.") } func (ee *EKSExecutionEngine) Get(ctx context.Context, run state.Run) (state.Run, error) { if ctx == nil { ctx = context.Background() } kClient, err := ee.getKClient(run) if err != nil { return state.Run{}, err } job, err := kClient.BatchV1().Jobs(ee.jobNamespace).Get(ctx, run.RunID, metav1.GetOptions{}) if err != nil { return state.Run{}, errors.Errorf("error getting kubernetes job %s", err) } updates, err := ee.adapter.AdaptJobToFlotillaRun(job, run, nil) if err != nil { return state.Run{}, errors.Errorf("error adapting kubernetes job to flotilla run %s", err) } return updates, nil } func (ee *EKSExecutionEngine) GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.get_events", run.RunID) defer span.Finish() utils.TagJobRun(span, run) if run.PodName == nil { return state.PodEventList{}, nil } kClient, err := ee.getKClient(run) if err != nil { return state.PodEventList{}, err } eventList, err := kClient.CoreV1().Events(ee.jobNamespace).List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name==%s", *run.PodName)}) if err != nil { return state.PodEventList{}, errors.Errorf("error getting kubernetes event for flotilla run %s", err) } var podEvents []state.PodEvent for _, e := range eventList.Items { eTime := e.FirstTimestamp.Time runEvent := state.PodEvent{ Message: e.Message, Timestamp: &eTime, EventType: e.Type, Reason: e.Reason, SourceObject: e.ObjectMeta.Name, } if strings.Contains(e.Reason, "TriggeredScaleUp") { source := fmt.Sprintf("source:%s", e.ObjectMeta.Name) _ = metrics.Increment(metrics.EngineEKSNodeTriggeredScaledUp, []string{source}, 1) } podEvents = append(podEvents, runEvent) } podEventList := state.PodEventList{ Total: len(podEvents), PodEvents: podEvents, } return podEventList, nil } func (ee *EKSExecutionEngine) FetchPodMetrics(ctx context.Context, run state.Run) (state.Run, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.eks_fetch_metrics", run.RunID) defer span.Finish() utils.TagJobRun(span, run) if run.PodName != nil { metricsClient, err := ee.clusterManager.GetMetricsClient(run.ClusterName) if err != nil { return run, errors.Wrapf(err, "failed to get metrics client for cluster %s", run.ClusterName) } start := time.Now() podMetrics, err := metricsClient.MetricsV1beta1().PodMetricses(ee.jobNamespace).Get(ctx, *run.PodName, metav1.GetOptions{}) _ = metrics.Timing(metrics.StatusWorkerFetchMetrics, time.Since(start), []string{run.ClusterName}, 1) if err != nil { return run, 
err } if len(podMetrics.Containers) > 0 { containerMetrics := podMetrics.Containers[0] mem := containerMetrics.Usage.Memory().ScaledValue(resource.Mega) if run.MaxMemoryUsed == nil || *run.MaxMemoryUsed == 0 || *run.MaxMemoryUsed < mem { run.MaxMemoryUsed = &mem } cpu := containerMetrics.Usage.Cpu().MilliValue() if run.MaxCpuUsed == nil || *run.MaxCpuUsed == 0 || *run.MaxCpuUsed < cpu { run.MaxCpuUsed = &cpu } } if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) } else if run.MaxMemoryUsed != nil { span.SetTag("job.metrics.memory_mb", *run.MaxMemoryUsed) } if run.MaxCpuUsed != nil { span.SetTag("job.metrics.cpu_millicores", *run.MaxCpuUsed) } return run, nil } return run, errors.New("no pod associated with the run.") } func (ee *EKSExecutionEngine) FetchUpdateStatus(ctx context.Context, run state.Run) (state.Run, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.eks_fetch_status", run.RunID) defer span.Finish() utils.TagJobRun(span, run) kClient, err := ee.getKClient(run) if err != nil { return state.Run{}, err } start := time.Now() job, err := kClient.BatchV1().Jobs(ee.jobNamespace).Get(ctx, run.RunID, metav1.GetOptions{}) span.SetTag("k8s.job_get_ms", time.Since(start).Milliseconds()) _ = metrics.Timing(metrics.StatusWorkerGetJob, time.Since(start), []string{run.ClusterName}, 1) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) span.SetTag("error.type", "k8s_get_job") return run, err } if job.Status.Active > 0 { span.SetTag("job.k8s.active", job.Status.Active) } if job.Status.Succeeded > 0 { span.SetTag("job.k8s.succeeded", job.Status.Succeeded) } if job.Status.Failed > 0 { span.SetTag("job.k8s.failed", job.Status.Failed) } var mostRecentPod *v1.Pod var mostRecentPodCreationTimestamp metav1.Time start = time.Now() podList, err := ee.getPodList(run) _ = metrics.Timing(metrics.StatusWorkerGetPodList, time.Since(start), []string{run.ClusterName}, 1) if err == nil && podList != nil && podList.Items != nil && len(podList.Items) > 0 { // Iterate over associated pods to find the most recent. for _, p := range podList.Items { if mostRecentPodCreationTimestamp.Before(&p.CreationTimestamp) || len(podList.Items) == 1 { mostRecentPod = &p mostRecentPodCreationTimestamp = p.CreationTimestamp } } // If the run doesn't have an associated pod name yet OR // there is a newer pod (i.e. the old pod was killed), // update it. if mostRecentPod != nil && (run.PodName == nil || mostRecentPod.Name != *run.PodName) { if run.PodName != nil && mostRecentPod.Name != *run.PodName { _ = metrics.Increment(metrics.EngineEKSRunPodnameChange, []string{}, 1) } run.PodName = &mostRecentPod.Name run = ee.getInstanceDetails(*mostRecentPod, run) } // Pod didn't change, but Instance information is not populated. 
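// (This can happen when the pod was scheduled after the previous poll; backfill the node name from the pod spec.)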
if mostRecentPod != nil && len(run.InstanceDNSName) == 0 { run = ee.getInstanceDetails(*mostRecentPod, run) } if mostRecentPod != nil && mostRecentPod.Spec.Containers != nil && len(mostRecentPod.Spec.Containers) > 0 { container := mostRecentPod.Spec.Containers[len(mostRecentPod.Spec.Containers)-1] cpu := container.Resources.Requests.Cpu().ScaledValue(resource.Milli) run.Cpu = &cpu mem := container.Resources.Requests.Memory().ScaledValue(resource.Mega) run.Memory = &mem cpuLimit := container.Resources.Limits.Cpu().ScaledValue(resource.Milli) run.CpuLimit = &cpuLimit memLimit := container.Resources.Limits.Memory().ScaledValue(resource.Mega) run.MemoryLimit = &memLimit } } //run, _ = ee.FetchPodMetrics(ctx, run) hoursBack := time.Now().Add(-24 * time.Hour) start = time.Now() var events state.PodEventList //events, err = ee.GetEvents(ctx, run) _ = metrics.Timing(metrics.StatusWorkerGetEvents, time.Since(start), []string{run.ClusterName}, 1) if err == nil && len(events.PodEvents) > 0 { newEvents := events.PodEvents if run.PodEvents != nil && len(*run.PodEvents) > 0 { priorEvents := *run.PodEvents for _, newEvent := range newEvents { unseen := true for _, priorEvent := range priorEvents { if priorEvent.Equal(newEvent) { unseen = false break } } if unseen { priorEvents = append(priorEvents, newEvent) } } run.PodEvents = &priorEvents } else { run.PodEvents = &newEvents } } if run.PodEvents != nil { attemptCount := int64(0) for _, podEvent := range *run.PodEvents { if strings.Contains(podEvent.Reason, "Scheduled") { attemptCount = attemptCount + 1 } } run.AttemptCount = &attemptCount } // Handle edge case for dangling jobs. // Run used to have a pod and now it is not there, job is older than 24 hours. Terminate it. if err == nil && podList != nil && podList.Items != nil && len(podList.Items) == 0 && run.PodName != nil && run.QueuedAt.Before(hoursBack) { err = ee.Terminate(ctx, run) if err == nil { job.Status.Failed = 1 mostRecentPod = nil } } return ee.adapter.AdaptJobToFlotillaRun(job, run, mostRecentPod) } ================================================ FILE: execution/engine/emr_engine.go ================================================ package engine import ( "bytes" "context" "encoding/json" "fmt" "os" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/emrcontainers" "github.com/aws/aws-sdk-go/service/s3" "github.com/go-redis/redis" "github.com/pkg/errors" "github.com/stitchfix/flotilla-os/clients/metrics" "github.com/stitchfix/flotilla-os/exceptions" "github.com/stitchfix/flotilla-os/utils" "github.com/stitchfix/flotilla-os/config" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/queue" "github.com/stitchfix/flotilla-os/state" awstrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/aws/aws-sdk-go/aws" "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" _ "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" k8sJson "k8s.io/apimachinery/pkg/runtime/serializer/json" "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/kubernetes/scheme" "regexp" "strings" ) // EMRExecutionEngine submits runs to EMR-EKS. 
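// The submit path: Execute builds an emrcontainers.StartJobRunInput (driver and
// executor pod templates are serialized to YAML and uploaded to S3 first), calls
// StartJobRun on the EMR-on-EKS containers API, and records the returned job ID
// and virtual cluster ID on the run's SparkExtension.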
type EMRExecutionEngine struct { sqsQueueManager queue.Manager log flotillaLog.Logger emrJobQueue string emrJobNamespace string emrJobRoleArn map[string]string emrJobSA string emrVirtualClusters map[string]string emrContainersClient *emrcontainers.EMRContainers schedulerName string s3Client *s3.S3 awsRegion string s3LogsBucket string s3EventLogPath string s3LogsBasePath string s3ManifestBucket string s3ManifestBasePath string serializer *k8sJson.Serializer clusters []string driverInstanceType string kClients map[string]kubernetes.Clientset clusterManager *DynamicClusterManager stateManager state.Manager redisClient *redis.Client lakekeeperSecretName string } // Initialize configures the EMRExecutionEngine and initializes internal clients func (emr *EMRExecutionEngine) Initialize(conf config.Config) error { emr.emrVirtualClusters = make(map[string]string) emr.emrVirtualClusters = conf.GetStringMapString("emr_virtual_clusters") emr.emrJobQueue = conf.GetString("emr_job_queue") emr.emrJobNamespace = conf.GetString("emr_job_namespace") emr.emrJobRoleArn = conf.GetStringMapString("emr_job_role_arn") emr.awsRegion = conf.GetString("emr_aws_region") emr.s3LogsBucket = conf.GetString("emr_log_bucket") emr.s3LogsBasePath = conf.GetString("emr_log_base_path") emr.s3EventLogPath = conf.GetString("emr_log_event_log_path") emr.s3ManifestBucket = conf.GetString("emr_manifest_bucket") emr.s3ManifestBasePath = conf.GetString("emr_manifest_base_path") emr.emrJobSA = conf.GetString("emr_default_service_account") emr.schedulerName = conf.GetString("eks_scheduler_name") emr.driverInstanceType = conf.GetString("emr_driver_instance_type") emr.lakekeeperSecretName = conf.GetString("emr_lakekeeper_secret_name") awsConfig := &aws.Config{Region: aws.String(emr.awsRegion)} sess := session.Must(session.NewSessionWithOptions(session.Options{Config: *awsConfig})) sess = awstrace.WrapSession(sess) emr.s3Client = s3.New(sess, aws.NewConfig().WithRegion(emr.awsRegion)) emr.emrContainersClient = emrcontainers.New(sess, aws.NewConfig().WithRegion(emr.awsRegion)) emr.serializer = k8sJson.NewSerializerWithOptions( k8sJson.SimpleMetaFactory{}, nil, nil, k8sJson.SerializerOptions{ Yaml: true, Pretty: true, Strict: true, }, ) clusterManager, err := NewDynamicClusterManager( emr.awsRegion, emr.log, emr.stateManager, ) if err != nil { return errors.Wrap(err, "failed to create dynamic cluster manager") } emr.clusterManager = clusterManager // Get static clusters if configured var staticClusters []string if conf.IsSet("eks_clusters") { clusters := strings.Split(conf.GetString("eks_clusters"), ",") for i := range clusters { staticClusters = append(staticClusters, strings.TrimSpace(clusters[i])) } } // Initialize all clusters (both static and dynamic) if err := clusterManager.InitializeClusters(context.Background(), staticClusters); err != nil { emr.log.Log("level", "error", "message", "failed to initialize clusters", "error", err.Error()) } return nil } func (emr *EMRExecutionEngine) getKClient(run state.Run) (kubernetes.Clientset, error) { kClient, err := emr.clusterManager.GetKubernetesClient(run.ClusterName) if err != nil { return kubernetes.Clientset{}, errors.Wrapf(err, "failed to get Kubernetes client for cluster %s", run.ClusterName) } return kClient, nil } func (emr *EMRExecutionEngine) Execute(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) (state.Run, bool, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, 
"flotilla.job.emr_execute", run.RunID) defer span.Finish() utils.TagJobRun(span, run) run = emr.estimateExecutorCount(run, manager) run = emr.estimateMemoryResources(ctx, run, manager) if run.ServiceAccount == nil || *run.ServiceAccount == "" { run.ServiceAccount = aws.String(emr.emrJobSA) } if run.CommandHash != nil && run.NodeLifecycle != nil && *run.NodeLifecycle == state.SpotLifecycle { nodeType, err := manager.GetNodeLifecycle(ctx, run.DefinitionID, *run.CommandHash) if err == nil && nodeType == state.OndemandLifecycle { run.NodeLifecycle = &state.OndemandLifecycle } } startJobRunInput, err := emr.generateEMRStartJobRunInput(ctx, executable, run, manager) emrJobManifest := aws.String(fmt.Sprintf("%s/%s/%s.json", emr.s3ManifestBasePath, run.RunID, "start-job-run-input")) obj, err := json.MarshalIndent(startJobRunInput, "", "\t") if err == nil { emrJobManifest = emr.writeStringToS3(emrJobManifest, obj) } emr.log.Log("level", "info", "message", "Start EMR JobRun", "ExecutionRoleArn", startJobRunInput.ExecutionRoleArn) tierTag := fmt.Sprintf("tier:%s", run.Tier) startJobRunOutput, err := emr.emrContainersClient.StartJobRun(&startJobRunInput) if err == nil { run.SparkExtension.VirtualClusterId = startJobRunOutput.VirtualClusterId run.SparkExtension.EMRJobId = startJobRunOutput.Id run.SparkExtension.EMRJobManifest = emrJobManifest run.Status = state.StatusQueued _ = metrics.Increment(metrics.EngineEMRExecute, []string{string(metrics.StatusSuccess), tierTag}, 1) } else { run.ExitReason = aws.String(fmt.Sprintf("%v", err)) run.ExitCode = aws.Int64(-1) run.StartedAt = run.QueuedAt run.FinishedAt = run.QueuedAt run.Status = state.StatusStopped _ = emr.log.Log("level", "error", "message", "EMR job submission error", "error", err.Error()) _ = metrics.Increment(metrics.EngineEKSExecute, []string{string(metrics.StatusFailure), tierTag}, 1) return run, false, err } if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) } else { span.SetTag("emr.job_id", *run.SparkExtension.EMRJobId) span.SetTag("emr.virtual_cluster_id", *run.SparkExtension.VirtualClusterId) utils.TagJobRun(span, run) } return run, false, nil } func (emr *EMRExecutionEngine) generateApplicationConf(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) []*emrcontainers.Configuration { if ctx == nil { ctx = context.Background() } sparkDefaults := map[string]*string{ "spark.kubernetes.driver.podTemplateFile": emr.driverPodTemplate(ctx, executable, run, manager), "spark.kubernetes.executor.podTemplateFile": emr.executorPodTemplate(ctx, executable, run, manager), "spark.kubernetes.container.image": &run.Image, "spark.eventLog.dir": aws.String(fmt.Sprintf("s3://%s/%s", emr.s3LogsBucket, emr.s3EventLogPath)), "spark.history.fs.logDirectory": aws.String(fmt.Sprintf("s3://%s/%s", emr.s3LogsBucket, emr.s3EventLogPath)), "spark.eventLog.enabled": aws.String("true"), "spark.default.parallelism": aws.String("256"), "spark.sql.shuffle.partitions": aws.String("256"), // PrometheusServlet metrics config "spark.metrics.conf.*.sink.prometheusServlet.class": aws.String("org.apache.spark.metrics.sink.PrometheusServlet"), "spark.metrics.conf.*.sink.prometheusServlet.path": aws.String("/metrics/driver/prometheus"), "master.sink.prometheusServlet.path": aws.String("/metrics/master/prometheus"), "applications.sink.prometheusServlet.path": aws.String("/metrics/applications/prometheus"), // Metrics grouped per component instance and source namespace e.g., Component instance = Driver or Component 
instance = shuffleService "spark.kubernetes.driver.service.annotation.prometheus.io/port": aws.String("4040"), "spark.kubernetes.driver.service.annotation.prometheus.io/path": aws.String("/metrics/driver/prometheus/"), "spark.kubernetes.driver.service.annotation.prometheus.io/scrape": aws.String("true"), // Executor-level metrics are sent from each executor to the driver. Prometheus endpoint at: /metrics/executors/prometheus "spark.kubernetes.driver.annotation.prometheus.io/scrape": aws.String("true"), "spark.kubernetes.driver.annotation.prometheus.io/path": aws.String("/metrics/executors/prometheus/"), "spark.kubernetes.driver.annotation.prometheus.io/port": aws.String("4040"), "spark.ui.prometheus.enabled": aws.String("true"), } hiveDefaults := map[string]*string{} for _, k := range run.SparkExtension.ApplicationConf { sparkDefaults[*k.Name] = k.Value } if run.SparkExtension.HiveConf != nil { for _, k := range run.SparkExtension.HiveConf { if k.Name != nil && k.Value != nil { hiveDefaults[*k.Name] = k.Value } } } return []*emrcontainers.Configuration{ { Classification: aws.String("spark-defaults"), Properties: sparkDefaults, }, { Classification: aws.String("spark-hive-site"), Properties: hiveDefaults, }, } } func (emr *EMRExecutionEngine) generateEMRStartJobRunInput(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) (emrcontainers.StartJobRunInput, error) { roleArn := emr.emrJobRoleArn[*run.ServiceAccount] if ctx == nil { ctx = context.Background() } dbClusters, err := emr.stateManager.ListClusterStates(ctx) if err != nil { emr.log.Log("level", "error", "message", "failed to get clusters from database", "error", err.Error()) return emrcontainers.StartJobRunInput{}, err } var clusterID string clusterFound := false for _, cluster := range dbClusters { if cluster.Namespace == emr.emrJobNamespace && cluster.Name == run.ClusterName { clusterID = cluster.EMRVirtualCluster if cluster.SparkServerURI != "" { run.SparkExtension.SparkServerURI = aws.String(cluster.SparkServerURI) } clusterFound = true break } } if !clusterFound { clusterID = emr.emrVirtualClusters[run.ClusterName] } if clusterID == "" { return emrcontainers.StartJobRunInput{}, fmt.Errorf("EMR virtual cluster ID not found for EKS cluster: %s", run.ClusterName) } startJobRunInput := emrcontainers.StartJobRunInput{ ClientToken: &run.RunID, ConfigurationOverrides: &emrcontainers.ConfigurationOverrides{ MonitoringConfiguration: &emrcontainers.MonitoringConfiguration{ PersistentAppUI: aws.String(emrcontainers.PersistentAppUIEnabled), S3MonitoringConfiguration: &emrcontainers.S3MonitoringConfiguration{ LogUri: aws.String(fmt.Sprintf("s3://%s/%s", emr.s3LogsBucket, emr.s3LogsBasePath)), }, }, ApplicationConfiguration: emr.generateApplicationConf(ctx, executable, run, manager), }, ExecutionRoleArn: &roleArn, JobDriver: &emrcontainers.JobDriver{ SparkSubmitJobDriver: &emrcontainers.SparkSubmitJobDriver{ EntryPoint: run.SparkExtension.SparkSubmitJobDriver.EntryPoint, EntryPointArguments: run.SparkExtension.SparkSubmitJobDriver.EntryPointArguments, SparkSubmitParameters: emr.sparkSubmitParams(run), }}, Name: &run.RunID, ReleaseLabel: run.SparkExtension.EMRReleaseLabel, VirtualClusterId: &clusterID, } return startJobRunInput, nil } func (emr *EMRExecutionEngine) generateTags(run state.Run) map[string]*string { tags := make(map[string]*string) if run.Env != nil && len(*run.Env) > 0 { for _, ev := range *run.Env { name := emr.sanitizeEnvVar(ev.Name) space := regexp.MustCompile(`\s+`) if len(ev.Value) < 256 && 
len(name) < 128 { tags[name] = aws.String(space.ReplaceAllString(ev.Value, "")) } } } return tags } // generates volumes and volumemounts depending on cluster name. // TODO cleanup after migration func generateVolumesForCluster(clusterName string, isEmptyDir bool) ([]v1.Volume, []v1.VolumeMount) { var volumes []v1.Volume var volumeMounts []v1.VolumeMount if isEmptyDir { // Use a emptyDir volume specificVolume := v1.Volume{ Name: "shared-lib-volume", VolumeSource: v1.VolumeSource{ EmptyDir: &(v1.EmptyDirVolumeSource{}), }, } volumes = append(volumes, specificVolume) } else { // Use the persistent volume claim sharedLibVolume := v1.Volume{ Name: "shared-lib-volume", VolumeSource: v1.VolumeSource{ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ ClaimName: "s3-claim", }, }, } volumes = append(volumes, sharedLibVolume) } volumeMount := v1.VolumeMount{ Name: "shared-lib-volume", MountPath: "/var/lib/app", } volumeMounts = append(volumeMounts, volumeMount) return volumes, volumeMounts } func (emr *EMRExecutionEngine) driverPodTemplate(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) *string { if ctx == nil { ctx = context.Background() } // Override driver pods to always be on ondemand nodetypes. run.NodeLifecycle = &state.OndemandLifecycle workingDir := "/var/lib/app" if run.SparkExtension != nil && run.SparkExtension.SparkSubmitJobDriver != nil && run.SparkExtension.SparkSubmitJobDriver.WorkingDir != nil { workingDir = *run.SparkExtension.SparkSubmitJobDriver.WorkingDir } volumes, volumeMounts := generateVolumesForCluster(run.ClusterName, true) podSpec := v1.PodSpec{ TerminationGracePeriodSeconds: aws.Int64(90), Volumes: volumes, SchedulerName: emr.schedulerName, Containers: []v1.Container{ { Name: "spark-kubernetes-driver", Env: append(emr.envOverrides(executable, run), emr.lakekeeperSecretEnvVars()...), VolumeMounts: volumeMounts, WorkingDir: workingDir, }, }, InitContainers: []v1.Container{{ Name: fmt.Sprintf("init-driver-%s", run.RunID), Image: run.Image, Env: emr.envOverrides(executable, run), VolumeMounts: volumeMounts, Command: emr.constructCmdSlice(run.SparkExtension.DriverInitCommand), }}, RestartPolicy: v1.RestartPolicyNever, Affinity: emr.constructAffinity(ctx, executable, run, manager, true), Tolerations: emr.constructTolerations(executable, run), } if emr.driverInstanceType != "" { podSpec.NodeSelector = map[string]string{ "node.kubernetes.io/instance-type": emr.driverInstanceType, } } labels := state.GetLabels(run) pod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ "karpenter.sh/do-not-evict": "true", "flotilla-run-id": run.RunID, }, Labels: labels, }, Spec: podSpec, } key := aws.String(fmt.Sprintf("%s/%s/%s.yaml", emr.s3ManifestBasePath, run.RunID, "driver-template")) return emr.writeK8ObjToS3(&pod, key) } func (emr *EMRExecutionEngine) executorPodTemplate(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) *string { if ctx == nil { ctx = context.Background() } workingDir := "/var/lib/app" if run.SparkExtension != nil && run.SparkExtension.SparkSubmitJobDriver != nil && run.SparkExtension.SparkSubmitJobDriver.WorkingDir != nil { workingDir = *run.SparkExtension.SparkSubmitJobDriver.WorkingDir } labels := state.GetLabels(run) // TODO Remove after migration volumes, volumeMounts := generateVolumesForCluster(run.ClusterName, true) pod := v1.Pod{ Status: v1.PodStatus{}, ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ "karpenter.sh/do-not-evict": "true", 
"flotilla-run-id": run.RunID}, Labels: labels, }, Spec: v1.PodSpec{ TerminationGracePeriodSeconds: aws.Int64(90), Volumes: volumes, SchedulerName: emr.schedulerName, Containers: []v1.Container{ { Name: "spark-kubernetes-executor", Env: emr.envOverrides(executable, run), VolumeMounts: volumeMounts, WorkingDir: workingDir, }, }, InitContainers: []v1.Container{{ Name: fmt.Sprintf("init-executor-%s", run.RunID), Image: run.Image, Env: emr.envOverrides(executable, run), VolumeMounts: volumeMounts, Command: emr.constructCmdSlice(run.SparkExtension.ExecutorInitCommand), }}, RestartPolicy: v1.RestartPolicyNever, Affinity: emr.constructAffinity(ctx, executable, run, manager, false), Tolerations: emr.constructTolerations(executable, run), }, } key := aws.String(fmt.Sprintf("%s/%s/%s.yaml", emr.s3ManifestBasePath, run.RunID, "executor-template")) return emr.writeK8ObjToS3(&pod, key) } func (emr *EMRExecutionEngine) writeK8ObjToS3(obj runtime.Object, key *string) *string { var b0 bytes.Buffer err := emr.serializer.Encode(obj, &b0) payload := bytes.ReplaceAll(b0.Bytes(), []byte("status: {}"), []byte("")) payload = bytes.ReplaceAll(payload, []byte("creationTimestamp: null"), []byte("")) payload = bytes.ReplaceAll(payload, []byte("resources: {}"), []byte("")) if err == nil { putObject := s3.PutObjectInput{ Bucket: aws.String(emr.s3ManifestBucket), Body: bytes.NewReader(payload), Key: key, ContentType: aws.String("text/yaml"), } _, err = emr.s3Client.PutObject(&putObject) if err != nil { _ = emr.log.Log("level", "error", "message", "s3_upload_error", "error", err.Error()) } } return aws.String(fmt.Sprintf("s3://%s/%s", emr.s3ManifestBucket, *key)) } func (emr *EMRExecutionEngine) writeStringToS3(key *string, body []byte) *string { if body != nil && key != nil { putObject := s3.PutObjectInput{ Bucket: aws.String(emr.s3ManifestBucket), Body: bytes.NewReader(body), Key: key, ContentType: aws.String("text/yaml"), } _, err := emr.s3Client.PutObject(&putObject) if err != nil { _ = emr.log.Log("level", "error", "message", "s3_upload_error", "error", err.Error()) } } return aws.String(fmt.Sprintf("s3://%s/%s", emr.s3ManifestBucket, *key)) } func (emr *EMRExecutionEngine) constructEviction(ctx context.Context, run state.Run, manager state.Manager) string { if ctx == nil { ctx = context.Background() } if run.NodeLifecycle != nil && *run.NodeLifecycle == state.OndemandLifecycle { return "false" } if run.CommandHash != nil { nodeType, err := manager.GetNodeLifecycle(ctx, run.DefinitionID, *run.CommandHash) if err == nil && nodeType == state.OndemandLifecycle { return "false" } } return "true" } func (emr *EMRExecutionEngine) constructTolerations(executable state.Executable, run state.Run) []v1.Toleration { tolerations := []v1.Toleration{} tolerations = append(tolerations, v1.Toleration{ Key: "emr", Operator: "Equal", Value: "true", Effect: "NoSchedule", }) if team, ok := run.Labels["team"]; ok && team != "" { tolerations = append(tolerations, v1.Toleration{ Key: team, Operator: "Equal", Value: "true", Effect: "NoSchedule", }) } return tolerations } func (emr *EMRExecutionEngine) constructAffinity(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager, driver bool) *v1.Affinity { affinity := &v1.Affinity{} if ctx == nil { ctx = context.Background() } var requiredMatch []v1.NodeSelectorRequirement //todo move to config nodeLifecycleKey := "karpenter.sh/capacity-type" nodeArchKey := "kubernetes.io/arch" newCluster := true arch := []string{"amd64"} if run.Arch != nil && *run.Arch == 
"arm64" { arch = []string{"arm64"} } var nodeLifecycle []string nodePreference := "spot" if (run.NodeLifecycle != nil && *run.NodeLifecycle == state.OndemandLifecycle) || driver { nodeLifecycle = append(nodeLifecycle, "on-demand") nodePreference = "on-demand" } else { nodeLifecycle = append(nodeLifecycle, "spot", "on-demand") } if run.CommandHash != nil { nodeType, err := manager.GetNodeLifecycle(ctx, run.DefinitionID, *run.CommandHash) if err == nil && nodeType == state.OndemandLifecycle { nodeLifecycle = []string{"on-demand"} } } requiredMatch = append(requiredMatch, v1.NodeSelectorRequirement{ Key: nodeLifecycleKey, Operator: v1.NodeSelectorOpIn, Values: nodeLifecycle, }) requiredMatch = append(requiredMatch, v1.NodeSelectorRequirement{ Key: nodeArchKey, Operator: v1.NodeSelectorOpIn, Values: arch, }) if team, ok := run.Labels["team"]; ok && team != "" { requiredMatch = append(requiredMatch, v1.NodeSelectorRequirement{ Key: "team", Operator: v1.NodeSelectorOpIn, Values: []string{team}, }) if env := os.Getenv("FLOTILLA_MODE"); env != "" { requiredMatch = append(requiredMatch, v1.NodeSelectorRequirement{ Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{env}, }) } } //todo remove conditional after migration _, hasTeam := run.Labels["team"] if newCluster && !hasTeam { requiredMatch = append(requiredMatch, v1.NodeSelectorRequirement{ Key: "emr", Operator: v1.NodeSelectorOpIn, Values: []string{"true"}, }) } affinity = &v1.Affinity{ NodeAffinity: &v1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ NodeSelectorTerms: []v1.NodeSelectorTerm{ { MatchExpressions: requiredMatch, }, }, }, PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{{ Weight: 50, Preference: v1.NodeSelectorTerm{ MatchExpressions: []v1.NodeSelectorRequirement{{ Key: nodeLifecycleKey, Operator: v1.NodeSelectorOpIn, Values: []string{nodePreference}, }}, }, }}, }, PodAffinity: &v1.PodAffinity{ PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{ { Weight: 40, PodAffinityTerm: v1.PodAffinityTerm{ LabelSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "flotilla-run-id": run.RunID}, }, TopologyKey: "topology.kubernetes.io/zone", }, }, }, }, } return affinity } func (emr *EMRExecutionEngine) estimateExecutorCount(run state.Run, manager state.Manager) state.Run { return run } // buildMetricTags creates a standard set of tags for Spark ARA metrics func (emr *EMRExecutionEngine) buildMetricTags(run state.Run) []string { tags := []string{"engine:eks-spark"} if run.ClusterName != "" { tags = append(tags, fmt.Sprintf("cluster:%s", run.ClusterName)) } return tags } func setResourceSuffix(value string) string { if strings.Contains(value, "g") || strings.Contains(value, "m") { return strings.ToUpper(value) } if strings.Contains(value, "K") { return strings.ToLower(value) } return value } func (emr *EMRExecutionEngine) estimateMemoryResources(ctx context.Context, run state.Run, manager state.Manager) state.Run { // Early return for NULL command_hash if run.CommandHash == nil { metricTags := emr.buildMetricTags(run) _ = metrics.Increment(metrics.EngineEKSARANullCommandHash, metricTags, 1) if emr.log != nil { _ = emr.log.Log( "level", "warn", "message", "Skipping Spark ARA - NULL command_hash", "reason", "Spark job has no command_hash (malformed)", "run_id", run.RunID, "definition_id", run.DefinitionID, ) } return run } if ctx == nil { ctx = context.Background() } metricTags := emr.buildMetricTags(run) // Track adjustment attempt 
_ = metrics.Increment(metrics.EngineEKSARAEstimationAttempted, metricTags, 1) // Query for OOMs executorOOM, executorErr := manager.ExecutorOOM(ctx, run.DefinitionID, *run.CommandHash) driverOOM, driverErr := manager.DriverOOM(ctx, run.DefinitionID, *run.CommandHash) // Track query success/failure if executorErr != nil || driverErr != nil { var missingResource exceptions.MissingResource if errors.As(executorErr, &missingResource) || errors.As(driverErr, &missingResource) { // No historical data - expected for new jobs _ = metrics.Increment(metrics.EngineEKSARANoHistoricalData, metricTags, 1) } else { // Query failed with real error _ = metrics.Increment(metrics.EngineEKSARAEstimationFailed, metricTags, 1) } } else { // Query succeeded _ = metrics.Increment(metrics.EngineEKSARAEstimationSucceeded, metricTags, 1) } var sparkSubmitConf []state.Conf for _, k := range run.SparkExtension.SparkSubmitJobDriver.SparkSubmitConf { if *k.Name == "spark.executor.memory" && k.Value != nil { // 1.25x executor memory - OOM in the last 30 days if executorOOM { originalValue := *k.Value quantity := resource.MustParse(setResourceSuffix(originalValue)) originalMB := quantity.Value() / (1024 * 1024) // Convert to MB quantity.Set(int64(float64(quantity.Value()) * 1.25)) adjustedMB := quantity.Value() / (1024 * 1024) k.Value = aws.String(strings.ToLower(quantity.String())) // Emit metrics with component:executor tag executorTags := append(metricTags, "component:executor") _ = metrics.Increment(metrics.EngineEKSARAResourceAdjustment, executorTags, 1) _ = metrics.Histogram(metrics.EngineEKSARAMemoryIncreaseRatio, 1.25, executorTags, 1) _ = metrics.Distribution(metrics.EngineEKSARADefaultMemory, float64(originalMB), executorTags, 1) _ = metrics.Distribution(metrics.EngineEKSARAARAMemory, float64(adjustedMB), executorTags, 1) increaseMB := adjustedMB - originalMB _ = metrics.Distribution(metrics.EngineEKSARAMemoryIncrease, float64(increaseMB), executorTags, 1) // Log executor adjustment if emr.log != nil { _ = emr.log.Log( "level", "info", "message", "Spark ARA adjusted executor memory", "definition_id", run.DefinitionID, "run_id", run.RunID, "cluster", run.ClusterName, "component", "executor", "default_memory_mb", originalMB, "adjusted_memory_mb", adjustedMB, "increase_ratio", 1.25, "oom_detected", true, ) } } else { quantity := resource.MustParse(setResourceSuffix(*k.Value)) minVal := resource.MustParse("1G") if quantity.MilliValue() > minVal.MilliValue() { quantity.Set(int64(float64(quantity.Value()) * 1.0)) k.Value = aws.String(strings.ToLower(quantity.String())) } } } if driverOOM { // Bump up driver by 3x, jvm memory strings if *k.Name == "spark.driver.memory" && k.Value != nil { originalValue := *k.Value quantity := resource.MustParse(setResourceSuffix(originalValue)) originalMB := quantity.Value() / (1024 * 1024) quantity.Set(quantity.Value() * 3) adjustedMB := quantity.Value() / (1024 * 1024) k.Value = aws.String(strings.ToLower(quantity.String())) // Emit metrics with component:driver tag driverTags := append(metricTags, "component:driver") _ = metrics.Increment(metrics.EngineEKSARAResourceAdjustment, driverTags, 1) _ = metrics.Histogram(metrics.EngineEKSARAMemoryIncreaseRatio, 3.0, driverTags, 1) _ = metrics.Distribution(metrics.EngineEKSARADefaultMemory, float64(originalMB), driverTags, 1) _ = metrics.Distribution(metrics.EngineEKSARAARAMemory, float64(adjustedMB), driverTags, 1) increaseMB := adjustedMB - originalMB _ = metrics.Distribution(metrics.EngineEKSARAMemoryIncrease, float64(increaseMB), 
driverTags, 1) // Log driver adjustment if emr.log != nil { _ = emr.log.Log( "level", "info", "message", "Spark ARA adjusted driver memory", "definition_id", run.DefinitionID, "run_id", run.RunID, "cluster", run.ClusterName, "component", "driver", "default_memory_mb", originalMB, "adjusted_memory_mb", adjustedMB, "increase_ratio", 3.0, "oom_detected", true, ) } } } sparkSubmitConf = append(sparkSubmitConf, state.Conf{Name: k.Name, Value: k.Value}) } run.SparkExtension.SparkSubmitJobDriver.SparkSubmitConf = sparkSubmitConf return run } func (emr *EMRExecutionEngine) sparkSubmitParams(run state.Run) *string { var buffer bytes.Buffer buffer.WriteString(fmt.Sprintf(" --name %s", run.RunID)) for _, k := range run.SparkExtension.SparkSubmitJobDriver.SparkSubmitConf { buffer.WriteString(fmt.Sprintf(" --conf %s=%s", *k.Name, *k.Value)) } buffer.WriteString(fmt.Sprintf(" --conf %s=%s", "spark.kubernetes.executor.podNamePrefix", run.RunID)) buffer.WriteString(" --conf spark.log4j.rootLogger=DEBUG") buffer.WriteString(" --conf spark.log4j.rootCategory=DEBUG") if run.SparkExtension.SparkSubmitJobDriver.Class != nil { buffer.WriteString(fmt.Sprintf(" --class %s", *run.SparkExtension.SparkSubmitJobDriver.Class)) } if len(run.SparkExtension.SparkSubmitJobDriver.Files) > 0 { files := strings.Join(run.SparkExtension.SparkSubmitJobDriver.Files, ",") buffer.WriteString(fmt.Sprintf(" --files %s", files)) } if len(run.SparkExtension.SparkSubmitJobDriver.PyFiles) > 0 { files := strings.Join(run.SparkExtension.SparkSubmitJobDriver.PyFiles, ",") buffer.WriteString(fmt.Sprintf(" --py-files %s", files)) } if len(run.SparkExtension.SparkSubmitJobDriver.Jars) > 0 { jars := strings.Join(run.SparkExtension.SparkSubmitJobDriver.Jars, ",") buffer.WriteString(fmt.Sprintf(" --jars %s", jars)) } return aws.String(buffer.String()) } func (emr *EMRExecutionEngine) Terminate(ctx context.Context, run state.Run) error { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.emr_terminate", run.RunID) defer span.Finish() utils.TagJobRun(span, run) if run.Status == state.StatusStopped { return errors.New("Run is already in a stopped state.") } cancelJobRunInput := emrcontainers.CancelJobRunInput{ Id: run.SparkExtension.EMRJobId, VirtualClusterId: run.SparkExtension.VirtualClusterId, } tierTag := fmt.Sprintf("tier:%s", run.Tier) key := aws.String(fmt.Sprintf("%s/%s/%s.json", emr.s3ManifestBasePath, run.RunID, "cancel-job-run-input")) obj, err := json.Marshal(cancelJobRunInput) if err == nil { emr.writeStringToS3(key, obj) } _, err = emr.emrContainersClient.CancelJobRun(&cancelJobRunInput) if err != nil { _ = metrics.Increment(metrics.EngineEMRTerminate, []string{string(metrics.StatusFailure), tierTag}, 1) _ = emr.log.Log("level", "error", "message", "EMR job termination error", "error", err.Error()) return err } _ = metrics.Increment(metrics.EngineEMRTerminate, []string{string(metrics.StatusSuccess), tierTag}, 1) return nil } func (emr *EMRExecutionEngine) Enqueue(ctx context.Context, run state.Run) error { var span tracer.Span ctx, span = utils.TraceJob(ctx, "flotilla.job.emr_enqueue", "") defer span.Finish() span.SetTag("job.run_id", run.RunID) span.SetTag("job.tier", run.Tier) utils.TagJobRun(span, run) tierTag := fmt.Sprintf("tier:%s", run.Tier) qurl, err := emr.sqsQueueManager.QurlFor(emr.emrJobQueue, false) if err != nil { _ = metrics.Increment(metrics.EngineEMREnqueue, []string{string(metrics.StatusFailure), tierTag}, 1) _ = emr.log.Log("level", "error",
"message", "EMR job enqueue error", "error", err.Error()) return errors.Wrapf(err, "problem getting queue url for [%s]", run.ClusterName) } // Queue run if err = emr.sqsQueueManager.Enqueue(ctx, qurl, run); err != nil { _ = metrics.Increment(metrics.EngineEMREnqueue, []string{string(metrics.StatusFailure), tierTag}, 1) _ = emr.log.Log("level", "error", "message", "EMR job enqueue error", "error", err.Error()) return errors.Wrapf(err, "problem enqueing run [%s] to queue [%s]", run.RunID, qurl) } _ = metrics.Increment(metrics.EngineEMREnqueue, []string{string(metrics.StatusSuccess), tierTag}, 1) return nil } func (emr *EMRExecutionEngine) PollRuns(ctx context.Context) ([]RunReceipt, error) { qurl, err := emr.sqsQueueManager.QurlFor(emr.emrJobQueue, false) if err != nil { return nil, errors.Wrap(err, "problem listing queues to poll") } queues := []string{qurl} var runs []RunReceipt for _, qurl := range queues { // // Get new queued Run // runReceipt, err := emr.sqsQueueManager.ReceiveRun(ctx, qurl) if err != nil { return runs, errors.Wrapf(err, "problem receiving run from queue url [%s]", qurl) } if runReceipt.Run == nil { continue } runs = append(runs, RunReceipt{ RunReceipt: runReceipt, TraceID: runReceipt.TraceID, ParentID: runReceipt.ParentID, SamplingPriority: runReceipt.SamplingPriority, }) } return runs, nil } func (emr *EMRExecutionEngine) PollStatus(ctx context.Context) (RunReceipt, error) { return RunReceipt{}, nil } func (emr *EMRExecutionEngine) PollRunStatus(ctx context.Context) (state.Run, error) { return state.Run{}, nil } func (emr *EMRExecutionEngine) Define(ctx context.Context, td state.Definition) (state.Definition, error) { return td, nil } func (emr *EMRExecutionEngine) Deregister(ctx context.Context, definition state.Definition) error { return errors.Errorf("EMRExecutionEngine does not allow for deregistering of task definitions.") } func (emr *EMRExecutionEngine) Get(ctx context.Context, run state.Run) (state.Run, error) { if ctx == nil { ctx = context.Background() } return run, nil } func (emr *EMRExecutionEngine) GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.emr_get_events", run.RunID) defer span.Finish() utils.TagJobRun(span, run) return state.PodEventList{}, nil } func (emr *EMRExecutionEngine) FetchPodMetrics(ctx context.Context, run state.Run) (state.Run, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.emr_fetch_metrics", run.RunID) defer span.Finish() utils.TagJobRun(span, run) return run, nil } func (emr *EMRExecutionEngine) FetchUpdateStatus(ctx context.Context, run state.Run) (state.Run, error) { var span tracer.Span if ctx == nil { ctx = context.Background() } ctx, span = utils.TraceJob(ctx, "flotilla.job.emr_fetch_status", run.RunID) defer span.Finish() utils.TagJobRun(span, run) return run, nil } func (emr *EMRExecutionEngine) lakekeeperSecretEnvVars() []v1.EnvVar { if emr.lakekeeperSecretName == "" { return nil } return []v1.EnvVar{ { Name: "OAUTH2_CLIENT_ID", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: "client_id", Optional: aws.Bool(true), }, }, }, { Name: "OAUTH2_CLIENT_SECRET", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: 
"client_secret", Optional: aws.Bool(true), }, }, }, { Name: "OAUTH2_SERVER_URI", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: "token_url", Optional: aws.Bool(true), }, }, }, { Name: "OAUTH2_SCOPE", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: "scope", Optional: aws.Bool(true), }, }, }, { Name: "CATALOG_URI", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: "uri", Optional: aws.Bool(true), }, }, }, { Name: "WAREHOUSE", ValueFrom: &v1.EnvVarSource{ SecretKeyRef: &v1.SecretKeySelector{ LocalObjectReference: v1.LocalObjectReference{Name: emr.lakekeeperSecretName}, Key: "warehouse", Optional: aws.Bool(true), }, }, }, } } func (emr *EMRExecutionEngine) envOverrides(executable state.Executable, run state.Run) []v1.EnvVar { pairs := make(map[string]string) resources := executable.GetExecutableResources() if resources.Env != nil && len(*resources.Env) > 0 { for _, ev := range *resources.Env { name := emr.sanitizeEnvVar(ev.Name) value := ev.Value pairs[name] = value } } if run.Env != nil && len(*run.Env) > 0 { for _, ev := range *run.Env { name := emr.sanitizeEnvVar(ev.Name) value := ev.Value pairs[name] = value } } var res []v1.EnvVar for key := range pairs { if len(key) > 0 { res = append(res, v1.EnvVar{ Name: key, Value: pairs[key], }) } } return res } func (emr *EMRExecutionEngine) sanitizeEnvVar(key string) string { // Environment variable can't start with emr $ if strings.HasPrefix(key, "$") { key = strings.Replace(key, "$", "", 1) } // Environment variable names can't contain spaces. key = strings.Replace(key, " ", "", -1) return key } func (emr *EMRExecutionEngine) constructCmdSlice(command *string) []string { cmdString := "" if command != nil { cmdString = *command } bashCmd := "bash" optLogin := "-l" optStr := "-ce" return []string{bashCmd, optLogin, optStr, cmdString} } ================================================ FILE: execution/engine/engine.go ================================================ package engine import ( "context" "fmt" "github.com/pkg/errors" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/queue" "github.com/stitchfix/flotilla-os/state" ) // Engine defines the execution engine interface. type Engine interface { Initialize(conf config.Config) error Execute(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) (state.Run, bool, error) Terminate(ctx context.Context, run state.Run) error Enqueue(ctx context.Context, run state.Run) error PollRuns(ctx context.Context) ([]RunReceipt, error) PollRunStatus(ctx context.Context) (state.Run, error) PollStatus(ctx context.Context) (RunReceipt, error) GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) FetchUpdateStatus(ctx context.Context, run state.Run) (state.Run, error) FetchPodMetrics(ctx context.Context, run state.Run) (state.Run, error) // Legacy methods from the ECS era. Here for backwards compatibility. 
Define(ctx context.Context, definition state.Definition) (state.Definition, error) Deregister(ctx context.Context, definition state.Definition) error } type RunReceipt struct { queue.RunReceipt TraceID uint64 ParentID uint64 SamplingPriority int } // NewExecutionEngine initializes and returns a new Engine func NewExecutionEngine(conf config.Config, qm queue.Manager, name string, logger log.Logger, clusterManager *DynamicClusterManager, stateManager state.Manager) (Engine, error) { switch name { case state.EKSEngine: eksEng := &EKSExecutionEngine{qm: qm, log: logger, clusterManager: clusterManager, stateManager: stateManager} if err := eksEng.Initialize(conf); err != nil { return nil, errors.Wrap(err, "problem initializing EKSExecutionEngine") } return eksEng, nil case state.EKSSparkEngine: emrEng := &EMRExecutionEngine{sqsQueueManager: qm, log: logger, clusterManager: clusterManager, stateManager: stateManager} if err := emrEng.Initialize(conf); err != nil { return nil, errors.Wrap(err, "problem initializing EMRExecutionEngine") } return emrEng, nil default: return nil, fmt.Errorf("no Engine named [%s] was found", name) } } ================================================ FILE: flotilla/app.go ================================================ package flotilla import ( "context" "github.com/stitchfix/flotilla-os/clients/middleware" "github.com/stitchfix/flotilla-os/queue" "github.com/stitchfix/flotilla-os/utils" "net/http" "strings" "time" "github.com/pkg/errors" "github.com/rs/cors" "github.com/stitchfix/flotilla-os/clients/cluster" "github.com/stitchfix/flotilla-os/clients/logs" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/execution/engine" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/services" "github.com/stitchfix/flotilla-os/state" "github.com/stitchfix/flotilla-os/worker" ) type App struct { address string mode string corsAllowedOrigins []string logger flotillaLog.Logger readTimeout time.Duration writeTimeout time.Duration handler http.Handler workerManager worker.Worker } // Start the Application. func (app *App) Run() error { srv := &http.Server{ Addr: app.address, Handler: app.handler, ReadTimeout: app.readTimeout, WriteTimeout: app.writeTimeout, } // Start worker manager's run goroutine. app.workerManager.GetTomb().Go(func() error { ctx, span := utils.TraceJob(context.Background(), "worker_manager.run", "startup") defer span.Finish() return app.workerManager.Run(ctx) }) return srv.ListenAndServe() } // Function to initialize a new Flotilla app. 
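//
// Example (illustrative sketch) of how main-style wiring might call the constructors above:
// NewExecutionEngine once per engine, then NewApp, then Run. Construction of the config,
// logger, queue manager, state manager, cluster manager, and the logs/cluster/middleware
// clients is elided; those variable names are hypothetical placeholders, not APIs defined here.
//
//	eksEngine, err := engine.NewExecutionEngine(conf, qm, state.EKSEngine, logger, clusterManager, sm)
//	if err != nil { ... }
//	emrEngine, err := engine.NewExecutionEngine(conf, qm, state.EKSSparkEngine, logger, clusterManager, sm)
//	if err != nil { ... }
//	app, err := NewApp(conf, logger, logsClient, eksEngine, sm, clusterClient,
//		qm, emrEngine, qm, middlewareClient, clusterManager)
//	if err != nil { ... }
//	err = app.Run() // serves HTTP and starts the worker manager goroutine
//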
func NewApp(conf config.Config, log flotillaLog.Logger, eksLogsClient logs.Client, eksExecutionEngine engine.Engine, stateManager state.Manager, eksClusterClient cluster.Client, eksQueueManager queue.Manager, emrExecutionEngine engine.Engine, emrQueueManager queue.Manager, middlewareClient middleware.Client, clusterManager *engine.DynamicClusterManager, ) (App, error) { var app App app.logger = log app.configure(conf) executionService, err := services.NewExecutionService(conf, eksExecutionEngine, stateManager, eksClusterClient, emrExecutionEngine) if err != nil { return app, errors.Wrap(err, "problem initializing execution service") } templateService, err := services.NewTemplateService(conf, stateManager) if err != nil { return app, errors.Wrap(err, "problem initializing template service") } eksLogService, err := services.NewLogService(stateManager, eksLogsClient) if err != nil { return app, errors.Wrap(err, "problem initializing eks log service") } workerService, err := services.NewWorkerService(conf, stateManager) if err != nil { return app, errors.Wrap(err, "problem initializing worker service") } definitionService, err := services.NewDefinitionService(stateManager) if err != nil { return app, errors.Wrap(err, "problem initializing definition service") } ep := endpoints{ executionService: executionService, eksLogService: eksLogService, workerService: workerService, templateService: templateService, logger: log, middlewareClient: middlewareClient, definitionService: definitionService, } app.configureRoutes(ep) if err = app.initializeEKSWorkers(conf, log, eksExecutionEngine, emrExecutionEngine, stateManager, eksQueueManager, clusterManager); err != nil { return app, errors.Wrap(err, "problem eks initializing workers") } return app, nil } func (app *App) configure(conf config.Config) { app.address = conf.GetString("http_server_listen_address") if len(app.address) == 0 { app.address = ":5000" } readTimeout := conf.GetInt("http_server_read_timeout_seconds") if readTimeout == 0 { readTimeout = 5 } writeTimeout := conf.GetInt("http_server_write_timeout_seconds") if writeTimeout == 0 { writeTimeout = 10 } app.readTimeout = time.Duration(readTimeout) * time.Second app.writeTimeout = time.Duration(writeTimeout) * time.Second app.mode = conf.GetString("flotilla_mode") app.corsAllowedOrigins = strings.Split(conf.GetString("http_server_cors_allowed_origins"), ",") } func (app *App) configureRoutes(ep endpoints) { router := NewRouter(ep) c := cors.New(cors.Options{ AllowedOrigins: app.corsAllowedOrigins, AllowedMethods: []string{"GET", "DELETE", "POST", "PUT"}, }) app.handler = c.Handler(router) } func (app *App) initializeEKSWorkers( conf config.Config, log flotillaLog.Logger, ee engine.Engine, emr engine.Engine, sm state.Manager, qm queue.Manager, clusterManager *engine.DynamicClusterManager) error { workerManager, err := worker.NewWorker("worker_manager", log, conf, ee, emr, sm, qm, clusterManager) _ = app.logger.Log("level", "info", "message", "Starting worker", "name", "worker_manager") if err != nil { return errors.Wrapf(err, "problem initializing worker with name [%s]", "worker_manager") } app.workerManager = workerManager return nil } func (app *App) initializeEMRWorkers( conf config.Config, log flotillaLog.Logger, ee engine.Engine, emr engine.Engine, sm state.Manager, qm queue.Manager, clusterManager *engine.DynamicClusterManager) error { workerManager, err := worker.NewWorker("worker_manager", log, conf, ee, emr, sm, qm, clusterManager) _ = app.logger.Log("level", "info", "message", 
"Starting worker", "name", "worker_manager") if err != nil { return errors.Wrapf(err, "problem initializing worker with name [%s]", "worker_manager") } app.workerManager = workerManager return nil } ================================================ FILE: flotilla/endpoints.go ================================================ package flotilla import ( "encoding/json" "fmt" "net/http" "net/url" "strconv" "strings" "github.com/gorilla/mux" "github.com/stitchfix/flotilla-os/clients/middleware" "github.com/stitchfix/flotilla-os/exceptions" flotillaLog "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/services" "github.com/stitchfix/flotilla-os/state" "github.com/stitchfix/flotilla-os/utils" ) type endpoints struct { executionService services.ExecutionService definitionService services.DefinitionService templateService services.TemplateService eksLogService services.LogService workerService services.WorkerService middlewareClient middleware.Client logger flotillaLog.Logger } type listRequest struct { limit int offset int sortBy string order string filters map[string][]string envFilters map[string]string } func (ep *endpoints) getURLParam(v url.Values, key string, defaultValue string) string { val, ok := v[key] if ok && len(val) > 0 { return val[0] } return defaultValue } func (ep *endpoints) getFilters(params url.Values, nonFilters map[string]bool) (map[string][]string, map[string]string) { filters := make(map[string][]string) envFilters := make(map[string]string) for k, v := range params { if !nonFilters[k] && len(v) > 0 { // Env filters have the "env" key and are "|" separated key-value pairs // // eg. env=FOO|BAR&env=CUPCAKE|SPRINKLES // if k == "env" { for _, kv := range v { split := strings.Split(kv, "|") if len(split) == 2 { envFilters[split[0]] = split[1] } } } else { filters[k] = v } } } return filters, envFilters } func (ep *endpoints) decodeListRequest(r *http.Request) listRequest { var lr listRequest params := r.URL.Query() lr.limit, _ = strconv.Atoi(ep.getURLParam(params, "limit", "1024")) lr.offset, _ = strconv.Atoi(ep.getURLParam(params, "offset", "0")) lr.sortBy = ep.getURLParam(params, "sort_by", "group_name") lr.order = ep.getURLParam(params, "order", "asc") lr.filters, lr.envFilters = ep.getFilters(params, map[string]bool{ "limit": true, "offset": true, "sort_by": true, "order": true, }) return lr } // Note: the difference between this method and `decodeListRequest` is that // this method does not assume that all entities can be sorted by `group_name`. // Instead, it relies on the IOrderable interface's DefaultOrderField method. 
func (ep *endpoints) decodeOrderableListRequest(r *http.Request, orderable state.IOrderable) listRequest { var lr listRequest params := r.URL.Query() lr.limit, _ = strconv.Atoi(ep.getURLParam(params, "limit", "1024")) lr.offset, _ = strconv.Atoi(ep.getURLParam(params, "offset", "0")) lr.sortBy = ep.getURLParam(params, "sort_by", orderable.DefaultOrderField()) lr.order = ep.getURLParam(params, "order", "asc") lr.filters, lr.envFilters = ep.getFilters(params, map[string]bool{ "limit": true, "offset": true, "sort_by": true, "order": true, }) return lr } func (ep *endpoints) decodeRequest(r *http.Request, entity interface{}) error { return json.NewDecoder(r.Body).Decode(entity) } func (ep endpoints) encodeError(w http.ResponseWriter, err error) { w.Header().Set("Content-Type", "application/json; charset=utf-8") switch err.(type) { case exceptions.MalformedInput: w.WriteHeader(http.StatusBadRequest) case exceptions.ConflictingResource: w.WriteHeader(http.StatusConflict) case exceptions.MissingResource: w.WriteHeader(http.StatusNotFound) default: w.WriteHeader(http.StatusInternalServerError) } _ = json.NewEncoder(w).Encode(map[string]interface{}{ "error": err.Error(), }) } func (ep *endpoints) encodeResponse(w http.ResponseWriter, response interface{}) { w.Header().Set("Content-Type", "application/json; charset=utf-8") _ = json.NewEncoder(w).Encode(response) } func (ep *endpoints) ListDefinitions(w http.ResponseWriter, r *http.Request) { lr := ep.decodeListRequest(r) definitionList, err := ep.definitionService.List( r.Context(), lr.limit, lr.offset, lr.sortBy, lr.order, lr.filters, lr.envFilters) if definitionList.Definitions == nil { definitionList.Definitions = []state.Definition{} } if err != nil { ep.logger.Log( "level", "error", "message", "problem listing definitions", "operation", "ListDefinitions", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { response := make(map[string]interface{}) response["total"] = definitionList.Total response["definitions"] = definitionList.Definitions response["limit"] = lr.limit response["offset"] = lr.offset response["sort_by"] = lr.sortBy response["order"] = lr.order response["env_filters"] = lr.envFilters for k, v := range lr.filters { response[k] = v } ep.encodeResponse(w, response) } } // Fetches definition from DB using definition id. func (ep *endpoints) GetDefinition(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) definition, err := ep.definitionService.Get(r.Context(), vars["definition_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting definitions", "operation", "GetDefinition", "error", fmt.Sprintf("%+v", err), "definition_id", vars["definition_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, definition) } } // Fetches definition from DB using definition alias. func (ep *endpoints) GetDefinitionByAlias(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) definition, err := ep.definitionService.GetByAlias(r.Context(), vars["alias"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting definition by alias", "operation", "GetDefinitionByAlias", "error", fmt.Sprintf("%+v", err), "alias", vars["alias"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, definition) } } // Creates new definition. 
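//
// Example (illustrative): a minimal create-definition payload, mirroring the one used in
// endpoints_test.go later in this document; the values are placeholders.
//
//	POST /api/v1/task
//	{"alias": "cupcake", "memory": 100, "group_name": "cupcake",
//	 "image": "someimage", "command": "echo 'hi'"}
//
// A body that fails to decode is rejected as exceptions.MalformedInput, which encodeError
// above maps to 400 (409 for ConflictingResource, 404 for MissingResource, 500 otherwise).
//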
func (ep *endpoints) CreateDefinition(w http.ResponseWriter, r *http.Request) { var definition state.Definition err := ep.decodeRequest(r, &definition) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } created, err := ep.definitionService.Create(r.Context(), &definition) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating definition", "operation", "CreateDefinition", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, created) } } // Updates existing definition. func (ep *endpoints) UpdateDefinition(w http.ResponseWriter, r *http.Request) { var definition state.Definition err := ep.decodeRequest(r, &definition) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } vars := mux.Vars(r) updated, err := ep.definitionService.Update(r.Context(), vars["definition_id"], definition) if err != nil { ep.logger.Log( "level", "error", "message", "problem updating definition", "operation", "UpdateDefinition", "error", fmt.Sprintf("%+v", err), "definition_id", vars["definition_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, updated) } } // Deletes a defiition. func (ep *endpoints) DeleteDefinition(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) err := ep.definitionService.Delete(r.Context(), vars["definition_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem deleting definition", "operation", "DeleteDefinition", "error", fmt.Sprintf("%+v", err), "definition_id", vars["definition_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, map[string]bool{"deleted": true}) } } // List all runs, supports filtering based on environment variables. // ListRequest is object used here to construct the query. func (ep *endpoints) ListRuns(w http.ResponseWriter, r *http.Request) { lr := ep.decodeListRequest(r) runList, err := ep.executionService.List(r.Context(), lr.limit, lr.offset, lr.order, lr.sortBy, lr.filters, lr.envFilters) if err != nil { ep.logger.Log( "level", "error", "message", "problem listing runs", "operation", "ListRuns", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { response := make(map[string]interface{}) response["total"] = runList.Total response["history"] = runList.Runs response["limit"] = lr.limit response["offset"] = lr.offset response["sort_by"] = lr.sortBy response["order"] = lr.order response["env_filters"] = lr.envFilters for k, v := range lr.filters { response[k] = v } ep.encodeResponse(w, response) } } // List runs for a definition ID. func (ep *endpoints) ListDefinitionRuns(w http.ResponseWriter, r *http.Request) { lr := ep.decodeListRequest(r) vars := mux.Vars(r) definitionID, ok := vars["definition_id"] if ok { lr.filters["definition_id"] = []string{definitionID} } runList, err := ep.executionService.List(r.Context(), lr.limit, lr.offset, lr.order, lr.sortBy, lr.filters, lr.envFilters) if err != nil { ep.logger.Log( "level", "error", "message", "problem listing definition runs", "operation", "ListDefinitionRuns", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { response := ep.createListRunsResponse(runList, lr) ep.encodeResponse(w, response) } } // List runs based on a template id. 
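//
// Example (illustrative): query construction for the run-listing endpoints above and
// below. Env filters use the "env" key with "|"-separated key-value pairs (see getFilters);
// any other key becomes a plain filter. The history route shown matches the one exercised
// by the tests; list routes are registered in router.go.
//
//	q := url.Values{}
//	q.Add("env", "FOO|BAR")           // env var FOO must equal BAR
//	q.Add("env", "CUPCAKE|SPRINKLES") // a second env filter
//	q.Set("status", "RUNNING")        // plain filter
//	q.Set("limit", "25")
//	// GET /api/v1/history?env=FOO%7CBAR&env=CUPCAKE%7CSPRINKLES&status=RUNNING&limit=25
//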
func (ep *endpoints) ListTemplateRuns(w http.ResponseWriter, r *http.Request) { lr := ep.decodeListRequest(r) vars := mux.Vars(r) tplID, ok := vars["template_id"] if ok { lr.filters["executable_id"] = []string{tplID} } runList, err := ep.executionService.List(r.Context(), lr.limit, lr.offset, lr.order, lr.sortBy, lr.filters, lr.envFilters) if err != nil { ep.logger.Log( "level", "error", "message", "problem listing runs for template", "operation", "ListTemplateRuns", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { response := ep.createListRunsResponse(runList, lr) ep.encodeResponse(w, response) } } func (ep *endpoints) createListRunsResponse(runList state.RunList, req listRequest) map[string]interface{} { response := make(map[string]interface{}) response["total"] = runList.Total response["history"] = runList.Runs response["limit"] = req.limit response["offset"] = req.offset response["sort_by"] = req.sortBy response["order"] = req.order response["env_filters"] = req.envFilters for k, v := range req.filters { response[k] = v } return response } // Fetches a run based on Run ID. func (ep *endpoints) GetRun(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) run, err := ep.executionService.Get(r.Context(), vars["run_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting run", "operation", "GetRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Fetches a run based on Run ID. func (ep *endpoints) GetPayload(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) run, err := ep.executionService.Get(r.Context(), vars["run_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting run", "operation", "GetRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) ep.encodeError(w, err) } else { if run.ExecutionRequestCustom != nil { ep.encodeResponse(w, run.ExecutionRequestCustom) } else { ep.encodeResponse(w, map[string]string{}) } } } // Creates a new Run (deprecated). Only present for legacy support. func (ep *endpoints) CreateRun(w http.ResponseWriter, r *http.Request) { var lr state.LaunchRequest err := ep.decodeRequest(r, &lr) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } vars := mux.Vars(r) req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Env: lr.Env, OwnerID: "v1-unknown", Command: nil, Memory: nil, Cpu: nil, Gpu: nil, Engine: &state.DefaultEngine, EphemeralStorage: nil, NodeLifecycle: nil, CommandHash: nil, Tier: lr.Tier, }, } run, err := ep.executionService.CreateDefinitionRunByDefinitionID(r.Context(), vars["definition_id"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating run", "operation", "CreateRun", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Creates a new Run (deprecated). Only present for legacy support. 
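//
// Example (illustrative): the envelope produced by createListRunsResponse above; field
// values are placeholders, and any filters supplied on the request are echoed back as
// additional top-level keys.
//
//	{
//	  "total": 42,
//	  "history": [ ... runs ... ],
//	  "limit": 25,
//	  "offset": 0,
//	  "sort_by": "group_name",
//	  "order": "asc",
//	  "env_filters": {"FOO": "BAR"},
//	  "status": ["RUNNING"]
//	}
//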
func (ep *endpoints) CreateRunV2(w http.ResponseWriter, r *http.Request) { var lr state.LaunchRequestV2 err := ep.decodeRequest(r, &lr) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } err = ep.middlewareClient.AnnotateLaunchRequest(&r.Header, &lr) if err != nil { ep.encodeError(w, err) return } // check if OwnerEmail is present in lr.EventLabels if len(lr.RunTags.OwnerEmail) == 0 || len(lr.RunTags.TeamName) == 0 { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("run_tags must exist in body and contain [owner_email] and [team_name]")}) return } vars := mux.Vars(r) if lr.Engine == nil { if lr.SparkExtension != nil { lr.Engine = &state.EKSSparkEngine } else { lr.Engine = &state.EKSEngine } } req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Env: lr.Env, OwnerID: lr.RunTags.OwnerEmail, Command: nil, Memory: nil, Cpu: nil, Gpu: nil, Engine: lr.Engine, EphemeralStorage: nil, NodeLifecycle: nil, SparkExtension: lr.SparkExtension, Description: lr.Description, CommandHash: lr.CommandHash, IdempotenceKey: lr.IdempotenceKey, Arch: lr.Arch, Labels: lr.Labels, ServiceAccount: lr.ServiceAccount, Tier: lr.Tier, }, } run, err := ep.executionService.CreateDefinitionRunByDefinitionID(r.Context(), vars["definition_id"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating V2 run", "operation", "CreateRunV2", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Creates a new Run. func (ep *endpoints) CreateRunV4(w http.ResponseWriter, r *http.Request) { var lr state.LaunchRequestV2 err := ep.decodeRequest(r, &lr) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } err = ep.middlewareClient.AnnotateLaunchRequest(&r.Header, &lr) if err != nil { ep.encodeError(w, err) return } if len(lr.RunTags.OwnerID) == 0 { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("run_tags must exist in body and contain [owner_id]")}) return } if lr.Engine == nil { if lr.SparkExtension != nil { lr.Engine = &state.EKSSparkEngine } else { lr.Engine = &state.EKSEngine } } if lr.NodeLifecycle != nil { if !utils.StringSliceContains(state.NodeLifeCycles, *lr.NodeLifecycle) { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("Nodelifecyle must be [normal, spot]")}) return } } else { lr.NodeLifecycle = &state.DefaultLifecycle } vars := mux.Vars(r) req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Env: lr.Env, OwnerID: lr.RunTags.OwnerID, Command: lr.Command, Memory: lr.Memory, Cpu: lr.Cpu, Gpu: lr.Gpu, EphemeralStorage: lr.EphemeralStorage, Engine: lr.Engine, NodeLifecycle: lr.NodeLifecycle, ActiveDeadlineSeconds: lr.ActiveDeadlineSeconds, SparkExtension: lr.SparkExtension, Description: lr.Description, CommandHash: lr.CommandHash, IdempotenceKey: lr.IdempotenceKey, Arch: lr.Arch, Labels: lr.Labels, ServiceAccount: lr.ServiceAccount, Tier: lr.Tier, }, } run, err := ep.executionService.CreateDefinitionRunByDefinitionID(r.Context(), vars["definition_id"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating V4 run", "operation", "CreateRunV4", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Creates a new Run based on definition alias. 
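//
// Example (illustrative): a launch request in the shape read by CreateRunV4 above and
// CreateRunByAlias below. The route and the run_tags/labels field names mirror the
// endpoint tests; the node_lifecycle JSON key is an assumption based on the Go field
// name. run_tags.owner_id is required, and node_lifecycle, when set, must pass the
// state.NodeLifeCycles check (the error message lists [normal, spot]).
//
//	PUT /api/v4/task/<definition_id>/execute
//	{
//	  "cluster": "cluster1",
//	  "env": [{"name": "E1", "value": "V1"}],
//	  "run_tags": {"owner_id": "flotilla"},
//	  "node_lifecycle": "spot",
//	  "labels": {"foo": "bar"}
//	}
//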
func (ep *endpoints) CreateRunByAlias(w http.ResponseWriter, r *http.Request) { var lr state.LaunchRequestV2 err := ep.decodeRequest(r, &lr) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } err = ep.middlewareClient.AnnotateLaunchRequest(&r.Header, &lr) if err != nil { ep.encodeError(w, err) return } if len(lr.RunTags.OwnerID) == 0 { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("run_tags must exist in body and contain [owner_id]")}) return } if lr.Engine == nil || *lr.Engine == "ecs" { if lr.SparkExtension != nil { lr.Engine = &state.EKSSparkEngine } else { lr.Engine = &state.EKSEngine } } if lr.NodeLifecycle != nil { if !utils.StringSliceContains(state.NodeLifeCycles, *lr.NodeLifecycle) { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("Nodelifecyle must be [normal, spot]")}) return } } else { lr.NodeLifecycle = &state.DefaultLifecycle } vars := mux.Vars(r) req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Env: lr.Env, OwnerID: lr.RunTags.OwnerID, Command: lr.Command, Memory: lr.Memory, Cpu: lr.Cpu, Gpu: lr.Gpu, EphemeralStorage: lr.EphemeralStorage, Engine: lr.Engine, NodeLifecycle: lr.NodeLifecycle, ActiveDeadlineSeconds: lr.ActiveDeadlineSeconds, SparkExtension: lr.SparkExtension, Description: lr.Description, CommandHash: lr.CommandHash, IdempotenceKey: lr.IdempotenceKey, Arch: lr.Arch, Labels: lr.Labels, ServiceAccount: lr.ServiceAccount, Tier: lr.Tier, }, } run, err := ep.executionService.CreateDefinitionRunByAlias(r.Context(), vars["alias"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating run alias", "operation", "CreateRunByAlias", "error", fmt.Sprintf("%+v", err), "alias", vars["alias"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Stops a run based on run ID. func (ep *endpoints) StopRun(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) userInfo := ep.ExtractUserInfo(r) err := ep.executionService.Terminate(r.Context(), vars["run_id"], userInfo) if err != nil { ep.logger.Log( "level", "error", "message", "problem stopping run", "operation", "StopRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) } ep.encodeResponse(w, map[string]bool{"terminated": true}) } // Extracts user info if present in the headers.s func (ep *endpoints) ExtractUserInfo(r *http.Request) state.UserInfo { var userInfo state.UserInfo for name, headers := range r.Header { name = strings.ToLower(name) for _, h := range headers { if strings.Contains(name, "-name") { userInfo.Name = h } if strings.Contains(name, "-email") { userInfo.Email = h } } } return userInfo } // Update an existing run. func (ep *endpoints) UpdateRun(w http.ResponseWriter, r *http.Request) { var run state.Run err := ep.decodeRequest(r, &run) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } vars := mux.Vars(r) err = ep.executionService.UpdateStatus(r.Context(), vars["run_id"], run.Status, run.ExitCode, run.RunExceptions, run.ExitReason) if err != nil { ep.logger.Log( "level", "error", "message", "problem updating run", "operation", "UpdateRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, map[string]bool{"updated": true}) } } // Get Pod Events (EKS only) for a run ID. 
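//
// Aside (illustrative): ExtractUserInfo above matches on substrings of the lowercased
// header names, so any inbound header containing "-name" or "-email" populates the
// corresponding field; the exact header names below are hypothetical.
//
//	X-Forwarded-User-Name: Jane Doe        -> userInfo.Name  = "Jane Doe"
//	X-Forwarded-User-Email: jane@corp.com  -> userInfo.Email = "jane@corp.com"
//
// StopRun passes the extracted user info to the execution service when terminating a run.
//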
func (ep *endpoints) GetEvents(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) run, err := ep.executionService.Get(r.Context(), vars["run_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting run", "operation", "GetRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) ep.encodeError(w, err) return } var podEventList state.PodEventList if run.PodEvents != nil { podEventList.Total = len(*run.PodEvents) podEventList.PodEvents = *run.PodEvents } else { // If run doesn't have PodEvents in the cached record, fetch them podEventList, _ = ep.executionService.GetEvents(r.Context(), run) } ep.encodeResponse(w, podEventList) } // Get logs for a run. func (ep *endpoints) GetLogs(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) params := r.URL.Query() lastSeen := ep.getURLParam(params, "last_seen", "") rawText := ep.getStringBoolVal(ep.getURLParam(params, "raw_text", "")) run, err := ep.executionService.Get(r.Context(), vars["run_id"]) role := ep.getURLParam(params, "role", "driver") facility := ep.getURLParam(params, "facility", "stderr") if err != nil { _ = ep.logger.Log( "level", "error", "message", "problem getting run", "operation", "GetRun", "error", fmt.Sprintf("%+v", err), "run_id", vars["run_id"]) ep.encodeError(w, err) return } if run.Engine == nil { run.Engine = &state.DefaultEngine } if rawText { _ = ep.eksLogService.LogsText(vars["run_id"], w) } else { log, newLastSeen, err := ep.eksLogService.Logs(vars["run_id"], &lastSeen, &role, &facility) res := map[string]string{ "log": "", "last_seen": lastSeen, } if err == nil { res = map[string]string{ "log": log, "last_seen": *newLastSeen, } } ep.encodeResponse(w, res) } } // Get list of groups. func (ep *endpoints) GetGroups(w http.ResponseWriter, r *http.Request) { response := make(map[string]interface{}) response["total"] = 0 response["groups"] = []string{} ep.encodeResponse(w, response) } // Get listing of tags. func (ep *endpoints) GetTags(w http.ResponseWriter, r *http.Request) { response := make(map[string]interface{}) response["total"] = 0 response["tags"] = []string{} ep.encodeResponse(w, response) } func (ep *endpoints) ListClusters(w http.ResponseWriter, r *http.Request) { clusters, err := ep.executionService.ListClusters(r.Context()) if err != nil { ep.encodeError(w, err) return } ep.encodeResponse(w, map[string]interface{}{ "clusters": clusters, }) } // List active workers. func (ep *endpoints) ListWorkers(w http.ResponseWriter, r *http.Request) { wl, err := ep.workerService.List(r.Context(), state.EKSEngine) if wl.Workers == nil { wl.Workers = []state.Worker{} } if err != nil { ep.encodeError(w, err) } else { response := make(map[string]interface{}) response["total"] = wl.Total response["workers"] = wl.Workers ep.encodeResponse(w, response) } } // Get information about an active worker. func (ep *endpoints) GetWorker(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) worker, err := ep.workerService.Get(r.Context(), vars["worker_type"], state.DefaultEngine) if err != nil { ep.encodeError(w, err) } else { ep.encodeResponse(w, worker) } } // Update worker counts.
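//
// Aside (illustrative): paging logs through GetLogs above. Each call returns the next
// chunk plus an opaque last_seen cursor to echo back; raw_text=true writes plain text
// instead of JSON. The route matches the endpoint tests; role and facility default to
// "driver" and "stderr".
//
//	GET /api/v1/<run_id>/logs                -> {"log": "...", "last_seen": "t1"}
//	GET /api/v1/<run_id>/logs?last_seen=t1   -> {"log": "...", "last_seen": "t2"}
//	GET /api/v1/<run_id>/logs?raw_text=true  -> plain-text log body
//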
func (ep *endpoints) UpdateWorker(w http.ResponseWriter, r *http.Request) { var worker state.Worker err := ep.decodeRequest(r, &worker) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } vars := mux.Vars(r) updated, err := ep.workerService.Update(r.Context(), vars["worker_type"], worker) if err != nil { ep.encodeError(w, err) } else { ep.encodeResponse(w, updated) } } // Update batches of workers - used to turn on/off in bulk. func (ep *endpoints) BatchUpdateWorkers(w http.ResponseWriter, r *http.Request) { var wks []state.Worker err := ep.decodeRequest(r, &wks) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } updated, err := ep.workerService.BatchUpdate(r.Context(), wks) if err != nil { ep.encodeError(w, err) } else { ep.encodeResponse(w, updated) } } func (ep *endpoints) getStringBoolVal(s string) bool { l := strings.ToLower(s) if l == "true" { return true } return false } // Create a new template run based on template name/alias. func (ep *endpoints) CreateTemplateRunByName(w http.ResponseWriter, r *http.Request) { var req state.TemplateExecutionRequest err := ep.decodeRequest(r, &req) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } if len(req.OwnerID) == 0 { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("request payload must contain [owner_id]; the run_tags field is deprecated for the v7 endpoint.")}) return } req.Engine = &state.DefaultEngine if req.NodeLifecycle != nil { if !utils.StringSliceContains(state.NodeLifeCycles, *req.NodeLifecycle) { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("Nodelifecyle must be [normal, spot]")}) return } } else { req.NodeLifecycle = &state.DefaultLifecycle } vars := mux.Vars(r) run, err := ep.executionService.CreateTemplateRunByTemplateName(r.Context(), vars["template_name"], vars["template_version"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating template run", "operation", "CreateTemplateRun", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // Create a new template run based on template id. func (ep *endpoints) CreateTemplateRun(w http.ResponseWriter, r *http.Request) { var req state.TemplateExecutionRequest err := ep.decodeRequest(r, &req) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } if len(req.OwnerID) == 0 { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("request payload must contain [owner_id]; the run_tags field is deprecated for the v7 endpoint.")}) return } req.Engine = &state.DefaultEngine if req.NodeLifecycle != nil { if !utils.StringSliceContains(state.NodeLifeCycles, *req.NodeLifecycle) { ep.encodeError(w, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("Nodelifecyle must be [normal, spot]")}) return } } else { req.NodeLifecycle = &state.DefaultLifecycle } vars := mux.Vars(r) run, err := ep.executionService.CreateTemplateRunByTemplateID(r.Context(), vars["template_id"], &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating template run", "operation", "CreateTemplateRun", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, run) } } // List all templates. 
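//
// Example (illustrative): a template execution request for the endpoints above. owner_id
// is required at the top level (the error text notes run_tags is deprecated for the v7
// endpoint); node_lifecycle gets the same [normal, spot] validation as the task
// endpoints, and the engine is always forced to the default. Field names other than
// owner_id are assumptions.
//
//	{
//	  "owner_id": "flotilla",
//	  "node_lifecycle": "normal",
//	  "env": [{"name": "E1", "value": "V1"}]
//	}
//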
func (ep *endpoints) ListTemplates(w http.ResponseWriter, r *http.Request) { var ( tl state.TemplateList err error ) lr := ep.decodeOrderableListRequest(r, &state.Template{}) params := r.URL.Query() latestOnly := ep.getStringBoolVal(ep.getURLParam(params, "latest_only", "true")) if latestOnly == true { tl, err = ep.templateService.ListLatestOnly(r.Context(), lr.limit, lr.offset, lr.sortBy, lr.order) } else { tl, err = ep.templateService.List(r.Context(), lr.limit, lr.offset, lr.sortBy, lr.order) } if tl.Templates == nil { tl.Templates = []state.Template{} } if err != nil { ep.logger.Log( "level", "error", "message", "problem listing templates", "operation", "ListTemplates", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { response := make(map[string]interface{}) response["total"] = tl.Total response["templates"] = tl.Templates response["limit"] = lr.limit response["offset"] = lr.offset response["sort_by"] = lr.sortBy response["order"] = lr.order ep.encodeResponse(w, response) } } // Get a template. func (ep *endpoints) GetTemplate(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) tpl, err := ep.templateService.GetByID(r.Context(), vars["template_id"]) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting templates", "operation", "GetTemplate", "error", fmt.Sprintf("%+v", err), "template_id", vars["template_id"]) ep.encodeError(w, err) } else { ep.encodeResponse(w, tpl) } } // Create a template. func (ep *endpoints) CreateTemplate(w http.ResponseWriter, r *http.Request) { var req state.CreateTemplateRequest err := ep.decodeRequest(r, &req) if err != nil { ep.encodeError(w, exceptions.MalformedInput{ErrorString: err.Error()}) return } created, err := ep.templateService.Create(r.Context(), &req) if err != nil { ep.logger.Log( "level", "error", "message", "problem creating template", "operation", "CreateTemplate", "error", fmt.Sprintf("%+v", err)) ep.encodeError(w, err) } else { ep.encodeResponse(w, created) } } // Get a cluster. func (ep *endpoints) GetCluster(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) cluster, err := ep.executionService.GetClusterByID(r.Context(), vars["cluster_id"]) if err != nil { ep.encodeError(w, err) return } ep.encodeResponse(w, cluster) } // Update a cluster. func (ep *endpoints) UpdateCluster(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) var clusterMetadata state.ClusterMetadata if err := json.NewDecoder(r.Body).Decode(&clusterMetadata); err != nil { ep.encodeError(w, err) return } if vars["cluster_id"] != "" { clusterMetadata.ID = vars["cluster_id"] } err := ep.executionService.UpdateClusterMetadata(r.Context(), clusterMetadata) if err != nil { ep.encodeError(w, err) return } ep.encodeResponse(w, map[string]bool{"updated": true}) } func (ep *endpoints) DeleteCluster(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) err := ep.executionService.DeleteClusterMetadata(r.Context(), vars["cluster_id"]) if err != nil { ep.encodeError(w, err) return } ep.encodeResponse(w, map[string]bool{"deleted": true}) } // Health check endpoint. func (ep *endpoints) HealthCheck(w http.ResponseWriter, r *http.Request) { ep.encodeResponse(w, map[string]string{ "status": "healthy", "message": "Service is up and running", }) } // Create a new cluster. 
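//
// Aside (illustrative): GetRunStatus below supports cheap polling. It sets
// Cache-Control: max-age=5 and an ETag of the quoted "<status>-<exit_code>" pair
// ("unknown" while the exit code is nil), so a poller that echoes the ETag via
// If-None-Match receives 304 Not Modified until the status or exit code changes.
// The status strings here are placeholders; the route is registered in router.go.
//
//	GET <status route>/<run_id>              -> 200, ETag: "RUNNING-unknown"
//	GET ... If-None-Match: "RUNNING-unknown" -> 304 Not Modified
//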
func (ep *endpoints) CreateCluster(w http.ResponseWriter, r *http.Request) { var cluster state.ClusterMetadata if err := json.NewDecoder(r.Body).Decode(&cluster); err != nil { ep.encodeError(w, err) return } cluster.ID = "" err := ep.executionService.UpdateClusterMetadata(r.Context(), cluster) if err != nil { ep.encodeError(w, err) return } ep.encodeResponse(w, map[string]bool{"created": true}) } func (ep *endpoints) GetRunStatus(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) runID := vars["run_id"] status, err := ep.executionService.GetRunStatus(r.Context(), runID) if err != nil { ep.logger.Log( "level", "error", "message", "problem getting run status", "operation", "GetRunStatus", "error", fmt.Sprintf("%+v", err), "run_id", runID) ep.encodeError(w, err) return } w.Header().Set("Cache-Control", "max-age=5") // Cache for 5 seconds exitCode := "unknown" if status.ExitCode != nil { exitCode = fmt.Sprintf("%v", *status.ExitCode) } statusHash := fmt.Sprintf("%s-%s", status.Status, exitCode) etag := fmt.Sprintf(`"%s"`, statusHash) w.Header().Set("ETag", etag) if match := r.Header.Get("If-None-Match"); match != "" && match == etag { w.WriteHeader(http.StatusNotModified) return } ep.encodeResponse(w, status) } ================================================ FILE: flotilla/endpoints_test.go ================================================ package flotilla import ( "bytes" "encoding/json" "net/http/httptest" "testing" "github.com/stitchfix/flotilla-os/clients/middleware" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/services" "github.com/stitchfix/flotilla-os/state" "github.com/stitchfix/flotilla-os/testutils" muxtrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/gorilla/mux" ) func setUp(t *testing.T) *muxtrace.Router { confDir := "../conf" c, _ := config.NewConfig(&confDir) imp := testutils.ImplementsAllTheThings{ T: t, Definitions: map[string]state.Definition{ "A": {DefinitionID: "A", Alias: "aliasA"}, "B": {DefinitionID: "B", Alias: "aliasB"}, "C": {DefinitionID: "C", Alias: "aliasC", ExecutableResources: state.ExecutableResources{Image: "invalidimage"}}, }, Runs: map[string]state.Run{ "runA": {DefinitionID: "A", ClusterName: "cluster1", GroupName: "A", RunID: "runA", Status: state.StatusRunning}, "runB": {DefinitionID: "B", ClusterName: "cluster2", GroupName: "B", RunID: "runB", InstanceDNSName: "cupcakedns", InstanceID: "cupcakeid"}, }, Qurls: map[string]string{ "A": "a/", "B": "b/", }, ClusterStates: []state.ClusterMetadata{ {Name: "cluster1", Status: state.StatusActive, StatusReason: "Active and healthy"}, {Name: "cluster2", Status: state.StatusActive, StatusReason: "Active and healthy"}, }, Groups: []string{"g1", "g2", "g3"}, Tags: []string{"t1", "t2", "t3"}, } ds, _ := services.NewDefinitionService(&imp) es, _ := services.NewExecutionService(c, &imp, &imp, &imp, &imp) ls, _ := services.NewLogService(&imp, &imp) mwc, _ := middleware.NewClient() ep := endpoints{definitionService: ds, executionService: es, eksLogService: ls, middlewareClient: mwc} return NewRouter(ep) } func TestEndpoints_CreateDefinition(t *testing.T) { router := setUp(t) newDef := `{"alias":"cupcake", "memory":100, "group_name":"cupcake", "image":"someimage", "command":"echo 'hi'"}` req := httptest.NewRequest("POST", "/api/v1/task", bytes.NewBufferString(newDef)) w := httptest.NewRecorder() router.ServeHTTP(w, req) resp := w.Result() if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" { t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was 
[%s]", resp.Header.Get("Content-Type")) } if resp.StatusCode != 200 { t.Errorf("Expected status 200, was %v", resp.StatusCode) } r := state.Definition{} err := json.NewDecoder(resp.Body).Decode(&r) if err != nil { t.Error(err.Error()) } if len(r.DefinitionID) == 0 { t.Errorf("Expected non-empty definition id") } } func TestEndpoints_UpdateDefinition(t *testing.T) { router := setUp(t) updatedDef := `{"image":"updatedImage"}` req := httptest.NewRequest("PUT", "/api/v1/task/A", bytes.NewBufferString(updatedDef)) w := httptest.NewRecorder() router.ServeHTTP(w, req) resp := w.Result() if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" { t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type")) } if resp.StatusCode != 200 { t.Errorf("Expected status 200, was %v", resp.StatusCode) } r := state.Definition{} err := json.NewDecoder(resp.Body).Decode(&r) if err != nil { t.Error(err.Error()) } if r.Image != "updatedImage" { t.Errorf("Expected image [updatedImage] but was [%s]", r.Image) } } func TestEndpoints_CreateRun(t *testing.T) { router := setUp(t) newRun := `{"cluster":"cupcake", "env":[{"name":"E1","value":"V1"}]}` req := httptest.NewRequest("PUT", "/api/v1/task/A/execute", bytes.NewBufferString(newRun)) w := httptest.NewRecorder() router.ServeHTTP(w, req) resp := w.Result() if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" { t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type")) } if resp.StatusCode != 200 { t.Errorf("Expected status 200, was %v", resp.StatusCode) } r := state.Run{} err := json.NewDecoder(resp.Body).Decode(&r) if err != nil { t.Error(err.Error()) } if len(r.RunID) == 0 { t.Errorf("Expected non-empty run id") } if r.Status != state.StatusQueued { t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status) } } func TestEndpoints_CreateRun2(t *testing.T) { router := setUp(t) newRun := `{"cluster":"cupcake", "env":[{"name":"E1","value":"V1"}], "run_tags":{"owner_email":"flotilla@github.com", "team_name":"thebest"}}` req := httptest.NewRequest("PUT", "/api/v2/task/A/execute", bytes.NewBufferString(newRun)) w := httptest.NewRecorder() router.ServeHTTP(w, req) resp := w.Result() if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" { t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type")) } if resp.StatusCode != 200 { t.Errorf("Expected status 200, was %v", resp.StatusCode) } r := state.Run{} err := json.NewDecoder(resp.Body).Decode(&r) if err != nil { t.Error(err.Error()) } if len(r.RunID) == 0 { t.Errorf("Expected non-empty run id") } if r.Status != state.StatusQueued { t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status) } if r.User != "flotilla@github.com" { t.Errorf("Expected new run to have user set to run_tags.owner_email but was [%s]", r.User) } } func TestEndpoints_CreateRun4(t *testing.T) { router := setUp(t) newRun := `{"cluster":"cluster1", "env":[{"name":"E1","value":"V1"}], "run_tags":{"owner_id":"flotilla"}, "labels": {"foo": "bar"}}` req := httptest.NewRequest("PUT", "/api/v4/task/A/execute", bytes.NewBufferString(newRun)) w := httptest.NewRecorder() router.ServeHTTP(w, req) resp := w.Result() if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" { t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", 
func TestEndpoints_CreateDefinition(t *testing.T) {
    router := setUp(t)

    newDef := `{"alias":"cupcake", "memory":100, "group_name":"cupcake", "image":"someimage", "command":"echo 'hi'"}`
    req := httptest.NewRequest("POST", "/api/v1/task", bytes.NewBufferString(newDef))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    r := state.Definition{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if len(r.DefinitionID) == 0 {
        t.Errorf("Expected non-empty definition id")
    }
}

func TestEndpoints_UpdateDefinition(t *testing.T) {
    router := setUp(t)

    updatedDef := `{"image":"updatedImage"}`
    req := httptest.NewRequest("PUT", "/api/v1/task/A", bytes.NewBufferString(updatedDef))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    r := state.Definition{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if r.Image != "updatedImage" {
        t.Errorf("Expected image [updatedImage] but was [%s]", r.Image)
    }
}

func TestEndpoints_CreateRun(t *testing.T) {
    router := setUp(t)

    newRun := `{"cluster":"cupcake", "env":[{"name":"E1","value":"V1"}]}`
    req := httptest.NewRequest("PUT", "/api/v1/task/A/execute", bytes.NewBufferString(newRun))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    r := state.Run{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if len(r.RunID) == 0 {
        t.Errorf("Expected non-empty run id")
    }
    if r.Status != state.StatusQueued {
        t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status)
    }
}

func TestEndpoints_CreateRun2(t *testing.T) {
    router := setUp(t)

    newRun := `{"cluster":"cupcake", "env":[{"name":"E1","value":"V1"}], "run_tags":{"owner_email":"flotilla@github.com", "team_name":"thebest"}}`
    req := httptest.NewRequest("PUT", "/api/v2/task/A/execute", bytes.NewBufferString(newRun))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    r := state.Run{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if len(r.RunID) == 0 {
        t.Errorf("Expected non-empty run id")
    }
    if r.Status != state.StatusQueued {
        t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status)
    }
    if r.User != "flotilla@github.com" {
        t.Errorf("Expected new run to have user set to run_tags.owner_email but was [%s]", r.User)
    }
}
func TestEndpoints_CreateRun4(t *testing.T) {
    router := setUp(t)

    newRun := `{"cluster":"cluster1", "env":[{"name":"E1","value":"V1"}], "run_tags":{"owner_id":"flotilla"}, "labels": {"foo": "bar"}}`
    req := httptest.NewRequest("PUT", "/api/v4/task/A/execute", bytes.NewBufferString(newRun))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v\n%s", resp.StatusCode, resp.Status)
    }

    r := state.Run{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if len(r.RunID) == 0 {
        t.Errorf("Expected non-empty run id")
    }
    if r.Status != state.StatusQueued {
        t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status)
    }
    if len(r.Labels) != 1 || r.Labels["foo"] != "bar" {
        labelRes, _ := json.Marshal(r.Labels)
        t.Error(string(labelRes))
    }
    if r.User != "flotilla" {
        t.Errorf("Expected new run to have user set to run_tags.owner_id but was [%s]", r.User)
    }
}

func TestEndpoints_CreateRunByAlias(t *testing.T) {
    router := setUp(t)

    newRun := `{"cluster":"cupcake", "env":[{"name":"E1","value":"V1"}], "run_tags":{"owner_id":"flotilla"}}`
    req := httptest.NewRequest("PUT", "/api/v1/task/alias/aliasA/execute", bytes.NewBufferString(newRun))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    r := state.Run{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if len(r.RunID) == 0 {
        t.Errorf("Expected non-empty run id")
    }
    if r.Status != state.StatusQueued {
        t.Errorf("Expected new run to have status [%s] but was [%s]", state.StatusQueued, r.Status)
    }
    if r.User != "flotilla" {
        t.Errorf("Expected new run to have user set to run_tags.owner_id but was [%s]", r.User)
    }
}

func TestEndpoints_DeleteDefinition(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("DELETE", "/api/v1/task/A", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var ack map[string]bool
    err := json.NewDecoder(resp.Body).Decode(&ack)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := ack["deleted"]; !ok {
        t.Errorf("Expected [deleted] acknowledgement")
    }
}

func TestEndpoints_GetDefinition(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/task/A", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r state.Definition
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if r.DefinitionID != "A" {
        t.Errorf("Expected definition_id [A] but was [%s]", r.DefinitionID)
    }
    if r.Env == nil {
        t.Errorf("Expected non-nil environment")
    }
}
func TestEndpoints_GetDefinitionByAlias(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/task/alias/aliasA", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r state.Definition
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if r.DefinitionID != "A" {
        t.Errorf("Expected definition_id [A] but was [%s]", r.DefinitionID)
    }
    if r.Env == nil {
        t.Errorf("Expected non-nil environment")
    }
}

func TestEndpoints_GetGroups(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/groups", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := r["total"]; !ok {
        t.Errorf("Expected total in response")
    }
    if _, ok := r["groups"]; !ok {
        t.Errorf("Expected groups in response")
    }
    groups, _ := r["groups"]
    if _, ok := groups.([]interface{}); !ok {
        t.Errorf("Cannot cast groups to list, expected list")
    }
}

func TestEndpoints_GetLogs(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/runA/logs", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := r["log"]; !ok {
        t.Errorf("Expected log in response")
    }
}

func TestEndpoints_GetRun(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/history/runA", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r state.Run
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if r.RunID != "runA" {
        t.Errorf("Expected run with runID [runA] but was [%s]", r.RunID)
    }
}

func TestEndpoints_GetRun2(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/history/runB", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var other map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&other)
    if err != nil {
        t.Error(err.Error())
    }
    instance, ok := other["instance"]
    if !ok {
        t.Errorf("Expected [instance] in response")
    }
    if _, ok = instance.(map[string]interface{}); !ok {
        t.Errorf("Expected [instance] in response to be a map")
    }
}

func TestEndpoints_GetTags(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/tags", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := r["total"]; !ok {
        t.Errorf("Expected total in response")
    }
    if _, ok := r["tags"]; !ok {
        t.Errorf("Expected tags in response")
    }
    tags, _ := r["tags"]
    if _, ok := tags.([]interface{}); !ok {
        t.Errorf("Cannot cast tags to list, expected list")
    }
}
func TestEndpoints_ListDefinitions(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v1/task?limit=100&offset=2&sort_by=alias&order=desc&group_name=cupcake&env=E1%7CV1", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := r["total"]; !ok {
        t.Errorf("Expected total in response")
    }
    if _, ok := r["definitions"]; !ok {
        t.Errorf("Expected definitions in response")
    }
    if _, ok := r["limit"]; !ok {
        t.Errorf("Expected limit in response")
    }
    if _, ok := r["offset"]; !ok {
        t.Errorf("Expected offset in response")
    }
    if _, ok := r["sort_by"]; !ok {
        t.Errorf("Expected sort_by in response")
    }
    if _, ok := r["order"]; !ok {
        t.Errorf("Expected order in response")
    }
    if _, ok := r["group_name"]; !ok {
        t.Errorf("Expected [group_name] filter in response")
    }
    if _, ok := r["env_filters"]; !ok {
        t.Errorf("Expected env_filters in response")
    }
    definitions, _ := r["definitions"]
    if _, ok := definitions.([]interface{}); !ok {
        t.Errorf("Cannot cast definitions to list, expected list")
    }
    envFilters, _ := r["env_filters"]
    if _, ok := envFilters.(map[string]interface{}); !ok {
        t.Errorf("Cannot cast env_filters to map, expected map")
    }
    envFiltersMap := envFilters.(map[string]interface{})
    e1Filter, ok := envFiltersMap["E1"]
    if !ok {
        t.Errorf("Expected env_filters to contain key [E1]")
    }
    if e1Filter.(string) != "V1" {
        t.Errorf("Expected env_filter [E1:V1]")
    }
}

func TestEndpoints_ListRuns(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest(
        "GET",
        "/api/v1/history?status=RUNNING&status=QUEUED&limit=100&offset=2&sort_by=started_at&order=desc&cluster=cupcake&env=E1%7CV1",
        nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var r map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&r)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := r["total"]; !ok {
        t.Errorf("Expected total in response")
    }
    if _, ok := r["history"]; !ok {
        t.Errorf("Expected runs in response")
    }
    if _, ok := r["limit"]; !ok {
        t.Errorf("Expected limit in response")
    }
    if _, ok := r["offset"]; !ok {
        t.Errorf("Expected offset in response")
    }
    if _, ok := r["sort_by"]; !ok {
        t.Errorf("Expected sort_by in response")
    }
    if _, ok := r["order"]; !ok {
        t.Errorf("Expected order in response")
    }
    if _, ok := r["cluster"]; !ok {
        t.Errorf("Expected [cluster] filter in response")
    }
    if _, ok := r["env_filters"]; !ok {
        t.Errorf("Expected env_filters in response")
    }
    if _, ok := r["status"]; !ok {
        t.Errorf("Expected [status] filter in response")
    }
    runs, _ := r["history"]
    if _, ok := runs.([]interface{}); !ok {
        t.Errorf("Cannot cast runs to list, expected list")
    }
    statusFilters, _ := r["status"]
    if _, ok := statusFilters.([]interface{}); !ok {
        t.Errorf("Cannot cast status filters to list, expected list")
    }
    expectedStatusFilters := map[string]bool{"RUNNING": true, "QUEUED": true}
    statusFiltersList := statusFilters.([]interface{})
    if len(statusFiltersList) != 2 {
        t.Errorf("Expected 2 status filters, was %v", len(statusFiltersList))
    }
    for _, statusFilter := range statusFiltersList {
        if _, ok := expectedStatusFilters[statusFilter.(string)]; !ok {
            t.Errorf("Unexpected status filter: %s", statusFilter.(string))
        }
    }
    envFilters, _ := r["env_filters"]
    if _, ok := envFilters.(map[string]interface{}); !ok {
        t.Errorf("Cannot cast env_filters to map, expected map")
    }
    envFiltersMap := envFilters.(map[string]interface{})
    e1Filter, ok := envFiltersMap["E1"]
    if !ok {
        t.Errorf("Expected env_filters to contain key [E1]")
    }
    if e1Filter.(string) != "V1" {
        t.Errorf("Expected env_filter [E1:V1]")
    }
}
func TestEndpoints_StopRun(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("DELETE", "/api/v1/task/A/history/runA", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var ack map[string]bool
    err := json.NewDecoder(resp.Body).Decode(&ack)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := ack["terminated"]; !ok {
        t.Errorf("Expected [terminated] acknowledgement")
    }
}

func TestEndpoints_ListClusters(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v6/clusters", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var response map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&response)
    if err != nil {
        t.Error(err.Error())
    }
    clusters, ok := response["clusters"]
    if !ok {
        t.Errorf("Expected clusters in response")
    }
    clustersList, ok := clusters.([]interface{})
    if !ok {
        t.Errorf("Cannot cast clusters to list, expected list")
    }
    if len(clustersList) != 2 {
        t.Errorf("Expected 2 clusters, got %d", len(clustersList))
    }
    cluster, ok := clustersList[0].(map[string]interface{})
    if !ok {
        t.Errorf("Cannot cast cluster to map, expected map")
    }
    if _, ok := cluster["name"]; !ok {
        t.Errorf("Expected cluster to have name field")
    }
    if _, ok := cluster["status"]; !ok {
        t.Errorf("Expected cluster to have status field")
    }
}

func TestEndpoints_GetCluster(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("GET", "/api/v6/clusters/cluster1", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var cluster map[string]interface{}
    err := json.NewDecoder(resp.Body).Decode(&cluster)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := cluster["name"]; !ok {
        t.Errorf("Expected cluster to have name field")
    }
    if _, ok := cluster["status"]; !ok {
        t.Errorf("Expected cluster to have status field")
    }
}

func TestEndpoints_UpdateCluster(t *testing.T) {
    router := setUp(t)
    updateReq := `{"status":"ACTIVE", "reason":"Testing update"}`
    req := httptest.NewRequest("PUT", "/api/v6/clusters/cluster1", bytes.NewBufferString(updateReq))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var ack map[string]bool
    err := json.NewDecoder(resp.Body).Decode(&ack)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := ack["updated"]; !ok {
        t.Errorf("Expected [updated] acknowledgement")
    }
}

func TestEndpoints_DeleteCluster(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("DELETE", "/api/v6/clusters/cluster1", nil)
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var ack map[string]bool
    err := json.NewDecoder(resp.Body).Decode(&ack)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := ack["deleted"]; !ok {
        t.Errorf("Expected [deleted] acknowledgement")
    }
}

func TestEndpoints_CreateCluster(t *testing.T) {
    router := setUp(t)
    req := httptest.NewRequest("POST", "/api/v6/clusters", bytes.NewBufferString(`{"name":"cluster1", "status":"ACTIVE", "reason":"Testing create"}`))
    w := httptest.NewRecorder()
    router.ServeHTTP(w, req)

    resp := w.Result()
    if resp.Header.Get("Content-Type") != "application/json; charset=utf-8" {
        t.Errorf("Expected Content-Type [application/json; charset=utf-8], but was [%s]", resp.Header.Get("Content-Type"))
    }
    if resp.StatusCode != 200 {
        t.Errorf("Expected status 200, was %v", resp.StatusCode)
    }

    var ack map[string]bool
    err := json.NewDecoder(resp.Body).Decode(&ack)
    if err != nil {
        t.Error(err.Error())
    }
    if _, ok := ack["created"]; !ok {
        t.Errorf("Expected [created] acknowledgement")
    }
}
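The list tests above pass env filters as NAME|VALUE pairs in the env query parameter and hand-encode the pipe as %7C. A small sketch of building the same query programmatically with net/url, which produces the identical encoding; the filter values here are illustrative only, not part of the repository:

package main

import (
    "fmt"
    "net/url"
)

// Builds the kind of query string TestEndpoints_ListRuns hand-writes.
// url.Values.Encode percent-escapes the pipe in "E1|V1" as %7C.
func main() {
    q := url.Values{}
    q.Add("status", "RUNNING")
    q.Add("status", "QUEUED") // repeated keys become status=...&status=...
    q.Set("limit", "100")
    q.Set("offset", "2")
    q.Set("sort_by", "started_at")
    q.Set("order", "desc")
    q.Set("cluster", "cupcake")
    q.Set("env", "E1|V1") // encoded as env=E1%7CV1

    fmt.Println("/api/v1/history?" + q.Encode())
}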

================================================
FILE: flotilla/router.go
================================================
package flotilla

import (
    muxtrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/gorilla/mux"
)

// NewRouter creates and returns a Mux Router
func NewRouter(ep endpoints) *muxtrace.Router {
    r := muxtrace.NewRouter()

    v1 := r.PathPrefix("/api/v1").Subrouter()
    v1.HandleFunc("/task", ep.ListDefinitions).Methods("GET")
    v1.HandleFunc("/task", ep.CreateDefinition).Methods("POST")
    v1.HandleFunc("/task/{definition_id}", ep.GetDefinition).Methods("GET")
    v1.HandleFunc("/task/{definition_id}", ep.UpdateDefinition).Methods("PUT")
    v1.HandleFunc("/task/{definition_id}", ep.DeleteDefinition).Methods("DELETE")
    v1.HandleFunc("/task/{definition_id}/execute", ep.CreateRun).Methods("PUT")
    v1.HandleFunc("/task/alias/{alias}", ep.GetDefinitionByAlias).Methods("GET")
    v1.HandleFunc("/task/alias/{alias}/execute", ep.CreateRunByAlias).Methods("PUT")
    v1.HandleFunc("/history", ep.ListRuns).Methods("GET")
    v1.HandleFunc("/history/{run_id}", ep.GetRun).Methods("GET")
    v1.HandleFunc("/task/history/{run_id}", ep.GetRun).Methods("GET")
    v1.HandleFunc("/task/{definition_id}/history", ep.ListDefinitionRuns).Methods("GET")
    v1.HandleFunc("/task/{definition_id}/history/{run_id}", ep.GetRun).Methods("GET")
    v1.HandleFunc("/task/{definition_id}/history/{run_id}", ep.StopRun).Methods("DELETE")
    v1.HandleFunc("/{run_id}/status", ep.UpdateRun).Methods("PUT")
    v1.HandleFunc("/{run_id}/logs", ep.GetLogs).Methods("GET")
    v1.HandleFunc("/{run_id}/events", ep.GetEvents).Methods("GET")
    v1.HandleFunc("/groups", ep.GetGroups).Methods("GET")
    v1.HandleFunc("/tags", ep.GetTags).Methods("GET")
    v1.HandleFunc("/clusters", ep.ListClusters).Methods("GET")

    v2 := r.PathPrefix("/api/v2").Subrouter()
    v2.HandleFunc("/task/{definition_id}/execute", ep.CreateRunV2).Methods("PUT")

    v4 := r.PathPrefix("/api/v4").Subrouter()
    v4.HandleFunc("/task/{definition_id}/execute", ep.CreateRunV4).Methods("PUT")

    v5 := r.PathPrefix("/api/v5").Subrouter()
    v5.HandleFunc("/worker", ep.ListWorkers).Methods("GET")
    v5.HandleFunc("/worker", ep.BatchUpdateWorkers).Methods("PUT")
    v5.HandleFunc("/worker/{worker_type}", ep.GetWorker).Methods("GET")
    v5.HandleFunc("/worker/{worker_type}", ep.UpdateWorker).Methods("PUT")

    v6 := r.PathPrefix("/api/v6").Subrouter()
    v6.HandleFunc("/clusters", ep.ListClusters).Methods("GET")
    v6.HandleFunc("/clusters", ep.CreateCluster).Methods("POST")
    v6.HandleFunc("/clusters/{cluster_id}", ep.GetCluster).Methods("GET")
    v6.HandleFunc("/clusters/{cluster_id}", ep.UpdateCluster).Methods("PUT")
    v6.HandleFunc("/clusters/{cluster_id}", ep.DeleteCluster).Methods("DELETE")
    v6.HandleFunc("/{run_id}/events", ep.GetEvents).Methods("GET")
    v6.HandleFunc("/groups", ep.GetGroups).Methods("GET")
    v6.HandleFunc("/health", ep.HealthCheck).Methods("GET")
    v6.HandleFunc("/history", ep.ListRuns).Methods("GET")
    v6.HandleFunc("/history/{run_id}", ep.GetRun).Methods("GET")
    v6.HandleFunc("/tags", ep.GetTags).Methods("GET")
    v6.HandleFunc("/task", ep.ListDefinitions).Methods("GET")
    v6.HandleFunc("/task", ep.CreateDefinition).Methods("POST")
    v6.HandleFunc("/task/alias/{alias}", ep.GetDefinitionByAlias).Methods("GET")
    v6.HandleFunc("/task/alias/{alias}/execute", ep.CreateRunByAlias).Methods("PUT")
    v6.HandleFunc("/task/{definition_id}", ep.GetDefinition).Methods("GET")
    v6.HandleFunc("/task/{definition_id}", ep.UpdateDefinition).Methods("PUT")
    v6.HandleFunc("/task/{definition_id}", ep.DeleteDefinition).Methods("DELETE")
    v6.HandleFunc("/task/{definition_id}/execute", ep.CreateRunV4).Methods("PUT")
    v6.HandleFunc("/task/{definition_id}/history", ep.ListDefinitionRuns).Methods("GET")
    v6.HandleFunc("/task/{definition_id}/history/{run_id}", ep.GetRun).Methods("GET")
    v6.HandleFunc("/task/{definition_id}/history/{run_id}", ep.StopRun).Methods("DELETE")
    v6.HandleFunc("/task/history/{run_id}", ep.GetRun).Methods("GET")
    v6.HandleFunc("/{run_id}/status", ep.UpdateRun).Methods("PUT")
    v6.HandleFunc("/{run_id}/status", ep.GetRunStatus).Methods("GET")
    v6.HandleFunc("/{run_id}/logs", ep.GetLogs).Methods("GET")

    v7 := r.PathPrefix("/api/v7").Subrouter()
    v7.HandleFunc("/template/{template_id}/execute", ep.CreateTemplateRun).Methods("PUT")
    v7.HandleFunc("/template/name/{template_name}/version/{template_version}/execute", ep.CreateTemplateRunByName).Methods("PUT")
    v7.HandleFunc("/template", ep.ListTemplates).Methods("GET")
    v7.HandleFunc("/template", ep.CreateTemplate).Methods("POST")
    v7.HandleFunc("/template/{template_id}", ep.GetTemplate).Methods("GET")
    v7.HandleFunc("/template/history/{run_id}", ep.GetRun).Methods("GET")
    v7.HandleFunc("/template/{template_id}/history", ep.ListTemplateRuns).Methods("GET")
    v7.HandleFunc("/template/{template_id}/history/{run_id}", ep.GetRun).Methods("GET")
    v7.HandleFunc("/template/{template_id}/history/{run_id}", ep.StopRun).Methods("DELETE")

    return r
}
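Because muxtrace.Router wraps gorilla/mux and satisfies http.Handler, the router mounts directly into a standard HTTP server. A minimal sketch of that mounting follows; the serveSketch name, listen address, and the assumption that a fully wired endpoints value is available are all placeholders (in this repository the real server wiring lives in flotilla/app.go):

package flotilla

import "net/http"

// serveSketch is illustrative only: it shows that the traced router from
// NewRouter can be passed straight to net/http. The address is a
// placeholder assumption.
func serveSketch(ep endpoints) error {
    router := NewRouter(ep)
    // muxtrace.Router implements http.Handler, so it mounts directly.
    return http.ListenAndServe(":3000", router)
}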

================================================
FILE: go.mod
================================================
module github.com/stitchfix/flotilla-os

go 1.26.1

require (
    github.com/DataDog/datadog-go/v5 v5.1.0
    github.com/Masterminds/sprig v2.22.0+incompatible
    github.com/aws/aws-sdk-go v1.40.18
    github.com/go-kit/kit v0.9.0
    github.com/go-redis/redis v6.15.9+incompatible
    github.com/gorilla/mux v1.7.4-0.20190701202633-d83b6ffe499a
    github.com/jmoiron/sqlx v1.2.1-0.20190426154859-38398a30ed85
    github.com/lib/pq v1.10.2
    github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d
    github.com/pkg/errors v0.9.1
    github.com/rs/cors v1.6.1-0.20190613161432-33ffc0734c60
    github.com/spf13/viper v1.4.1-0.20190614151712-3349bd9cc288
    github.com/xeipuuv/gojsonschema v0.0.0-20180618132009-1d523034197f
    go.uber.org/multierr v1.5.0
    gopkg.in/DataDog/dd-trace-go.v1 v1.38.0
    gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637
    k8s.io/api v0.35.0
    k8s.io/apimachinery v0.35.0
    k8s.io/client-go v0.35.0
    k8s.io/metrics v0.35.0
)

require (
    github.com/DataDog/datadog-agent/pkg/obfuscate v0.0.0-20211129110424-6491aa3bf583 // indirect
    github.com/DataDog/datadog-go v4.8.3+incompatible // indirect
    github.com/DataDog/sketches-go v1.0.0 // indirect
    github.com/Masterminds/goutils v1.1.1 // indirect
    github.com/Masterminds/semver v1.5.0 // indirect
    github.com/Microsoft/go-winio v0.5.1 // indirect
    github.com/cespare/xxhash/v2 v2.1.2 // indirect
    github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
    github.com/dgraph-io/ristretto v0.1.0 // indirect
    github.com/dustin/go-humanize v1.0.0 // indirect
    github.com/emicklei/go-restful/v3 v3.12.2 // indirect
    github.com/fsnotify/fsnotify v1.4.9 // indirect
    github.com/fxamacker/cbor/v2 v2.9.0 // indirect
    github.com/go-logfmt/logfmt v0.5.0 // indirect
    github.com/go-logr/logr v1.4.3 // indirect
    github.com/go-openapi/jsonpointer v0.21.0 // indirect
    github.com/go-openapi/jsonreference v0.20.2 // indirect
    github.com/go-openapi/swag v0.23.0 // indirect
    github.com/golang/glog v1.2.4 // indirect
    github.com/golang/protobuf v1.5.4 // indirect
    github.com/google/gnostic-models v0.7.0 // indirect
    github.com/google/uuid v1.6.0 // indirect
    github.com/hashicorp/hcl v1.0.0 // indirect
    github.com/huandu/xstrings v1.3.0 // indirect
    github.com/imdario/mergo v0.3.6 // indirect
    github.com/jmespath/go-jmespath v0.4.0 // indirect
    github.com/josharian/intern v1.0.0 // indirect
    github.com/json-iterator/go v1.1.12 // indirect
    github.com/magiconair/properties v1.8.1 // indirect
    github.com/mailru/easyjson v0.7.7 // indirect
    github.com/mitchellh/copystructure v1.0.0 // indirect
    github.com/mitchellh/mapstructure v1.4.2 // indirect
    github.com/mitchellh/reflectwalk v1.0.0 // indirect
    github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
    github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
    github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
    github.com/pelletier/go-toml v1.7.0 // indirect
    github.com/philhofer/fwd v1.1.1 // indirect
    github.com/spf13/afero v1.2.2 // indirect
    github.com/spf13/cast v1.3.0 // indirect
    github.com/spf13/jwalterweatherman v1.0.0 // indirect
github.com/spf13/pflag v1.0.9 // indirect github.com/subosito/gotenv v1.2.0 // indirect github.com/tinylib/msgp v1.1.2 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect go.uber.org/atomic v1.6.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.45.0 // indirect golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.9.0 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect google.golang.org/protobuf v1.36.8 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) ================================================ FILE: go.sum ================================================ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= 
cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= cloud.google.com/go/pubsub v1.4.0/go.mod h1:LFrqilwgdw4X2cJS9ALgzYmMu+ULyrUN6IHV3CPK4TM= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI= github.com/Azure/go-autorest/autorest/adal v0.5.0/go.mod h1:8Z9fGy2MpX0PvDjB1pEgQTmVqjGhiHBW7RJJEciWzS0= github.com/Azure/go-autorest/autorest/date v0.1.0/go.mod h1:plvfp3oPSKwf2DNjlBjWF/7vwR+cUD/ELuzDCXwHUVA= github.com/Azure/go-autorest/autorest/mocks v0.1.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= github.com/Azure/go-autorest/autorest/mocks v0.2.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc= github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/DataDog/datadog-agent/pkg/obfuscate v0.0.0-20211129110424-6491aa3bf583 h1:3nVO1nQyh64IUY6BPZUpMYMZ738Pu+LsMt3E0eqqIYw= github.com/DataDog/datadog-agent/pkg/obfuscate v0.0.0-20211129110424-6491aa3bf583/go.mod h1:EP9f4GqaDJyP1F5jTNMtzdIpw3JpNs3rMSJOnYywCiw= github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go v4.8.2+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go v4.8.3+incompatible h1:fNGaYSuObuQb5nzeTQqowRAd9bpDIRRV4/gUtIBjh8Q= github.com/DataDog/datadog-go v4.8.3+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go/v5 v5.0.2/go.mod h1:ZI9JFB4ewXbw1sBnF4sxsR2k1H3xjV+PUAOUsHvKpcU= github.com/DataDog/datadog-go/v5 v5.1.0 h1:Zmq3tCk9+Tdq8Du73M71Zo6Dyx+cEo9QkCSCqQlHFaQ= github.com/DataDog/datadog-go/v5 v5.1.0/go.mod h1:KhiYb2Badlv9/rofz+OznKoEF5XKTonWyhx5K83AP8E= github.com/DataDog/gostackparse v0.5.0/go.mod h1:lTfqcJKqS9KnXQGnyQMCugq3u1FP6UZMfWR0aitKFMM= github.com/DataDog/sketches-go v1.0.0 h1:chm5KSXO7kO+ywGWJ0Zs6tdmWU8PBXSbywFVciL6BG4= github.com/DataDog/sketches-go v1.0.0/go.mod h1:O+XkJHWk9w4hDwY2ZUDU31ZC9sNYlYo8DiFsxjYeo1k= github.com/DataDog/zstd v1.3.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 
v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= github.com/Microsoft/go-winio v0.5.0/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= github.com/Microsoft/go-winio v0.5.1 h1:aPJp2QD7OOrhO5tQXqQoGSJc+DjDtWTGLOmNyAm6FgY= github.com/Microsoft/go-winio v0.5.1/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/Shopify/sarama v1.22.0/go.mod h1:lm3THZ8reqBDBQKQyb5HB3sY1lKp3grEbQ81aWSgPp4= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/andybalholm/brotli v1.0.2/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-metrics v0.3.0/go.mod h1:zXjbSimjXTd7vOpY8B0/2LpvNvDoXBuplAD+gJD3GYs= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/aws/aws-sdk-go v1.25.37/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aws/aws-sdk-go v1.34.28/go.mod h1:H7NKnBqNVzoTJpGfLrQkkD+ytBA93eiDYi/+8rV9s48= github.com/aws/aws-sdk-go v1.40.18 h1:ifWmCucvV20Kyx2t/l9+8gGqNzZ4CW+HO5uz8bCOK/o= github.com/aws/aws-sdk-go v1.40.18/go.mod h1:585smgzpB/KqRA+K3y/NL/oYRqQvpNJYvLm+LY1U59Q= github.com/aws/aws-sdk-go-v2 v1.0.0/go.mod h1:smfAbmpW+tcRVuNUjo3MOArSZmW72t62rkCzc2i0TWM= github.com/aws/aws-sdk-go-v2/config v1.0.0/go.mod h1:WysE/OpUgE37tjtmtJd8GXgT8s1euilE5XtUkRNUQ1w= github.com/aws/aws-sdk-go-v2/credentials v1.0.0/go.mod h1:/SvsiqBf509hG4Bddigr3NB12MIpfHhZapyBurJe8aY= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.0.0/go.mod h1:wpMHDCXvOXZxGCRSidyepa8uJHY4vaBGfY2/+oKU/Bc= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.0.0/go.mod h1:3jExOmpbjgPnz2FJaMOfbSk1heTkZ66aD3yNtVhnjvI= github.com/aws/aws-sdk-go-v2/service/sqs v1.0.0/go.mod h1:w5BclCU8ptTbagzXS/fHBr+vAyXUjggg/72qDIURKMk= github.com/aws/aws-sdk-go-v2/service/sts v1.0.0/go.mod h1:5f+cELGATgill5Pu3/vK3Ebuigstc+qYEHW5MvGWZO4= github.com/aws/smithy-go v1.0.0/go.mod h1:EzMw8dbp/YJL4A5/sbhGddag+NPT7q084agLbB9LgIw= github.com/aws/smithy-go v1.11.0/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/bgentry/speakeasy v0.1.0/go.mod 
h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k= github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4= github.com/bradfitz/gomemcache v0.0.0-20220106215444-fb4bf637b56d/go.mod h1:H0wQNHz2YrLsuXOZozoeDmnHXkNCRmMW0gwFWDfEZDA= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ= github.com/confluentinc/confluent-kafka-go v1.4.0/go.mod h1:u2zNLny2xq+5rWeTQjFHbDzzNuba4P1vo31r9r4uAdg= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v0.0.0-20151105211317-5215b55f46b2/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/denisenkom/go-mssqldb v0.0.0-20200428022330-06a60b6afbbc/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= github.com/denisenkom/go-mssqldb v0.11.0 h1:9rHa233rhdOyrz2GcP9NM+gi2psgJZ4GWDpL/7ND8HI= github.com/denisenkom/go-mssqldb v0.11.0/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= github.com/dgraph-io/ristretto v0.1.0 h1:Jv3CGQHp9OjuMBSne1485aDpUkTKEcUqF+jm/LuerPI= github.com/dgraph-io/ristretto 
v0.1.0/go.mod h1:fux0lOrBhrVCJd3lcTHsIJhq1T2rokOu6v9Vcb3Q9ug= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= github.com/elastic/go-elasticsearch/v6 v6.8.5/go.mod h1:UwaDJsD3rWLM5rKNFzv9hgox93HoX8utj1kxD9aFUcI= github.com/elastic/go-elasticsearch/v7 v7.17.1/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4= github.com/elazarl/goproxy v0.0.0-20170405201442-c4fc26588b6e/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/erikstmartin/go-testdb v0.0.0-20160219214506-8d10e4a1bae5/go.mod h1:a2zkGnVExMxdzMo3M0Hi/3sEU+cWnZpSni0O6/Yb/P0= github.com/evanphx/json-patch v4.2.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/garyburd/redigo v1.6.3/go.mod h1:rTb6epsqigu3kYKBnaF028A7Tf/Aw5s0cqA47doKKqw= github.com/ghodss/yaml 
v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.7.0/go.mod h1:jD2toBW3GZUr5UMcdrwQA10I7RuaFOl/SGeDjXkfUtY= github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= github.com/go-asn1-ber/asn1-ber v1.3.1/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-chi/chi v1.5.0/go.mod h1:REp24E+25iKvxgeTfHmdUoL5x15kBiDBlnIl5bCwe2k= github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0 h1:wDJmvq38kDhkVxi50ni9ykkdUr1PKgqKOoi01fa0Mdk= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-ldap/ldap/v3 v3.1.3/go.mod h1:3rbOH3jRS2u6jg2rJnKAMLE/xQyCKIveG2Sa/Cohzb8= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0 h1:TrB8swr/68K7m9CcGut2g3UOihhbcbiMAYiuTXdEih4= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-openapi/jsonpointer v0.0.0-20160704185906-46af16f9f7b1/go.mod h1:+35s3my2LFTysnkMfxsJBAMHj/DoqoB9knIWoYG/Vk0= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.0.0-20160704190145-13c6e3589ad9/go.mod h1:W3Z9FmVs9qj+KR4zFKmDPGiLdk1D9Rlm7cyMvf57TTg= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= github.com/go-openapi/spec v0.0.0-20160808142527-6aced65f8501/go.mod h1:J8+jY1nAiCcj+friV/PDoE1/3eeccG9LYBs0tYvLOWc= github.com/go-openapi/swag v0.0.0-20160704191624-1d0bd113de87/go.mod h1:DXUve3Dpr1UfpPtxFw+EFuQ41HhCWZfha5jSVRG7C7I= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-pg/pg/v10 v10.0.0/go.mod h1:XHU1AkQW534GFuUdSiQ46+Xw6Ah+9+b8DlT4YwhiXL8= github.com/go-pg/zerochecker v0.2.0/go.mod h1:NJZ4wKL0NmTtz0GKCoJ8kym6Xn/EQzXRl2OnAe7MmDo= github.com/go-playground/assert/v2 
v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg= github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA= github.com/go-redis/redis/v7 v7.1.0/go.mod h1:JDNMw23GTyLNC4GZu9njt15ctBQVn7xjRfnwdHj/Dcg= github.com/go-redis/redis/v8 v8.0.0/go.mod h1:isLoQT/NFSP7V67lyvM9GmdvLdyZ7pEhsXvvyQtnQTo= github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-test/deep v1.0.2-0.20181118220953-042da051cf31/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA= github.com/go-test/deep v1.0.2/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA= github.com/gobuffalo/attrs v0.0.0-20190224210810-a9411de4debd/go.mod h1:4duuawTqi2wkkpB4ePgWMaai6/Kc6WEz83bhFwpHzj0= github.com/gobuffalo/depgen v0.0.0-20190329151759-d478694a28d3/go.mod h1:3STtPUQYuzV0gBVOY3vy6CfMm/ljR4pABfrTeHNLHUY= github.com/gobuffalo/depgen v0.1.0/go.mod h1:+ifsuy7fhi15RWncXQQKjWS9JPkdah5sZvtHc2RXGlg= github.com/gobuffalo/envy v1.6.15/go.mod h1:n7DRkBerg/aorDM8kbduw5dN3oXGswK5liaSCx4T5NI= github.com/gobuffalo/envy v1.7.0/go.mod h1:n7DRkBerg/aorDM8kbduw5dN3oXGswK5liaSCx4T5NI= github.com/gobuffalo/flect v0.1.0/go.mod h1:d2ehjJqGOH/Kjqcoz+F7jHTBbmDb38yXA598Hb50EGs= github.com/gobuffalo/flect v0.1.1/go.mod h1:8JCgGVbRjJhVgD6399mQr4fx5rRfGKVzFjbj6RE/9UI= github.com/gobuffalo/flect v0.1.3/go.mod h1:8JCgGVbRjJhVgD6399mQr4fx5rRfGKVzFjbj6RE/9UI= github.com/gobuffalo/genny v0.0.0-20190329151137-27723ad26ef9/go.mod h1:rWs4Z12d1Zbf19rlsn0nurr75KqhYp52EAGGxTbBhNk= github.com/gobuffalo/genny v0.0.0-20190403191548-3ca520ef0d9e/go.mod h1:80lIj3kVJWwOrXWWMRzzdhW3DsrdjILVil/SFKBzF28= github.com/gobuffalo/genny v0.1.0/go.mod h1:XidbUqzak3lHdS//TPu2OgiFB+51Ur5f7CSnXZ/JDvo= github.com/gobuffalo/genny v0.1.1/go.mod h1:5TExbEyY48pfunL4QSXxlDOmdsD44RRq4mVZ0Ex28Xk= github.com/gobuffalo/gitgen v0.0.0-20190315122116-cc086187d211/go.mod h1:vEHJk/E9DmhejeLeNt7UVvlSGv3ziL+djtTr3yyzcOw= github.com/gobuffalo/gogen v0.0.0-20190315121717-8f38393713f5/go.mod h1:V9QVDIxsgKNZs6L2IYiGR8datgMhB577vzTDqypH360= github.com/gobuffalo/gogen v0.1.0/go.mod h1:8NTelM5qd8RZ15VjQTFkAW6qOMx5wBbW4dSCS3BY8gg= github.com/gobuffalo/gogen v0.1.1/go.mod h1:y8iBtmHmGc4qa3urIyo1shvOD8JftTtfcKi+71xfDNE= github.com/gobuffalo/logger 
v0.0.0-20190315122211-86e12af44bc2/go.mod h1:QdxcLw541hSGtBnhUc4gaNIXRjiDppFGaDqzbrBd3v8= github.com/gobuffalo/mapi v1.0.1/go.mod h1:4VAGh89y6rVOvm5A8fKFxYG+wIW6LO1FMTG9hnKStFc= github.com/gobuffalo/mapi v1.0.2/go.mod h1:4VAGh89y6rVOvm5A8fKFxYG+wIW6LO1FMTG9hnKStFc= github.com/gobuffalo/packd v0.0.0-20190315124812-a385830c7fc0/go.mod h1:M2Juc+hhDXf/PnmBANFCqx4DM3wRbgDvnVWeG2RIxq4= github.com/gobuffalo/packd v0.1.0/go.mod h1:M2Juc+hhDXf/PnmBANFCqx4DM3wRbgDvnVWeG2RIxq4= github.com/gobuffalo/packr/v2 v2.0.9/go.mod h1:emmyGweYTm6Kdper+iywB6YK5YzuKchGtJQZ0Odn4pQ= github.com/gobuffalo/packr/v2 v2.2.0/go.mod h1:CaAwI0GPIAv+5wKLtv8Afwl+Cm78K/I/VCm/3ptBN+0= github.com/gobuffalo/syncx v0.0.0-20190224160051-33c29581e754/go.mod h1:HhnNqWY95UYwwW3uSASeV7vtgYkT2t16hJgV3AEPUpw= github.com/gocql/gocql v0.0.0-20220224095938-0eacd3183625/go.mod h1:3gM2c4D3AnkISwBxGnMMsS8Oy4y2lhbPRsH4xnJrHG8= github.com/gofiber/fiber/v2 v2.11.0/go.mod h1:oZTLWqYnqpMMuF922SjGbsYZsdpE1MCfh416HNdweIM= github.com/gofrs/uuid v3.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.2.2-0.20190723190241-65acae22fc9d/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe h1:lXe2qZdvpiX5WZkZR4hgp4KJVfY3nMkvmwbVkpv1rVY= github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe/go.mod h1:8vg3r2VgvsThLBIFL93Qb5yWzgyZWhEmBwUJWevAkK0= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.2.4 h1:CNNw5U8lSiiBk7druxtSHHTsRWcxKoac6kZKm2peBBc= github.com/golang/glog v1.2.4/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/gomodule/redigo v1.7.0/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= github.com/google/gofuzz v1.0.0/go.mod 
h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20210423192551-a2663126120b/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEoIEcSTewFxm1c5g8= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.5.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.7.4-0.20190701202633-d83b6ffe499a h1:Rhv8JUcDkZJkUmzzjpysRtn5joJ/3T8Lt9QpdJZUz1c= github.com/gorilla/mux v1.7.4-0.20190701202633-d83b6ffe499a/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/graph-gophers/graphql-go v1.3.0/go.mod h1:9CQHMSxwO4MprSdzoIEobiHpoLtHm77vfxsvsIN5Vuc= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= github.com/hashicorp/consul/api v1.0.0/go.mod h1:mbFwfRxOTDHZpT3iUsMAFcLNoVm6Xbe1xZ6KiSm8FY0= github.com/hashicorp/consul/internal v0.1.0/go.mod h1:zi9bMZYbiPHyAjgBWo7kCUcy5l2NrTdrkVupCc7Oo6c= github.com/hashicorp/errwrap v1.0.0/go.mod 
h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9uLqI8l75knNv3lV1kA55veR+WUPSiKIWcQHudI= github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= github.com/hashicorp/go-hclog v0.12.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-hclog v0.16.2/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-immutable-radix v1.3.1/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-kms-wrapping/entropy v0.1.0/go.mod h1:d1g9WGtAunDNpek8jUIEJnBlbgKS1N2Q61QkHiZyR1g= github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-plugin v1.0.1/go.mod h1:++UyYGoz3o5w9ZzAdZxtQKrWWP+iqPBn3cQptSMzBuY= github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/hashicorp/go-retryablehttp v0.6.6/go.mod h1:vAew36LZh98gCBJNLH42IQ1ER/9wtLZZ8meHqQvEYWY= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= github.com/hashicorp/go-sockaddr v1.0.2/go.mod h1:rB4wwRAUzs07qva3c5SdrY/NEtAUjGlgmH/UkBUC97A= github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-version v1.1.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/memberlist v0.1.6/go.mod 
h1:5VDNHjqFMgEcclnwmkCnC99IPwxBmIsxwY8qn+Nl0H4= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= github.com/hashicorp/serf v0.8.6/go.mod h1:P/AVgr4UHsUYqVHG1y9eFhz8S35pqhGhLZaDpfGKIMo= github.com/hashicorp/vault/api v1.1.0/go.mod h1:R3Umvhlxi2TN7Ex2hzOowyeNb+SfbVWI973N+ctaFMk= github.com/hashicorp/vault/sdk v0.1.14-0.20200519221838-e0cfd64bc267/go.mod h1:WX57W2PwkrOPQ6rVQk+dy5/htHIaB4aBM70EwKThu10= github.com/hashicorp/yamux v0.0.0-20180604194846-3520598351bb/go.mod h1:+NfK9FKeTrX5uv1uIXGdwYDTeHna2qgaIlx54MXqjAM= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/huandu/xstrings v1.3.0 h1:gvV6jG9dTgFEncxo+AF7PH6MZXi/vZl25owA/8Dg8Wo= github.com/huandu/xstrings v1.3.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo= github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= github.com/jackc/pgconn v0.0.0-20190420214824-7e0022ef6ba3/go.mod h1:jkELnwuX+w9qN5YIfX0fl88Ehu4XC3keFuOJJk9pcnA= github.com/jackc/pgconn v0.0.0-20190824142844-760dd75542eb/go.mod h1:lLjNuW/+OfW9/pnVKPazfWOgNfH2aPem8YQ7ilXGvJE= github.com/jackc/pgconn v0.0.0-20190831204454-2fabfa3c18b7/go.mod h1:ZJKsE/KZfsUgOEh9hBm+xYTstcNHg7UPMVJqRfQxq4s= github.com/jackc/pgconn v1.4.0/go.mod h1:Y2O3ZDF0q4mMacyWV3AstPJpeHXWGEetiFttmq5lahk= github.com/jackc/pgconn v1.5.0/go.mod h1:QeD3lBfpTFe8WUnPZWN5KY/mB8FGMIYRdd8P8Jr0fAI= github.com/jackc/pgconn v1.5.1-0.20200601181101-fa742c524853/go.mod h1:QeD3lBfpTFe8WUnPZWN5KY/mB8FGMIYRdd8P8Jr0fAI= github.com/jackc/pgconn v1.6.4/go.mod h1:w2pne1C2tZgP+TvjqLpOigGzNqjBgQW9dUw/4Chex78= github.com/jackc/pgconn v1.8.0/go.mod h1:1C2Pb36bGIP9QHGBYCjnyhqu7Rv3sGshaQUvmfGIB/o= github.com/jackc/pgconn v1.9.0/go.mod h1:YctiPyvzfU11JFxoXokUOOKQXQmDMoJL9vJzHH8/2JY= github.com/jackc/pgconn v1.9.1-0.20210724152538-d89c8390a530/go.mod h1:4z2w8XhRbP1hYxkpTuBjTS3ne3J48K83+u0zoyvg2pI= github.com/jackc/pgconn v1.10.1/go.mod h1:4z2w8XhRbP1hYxkpTuBjTS3ne3J48K83+u0zoyvg2pI= github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8= github.com/jackc/pgmock v0.0.0-20190831213851-13a1b77aafa2/go.mod h1:fGZlG77KXmcq05nJLRkk0+p82V8B8Dw8KN2/V9c/OAE= github.com/jackc/pgmock v0.0.0-20201204152224-4fe30f7445fd/go.mod h1:hrBW0Enj2AZTNpt/7Y5rr2xe/9Mn757Wtb2xeBzPv2c= github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65/go.mod h1:5R2h2EEX+qri8jOWMbJCtaPWkrrNc7OHwsp2TCqp7ak= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgproto3 v1.1.0/go.mod h1:eR5FA3leWg7p9aeAqi37XOTgTIbkABlvcPB3E5rlc78= github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190420180111-c116219b62db/go.mod h1:bhq50y+xrl9n5mRYyCBFKkpRVTLYJVWeCc+mEAI3yXA= github.com/jackc/pgproto3/v2 
v2.0.0-alpha1.0.20190609003834-432c2951c711/go.mod h1:uH0AWtUmuShn0bcesswc4aBTWGvw0cAxIJp+6OB//Wg= github.com/jackc/pgproto3/v2 v2.0.0-rc3/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= github.com/jackc/pgproto3/v2 v2.0.0-rc3.0.20190831210041-4c03ce451f29/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= github.com/jackc/pgproto3/v2 v2.0.1/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= github.com/jackc/pgproto3/v2 v2.0.2/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= github.com/jackc/pgproto3/v2 v2.0.6/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= github.com/jackc/pgproto3/v2 v2.1.1/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= github.com/jackc/pgproto3/v2 v2.2.0/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= github.com/jackc/pgservicefile v0.0.0-20200307190119-3430c5407db8/go.mod h1:vsD4gTJCa9TptPL8sPkXrLZ+hDuNrZCnj29CQpr4X1E= github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b/go.mod h1:vsD4gTJCa9TptPL8sPkXrLZ+hDuNrZCnj29CQpr4X1E= github.com/jackc/pgtype v0.0.0-20190421001408-4ed0de4755e0/go.mod h1:hdSHsc1V01CGwFsrv11mJRHWJ6aifDLfdV3aVjFF0zg= github.com/jackc/pgtype v0.0.0-20190824184912-ab885b375b90/go.mod h1:KcahbBH1nCMSo2DXpzsoWOAfFkdEtEJpPbVLq8eE+mc= github.com/jackc/pgtype v0.0.0-20190828014616-a8802b16cc59/go.mod h1:MWlu30kVJrUS8lot6TQqcg7mtthZ9T0EoIBFiJcmcyw= github.com/jackc/pgtype v1.2.0/go.mod h1:5m2OfMh1wTK7x+Fk952IDmI4nw3nPrvtQdM0ZT4WpC0= github.com/jackc/pgtype v1.3.1-0.20200510190516-8cd94a14c75a/go.mod h1:vaogEUkALtxZMCH411K+tKzNpwzCKU+AnPzBKZ+I+Po= github.com/jackc/pgtype v1.3.1-0.20200606141011-f6355165a91c/go.mod h1:cvk9Bgu/VzJ9/lxTO5R5sf80p0DiucVtN7ZxvaC4GmQ= github.com/jackc/pgtype v1.4.2/go.mod h1:JCULISAZBFGrHaOXIIFiyfzW5VY0GRitRr8NeJsrdig= github.com/jackc/pgtype v1.8.1-0.20210724151600-32e20a603178/go.mod h1:C516IlIV9NKqfsMCXTdChteoXmwgUceqaLfjg2e3NlM= github.com/jackc/pgtype v1.9.0/go.mod h1:LUMuVrfsFfdKGLw+AFFVv6KtHOFMwRgDDzBt76IqCA4= github.com/jackc/pgx/v4 v4.0.0-20190420224344-cc3461e65d96/go.mod h1:mdxmSJJuR08CZQyj1PVQBHy9XOp5p8/SHH6a0psbY9Y= github.com/jackc/pgx/v4 v4.0.0-20190421002000-1b8f0016e912/go.mod h1:no/Y67Jkk/9WuGR0JG/JseM9irFbnEPbuWV2EELPNuM= github.com/jackc/pgx/v4 v4.0.0-pre1.0.20190824185557-6972a5742186/go.mod h1:X+GQnOEnf1dqHGpw7JmHqHc1NxDoalibchSk9/RWuDc= github.com/jackc/pgx/v4 v4.5.0/go.mod h1:EpAKPLdnTorwmPUUsqrPxy5fphV18j9q3wrfRXgo+kA= github.com/jackc/pgx/v4 v4.6.1-0.20200510190926-94ba730bb1e9/go.mod h1:t3/cdRQl6fOLDxqtlyhe9UWgfIi9R8+8v8GKV5TRA/o= github.com/jackc/pgx/v4 v4.6.1-0.20200606145419-4e5062306904/go.mod h1:ZDaNWkt9sW1JMiNn0kdYBaLelIhw7Pg4qd+Vk6tw7Hg= github.com/jackc/pgx/v4 v4.8.1/go.mod h1:4HOLxrl8wToZJReD04/yB20GDwf4KBYETvlHciCnwW0= github.com/jackc/pgx/v4 v4.12.1-0.20210724153913-640aa07df17c/go.mod h1:1QD0+tgSXP7iUjYm9C1NxKhny7lq6ee99u/z+IHFcgs= github.com/jackc/pgx/v4 v4.14.0/go.mod h1:jT3ibf/A0ZVCp89rtCIN0zCJxcE74ypROmHEZYsG/j8= github.com/jackc/puddle v0.0.0-20190413234325-e4ced69a3a2b/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v0.0.0-20190608224051-11cab39313c9/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v1.1.0/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v1.1.1/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v1.1.3/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v1.2.0/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jinzhu/gorm v1.9.1/go.mod 
h1:Vla75njaFJ8clLU1W44h34PjIkijhjHIYnZxMqCdxqo= github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= github.com/jinzhu/now v1.1.1/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/jinzhu/now v1.1.3/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks= github.com/jmoiron/sqlx v1.2.1-0.20190426154859-38398a30ed85 h1:M3C5MxZHP36CMRk0c0XWgtnixXDIEh8RE1cnnjCbjzw= github.com/jmoiron/sqlx v1.2.1-0.20190426154859-38398a30ed85/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks= github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/julienschmidt/httprouter v1.1.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaRPx4tDPEn4= github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.14.2/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= 
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.8/go.mod h1:O1sed60cT9XZ5uDucP5qwvh+TE3NnUj51EiZO/lmSfw= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/labstack/echo v3.3.10+incompatible/go.mod h1:0INS7j/VjnFxD4E2wkz67b8cVwCLbBmJyDaka6Cmk1s= github.com/labstack/echo/v4 v4.2.0/go.mod h1:AA49e0DZ8kk5jTOOCKNuPR6oTnBS0dYiM4FW1e6jwpg= github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k= github.com/labstack/gommon v0.3.1/go.mod h1:uW6kP17uPlLJsD3ijUYn3/M5bAxtlZhMI6m3MFxTMTM= github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.1.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.2 h1:AqzbZs4ZoCBp+GtejcpCpcxM3zlSMx29dXbUSeVtJb8= github.com/lib/pq v1.10.2/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20180730094502-03f2033d19d5/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/markbates/oncer v0.0.0-20181203154359-bf2de49a0be2/go.mod h1:Ld9puTsIW75CHf65OeIOkyKbteujpZVXDpWK6YGZbxE= github.com/markbates/safe v1.0.1/go.mod h1:nAqgmRi7cY2nqMc92/bSEeQA+R4OheNU2T1kNSCBdG0= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.7/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.11/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= 
github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= github.com/mattn/go-sqlite3 v1.14.12 h1:TJ1bhYJPV44phC+IMu1u2K/i5RriLTPe+yc68XDJ1Z0= github.com/mattn/go-sqlite3 v1.14.12/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.25/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/copystructure v1.0.0 h1:Laisrj+bAB6b/yJwB5Bt3ITZhGJdqmxquMKeZ+mmkFQ= github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-testing-interface v0.0.0-20171004221916-a61a99592b77/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.3.2/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/mapstructure v1.4.2 h1:6h7AQ0yhTcIsmFmnAwQls75jp2Gzs4iB8W7pjMO+rqo= github.com/mitchellh/mapstructure v1.4.2/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/reflectwalk v1.0.0 h1:9D+8oIskB4VJBN5SFlmc27fSlIBZaov1Wpk/IfikLNY= github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d h1:VhgPp6v9qf9Agr/56bj7Y/xa04UccTW04VP0Qed4vnQ= github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d/go.mod h1:YUTz3bUH2ZwIWBy3CJBeOBEugqcmXREj14T+iG/4k4U= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/ginkgo v1.14.1/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/ginkgo v1.16.4 h1:29JGrr5oVBm5ulCWet69zQkzWipVXIol6ygQUe/EzNc= github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.10.2/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod 
h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.7.0 h1:7utD74fnzVc/cpcyy8sjrlFr5vYpypUixARcHIMIGuI= github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ= github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= github.com/pierrec/lz4 v0.0.0-20190327172049-315a67e90e41/go.mod h1:3/3N9NVKO0jef7pBehbT1qWhCMrIgbYNnFAZCqQ5LRc= github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/profile v1.2.1/go.mod h1:hJw3o1OdXxsrSjjVksARp5W95eeEaEfptyVZyv6JUPA= github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM= github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod 
h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rs/cors v1.6.1-0.20190613161432-33ffc0734c60 h1:zjQeTJDXNmRPVGSsU1G3VErobzE1BwlmHuBqdyR4JgE= github.com/rs/cors v1.6.1-0.20190613161432-33ffc0734c60/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= github.com/rs/zerolog v1.13.0/go.mod h1:YbFCdg8HfsridGWAh22vktObvhZbQsZXe4/zB0OKkWU= github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThCjNc= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/segmentio/kafka-go v0.4.29/go.mod h1:m1lXeqJtIFYZayv0shM/tjrAFljvWLTprxBHd+3PnaU= github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= github.com/shopspring/decimal v0.0.0-20200227202807-02e2044944cc/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.2.2 h1:5jhuqJyZCZf2JRofRvN/nIFgIWNzPa3/Vz8mYylgbWc= github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag 
v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.4.1-0.20190614151712-3349bd9cc288 h1:qWb7etNPDy3ShqmQ+e8YM+30P6D3/n+QUwrAwxWIfnk= github.com/spf13/viper v1.4.1-0.20190614151712-3349bd9cc288/go.mod h1:LLu5zwCkRPEBY0VPcRMqh58VtcO8Lp1DgqwstU7rYlk= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v0.0.0-20151208002404-e3a8ff8ce365/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.2.0 h1:Slr1R9HxAlEKefgq5jn9U+DnETlIUa6HfgEzj0g5d7s= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/tidwall/btree v0.3.0/go.mod h1:huei1BkDWJ3/sLXmO+bsCNELL+Bp2Kks9OLyQFkzvA8= github.com/tidwall/btree v1.1.0/go.mod h1:TzIRzen6yHbibdSfK6t8QimqbUnoxUSrZfeW7Uob0q4= github.com/tidwall/buntdb v1.2.0/go.mod h1:XLza/dhlwzO6dc5o/KWor4kfZSt3BP8QV+77ZMKfI58= github.com/tidwall/gjson v1.6.7/go.mod h1:zeFuBCIqD4sN/gmqBzZ4j7Jd6UcA2Fc56x7QFsv+8fI= github.com/tidwall/gjson v1.6.8/go.mod h1:zeFuBCIqD4sN/gmqBzZ4j7Jd6UcA2Fc56x7QFsv+8fI= github.com/tidwall/gjson v1.12.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/grect v0.1.0/go.mod h1:sa5O42oP6jWfTShL9ka6Sgmg3TgIK649veZe05B7+J8= github.com/tidwall/grect v0.1.4/go.mod h1:9FBsaYRaR0Tcy4UwefBX/UDcDcDy9V5jUcxHzv2jd5Q= github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.0.0/go.mod 
h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tidwall/pretty v1.0.2/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/rtred v0.1.2/go.mod h1:hd69WNXQ5RP9vHd7dqekAz+RIdtfBogmglkZSRxCHFQ= github.com/tidwall/tinyqueue v0.1.1/go.mod h1:O/QNHwrnjqr6IHItYrzoHAKYhBkLI67Q096fQP5zMYw= github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ= github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc/go.mod h1:bciPuU6GHm1iF1pBvUfxfsH0Wmnc2VbpgvbI9ZWuIRs= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= github.com/twitchtv/twirp v8.1.1+incompatible/go.mod h1:RRJoFSAmTEh2weEqWtpPE3vFK5YBhA6bqp2l1kfCC5A= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.26.0/go.mod h1:cmWIqlu99AO/RKcp1HWaViTqc57FswJOfYYdPJBl8BA= github.com/valyala/fasthttp v1.32.0/go.mod h1:2rsYD01CKFrjjsvFxx75KlEUNpWNBY9JWD3K/7o2Cus= github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc= github.com/vmihailenco/bufpool v0.1.11/go.mod h1:AFf/MOy3l2CFTKbxwt0mp2MwnqjNEs5H/UxrkA5jxTQ= github.com/vmihailenco/msgpack/v4 v4.3.11/go.mod h1:gborTTJjAo/GWTqqRjrLCn9pgNN+NXzzngzBKDPIqw4= github.com/vmihailenco/msgpack/v5 v5.0.0-beta.1/go.mod h1:xlngVLeyQ/Qi05oQxhQ+oTuqa03RjMwMfk/7/TCs+QI= github.com/vmihailenco/msgpack/v5 v5.3.4/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc= github.com/vmihailenco/tagparser v0.1.1/go.mod h1:OeAg3pn3UbLjkWt+rN9oFYB6u/cQgqMEUPoW2WPyhdI= github.com/vmihailenco/tagparser v0.1.2/go.mod h1:OeAg3pn3UbLjkWt+rN9oFYB6u/cQgqMEUPoW2WPyhdI= github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= github.com/xdg-go/scram v1.0.2/go.mod h1:1WAq6h33pAW+iRreB34OORO2Nf7qel3VV3fjBj+hCSs= github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM= github.com/xdg/scram v0.0.0-20180814205039-7eeb5667e42c/go.mod h1:lB8K/P019DLNhemzwFU4jHLhdvlE6uDZjXFejJXr49I= github.com/xdg/stringprep v1.0.0/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f h1:J9EGpcZtP0E/raorCMxlFGSTBrsSlaDGf3jU/qvAE2c= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 
h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v0.0.0-20180618132009-1d523034197f h1:mvXjJIHRZyhNuGassLTcXTwjiWq7NmjdavZsUnmFybQ= github.com/xeipuuv/gojsonschema v0.0.0-20180618132009-1d523034197f/go.mod h1:5yf86TLmAcydyeJq5YvxkGPE2fm/u4myDekKRoLuqhs= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= github.com/zenazn/goji v1.0.1/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.mongodb.org/mongo-driver v1.5.1/go.mod h1:gRXCHX4Jo7J0IJ1oDQyUxF7jfy19UfxniMS4xxMmUqw= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opentelemetry.io/otel v0.11.0/go.mod h1:G8UCk+KooF2HLkgo8RHX9epABH/aRGYET7gQOqBVdB0= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/atomic v1.6.0 h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk= go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.5.0 h1:KCa4XfM8CWFCpxXRGok+Q0SS/0XBhMDbHHGABQLvD2A= go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee h1:0mgffUl7nfd+FpvXMVz4IDEaUSmT1ysygQC7qYo7sG4= go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto 
v0.0.0-20180910181607-0e37d006457b/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190325154230-a5d413f7728c/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190404164418-38d8ce5564a5/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= golang.org/x/crypto v0.0.0-20190411191339-88737f569e3a/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= golang.org/x/crypto v0.0.0-20190418165655-df01cb2cc480/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= golang.org/x/crypto v0.0.0-20190422162423-af44ce270edf/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= golang.org/x/crypto v0.0.0-20190506204251-e1dfcc566284/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190911031432-227b76d455e7/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3H3cr1v9wB50oz8l4C4h62xy7jSTY= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp 
v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/exp v0.0.0-20200901203048-c4f52b2c50aa/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/exp v0.0.0-20200908183739-ae8ad444f925/go.mod h1:1phAWC201xIgDyaFpmDeZkgf70Q4Pd/CNqfRtVPtxNw= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod 
h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20210510120150-4163338589ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211020060615-d418f374d309/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190412183630-56d357773e84/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20170830134202-bb24a47a89ea/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190129075346-302c3dd5f1cc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190209173611-3b5209105503/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190403152447-81d4e9dc473e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190419153524-e8e3143a4f4a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190531175056-4c3a928424d2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190924154521-2837fb4f24fe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200826173525-f9321e4c35a6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210514084401-e8d321eab015/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211103235746-7861aae1554b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text 
v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20200416051211-89c76fbcd5d1/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181011042414-1f849cf54d09/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190329151228-23e29df326fe/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190416151739-9c9e1878f421/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190420181800-aa740d480789/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190425163242-31fd60d6bfdc/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190531172133-b3315ee88b7d/go.mod 
h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190823170909-c4a336ef6a2f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190907020128-2ca718005c18/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools 
v0.0.0-20200527183253-8e7acdbce89d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.25.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod 
h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20200528110217-3d3490e7e671/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= google.golang.org/genproto v0.0.0-20200726014623-da3ae01ef02d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/grpc v1.14.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.22.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod 
h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/DataDog/dd-trace-go.v1 v1.38.0 h1:vm/mYIZCEp5j2MoKPmwM3t6EGthxpvVbCOm2hRl5uDc= gopkg.in/DataDog/dd-trace-go.v1 v1.38.0/go.mod h1:GBhK4yaMJ1h329ivtKAqRNe1EZ944UnZwtz5lh7CnJc= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/inconshreveable/log15.v2 v2.0.0-20180818164646-67afb5ed74ec/go.mod h1:aPpfJ7XW+gOuirDoZ8gHhLh3kZ1B08FtV2bbmy7Jv3s= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/jinzhu/gorm.v1 v1.9.1/go.mod h1:56JJPUzbikvTVnoyP1nppSkbJ2L8sunqTBDY2fDrmFg= gopkg.in/olivere/elastic.v3 v3.0.75/go.mod h1:yDEuSnrM51Pc8dM5ov7U8aI/ToR3PG0llA8aRv2qmw0= gopkg.in/olivere/elastic.v5 v5.0.84/go.mod 
h1:LXF6q9XNBxpMqrcgax95C6xyARXWbbCXUrtTxrNrxJI= gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= gopkg.in/square/go-jose.v2 v2.5.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637 h1:yiW+nvdHb9LVqSHQBXfZCieqV4fzYhNBql77zY0ykqs= gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637/go.mod h1:BHsqpu/nsuzkT5BpiH1EMZPLyqSMM8JbIavyFACoFNk= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/mysql v1.0.1/go.mod h1:KtqSthtg55lFp3S5kUXqlGaelnWpKitn4k1xZTnoiPw= gorm.io/driver/postgres v1.0.0/go.mod h1:wtMFcOzmuA5QigNsgEIb7O5lhvH1tHAF1RbWmLWV4to= gorm.io/driver/sqlserver v1.0.4/go.mod h1:ciEo5btfITTBCj9BkoUVDvgQbUdLWQNqdFY5OGuGnRg= gorm.io/gorm v1.9.19/go.mod h1:0HFTzE/SqkGTzK6TlDPPQbAYCluiVvhzoA1+aVyzenw= gorm.io/gorm v1.20.0/go.mod h1:0HFTzE/SqkGTzK6TlDPPQbAYCluiVvhzoA1+aVyzenw= gorm.io/gorm v1.20.6/go.mod h1:0HFTzE/SqkGTzK6TlDPPQbAYCluiVvhzoA1+aVyzenw= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4 h1:UoveltGrhghAA7ePc+e+QYDHXrBps2PqFZiHkGR/xK8= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= k8s.io/api v0.17.0/go.mod h1:npsyOePkeP0CPwyGfXDHxvypiYMJxBWAMpQxCaJ4ZxI= k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= k8s.io/apimachinery v0.17.0/go.mod h1:b9qmWdKlLuU9EBh+06BtLcSf/Mu89rWL33naRxs1uZg= k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/client-go v0.17.0/go.mod h1:TYgR6EUHs6k45hb6KWjVD6jFZvJV4gHDikv/It0xz+k= k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= k8s.io/client-go 
v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= k8s.io/klog v0.3.0/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20191107075043-30be4d16710a/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/metrics v0.35.0 h1:xVFoqtAGm2dMNJAcB5TFZJPCen0uEqqNt52wW7ABbX8= k8s.io/metrics v0.35.0/go.mod h1:g2Up4dcBygZi2kQSEQVDByFs+VUwepJMzzQLJJLpq4M= k8s.io/utils v0.0.0-20191114184206-e782cd3c129f/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= mellium.im/sasl v0.2.1/go.mod h1:ROaEDLQNuf9vjKqE1SrAfnsobm2YKXT1gnN1uDp1PjQ= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff v0.0.0-20190525122527-15d366b2352e/go.mod h1:wWxsB5ozmmv/SG7nM11ayaAW51xMvak/t1r0CSlcokI= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= ================================================ FILE: log/event.go ================================================ package log import ( "errors" "github.com/stitchfix/flotilla-os/clients/httpclient" "log" "os" "time" ) // // EventSink interface // type EventSink interface { Receive(keyvals ...interface{}) error } // // LocalEventSink - an implementation of EventSink that // simply logs events to os.Stderr. // type LocalEventSink struct { logger *log.Logger } // New Logs local sink. func NewLocalEventSink() *LocalEventSink { logger := log.New(os.Stderr, "[LocalEventSink] ", log.Ldate|log.Ltime|log.Lshortfile) return &LocalEventSink{logger} } // Receive Log events. 
func (localSink *LocalEventSink) Receive(keyvals ...interface{}) error {
    localSink.logger.Printf("%v", keyvals)
    return nil
}

//
// HTTPEventSink pushes arbitrary key-value
// events to an external location
//
type HTTPEventSink struct {
    path   string
    method string
    client httpclient.Client
}

//
// HTTPEvent represents an arbitrary key-value
// event
//
type HTTPEvent struct {
    Timestamp time.Time              `json:"timestamp"`
    Message   map[string]interface{} `json:"message"`
}

//
// NewHTTPSink initializes and returns an HTTPEventSink
//
func NewHTTPSink(host string, path string, method string) HTTPEventSink {
    return HTTPEventSink{
        path, method, httpclient.Client{Host: host},
    }
}

func (httpsink *HTTPEventSink) headers() map[string]string {
    return map[string]string{
        "Content-Type": "application/json",
    }
}

func (httpsink *HTTPEventSink) constructMessage(keyvals ...interface{}) (map[string]interface{}, error) {
    n := (len(keyvals) + 1) / 2
    m := make(map[string]interface{}, n)
    for i := 0; i < len(keyvals); i += 2 {
        k := keyvals[i]
        key, ok := k.(string)
        if !ok {
            return m, errors.New("not all keys are strings")
        }
        var v interface{}
        if i+1 < len(keyvals) {
            v = keyvals[i+1]
        }
        m[key] = v
    }
    return m, nil
}

//
// Receive consumes an arbitrary set of keys and values (k1,v1,k2,v2,...),
// constructs an HTTPEvent from them, and sends it to the configured
// http endpoint using the configured method
//
func (httpsink *HTTPEventSink) Receive(keyvals ...interface{}) error {
    var event HTTPEvent
    m, err := httpsink.constructMessage(keyvals...)
    if err != nil {
        return err
    }
    event.Message = m
    event.Timestamp = time.Now().UTC()

    var response interface{}
    return httpsink.client.Post(
        httpsink.method, httpsink.headers(), &event, &response)
}

================================================
FILE: log/event_test.go
================================================
package log

import (
    "encoding/json"
    "net/http"
    "net/http/httptest"
    "testing"
    "time"
)

type TestDomainSpecificEvent struct {
    Timestamp time.Time
    Message   struct {
        A int `json:"a"`
        B int `json:"b"`
    }
}

func TestHTTPEventSink_Receive(t *testing.T) {
    testServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        content := r.Header.Get("Content-Type")
        if content != "application/json" {
            t.Errorf("Expected Content-Type to eq %s got %s", "application/json", content)
        }

        e := TestDomainSpecificEvent{}
        err := json.NewDecoder(r.Body).Decode(&e)
        if err != nil {
            t.Errorf("Expected body to deserialize properly but got error %s", err.Error())
        }
    }))

    httpSink := NewHTTPSink(testServer.URL, "/", "POST")
    if err := httpSink.Receive("a", 1, "b", 2); err != nil {
        t.Errorf("Expected valid event to send without error, got %s", err.Error())
    }

    err := httpSink.Receive(1, "noway")
    if err == nil {
        t.Errorf("Expected message construction to fail with non-string keys")
    }
}
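EventSink is deliberately narrow, so alternative destinations can be added without touching the Logger or its callers. A minimal sketch of a custom sink wired through NewLogger from log/logger.go below (the memorySink type is illustrative, not part of this repository):

```go
package main

import (
    "fmt"

    gklog "github.com/go-kit/kit/log"
    flotillaLog "github.com/stitchfix/flotilla-os/log"
)

// memorySink is an illustrative EventSink that buffers events in memory,
// e.g. for tests or for batching before shipping them elsewhere.
type memorySink struct {
    events [][]interface{}
}

func (m *memorySink) Receive(keyvals ...interface{}) error {
    m.events = append(m.events, keyvals)
    return nil
}

func main() {
    sink := &memorySink{}
    logger := flotillaLog.NewLogger(gklog.NewNopLogger(), []flotillaLog.EventSink{sink})
    _ = logger.Event("run_id", "abc123", "status", "QUEUED")
    fmt.Printf("captured %d event(s)\n", len(sink.events)) // 1
}
```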
================================================
FILE: log/logger.go
================================================
package log

import "github.com/go-kit/kit/log"

//
// Logger interface, supports log messages and "events"
// where an event is an object that should get received
// by the configured EventSinks
//
type Logger interface {
    Log(keyvals ...interface{}) error
    Event(keyvals ...interface{}) error
}

type logger struct {
    wrapped log.Logger
    sinks   []EventSink
}

//
// NewLogger sets up and returns a Logger
//
func NewLogger(wrapped log.Logger, sinks []EventSink) Logger {
    return &logger{wrapped, sinks}
}

func (l *logger) Log(keyvals ...interface{}) error {
    return l.wrapped.Log(keyvals...)
}

//
// Event iterates through the configured EventSinks and
// sends the event to each one; every sink is tried, and the
// first sink error (if any) is returned
//
func (l *logger) Event(keyvals ...interface{}) error {
    var firstErr error
    for _, sink := range l.sinks {
        if err := sink.Receive(keyvals...); err != nil {
            _ = l.Log("level", "error", "message", "error sending event", "sink", sink, "error", err)
            if firstErr == nil {
                firstErr = err
            }
        }
    }
    return firstErr
}

================================================
FILE: log/logger_test.go
================================================
package log

import (
    "testing"
)

type testLogger struct {
    keyvals []interface{}
}

func (tl *testLogger) Log(keyvals ...interface{}) error {
    tl.keyvals = keyvals
    return nil
}

type testSink struct {
    keyvals []interface{}
}

func (ts *testSink) Receive(keyvals ...interface{}) error {
    ts.keyvals = keyvals
    return nil
}

func TestLogger_Log(t *testing.T) {
    tl := &testLogger{}
    l := NewLogger(tl, nil)

    // Verify that the wrapped logger's Log method gets called
    l.Log("message", "value")
    if len(tl.keyvals) != 2 {
        t.Errorf("Expected log message with 2 values, got %v", len(tl.keyvals))
    }

    m1 := tl.keyvals[0]
    m2 := tl.keyvals[1]
    if m1.(string) != "message" || m2.(string) != "value" {
        t.Errorf("Expected [message, value] but got %s", tl.keyvals)
    }
}

func TestLogger_Event(t *testing.T) {
    ts := &testSink{}
    tl := &testLogger{}
    l := NewLogger(tl, []EventSink{ts})

    // Verify that the configured sink receives the event
    l.Event("important_event", "act_on_me")
    if len(ts.keyvals) != 2 {
        t.Errorf("Expected to receive event with 2 values, got %v", len(ts.keyvals))
    }

    m1 := ts.keyvals[0]
    m2 := ts.keyvals[1]
    if m1.(string) != "important_event" || m2.(string) != "act_on_me" {
        t.Errorf("Expected [important_event, act_on_me] but got %s", ts.keyvals)
    }
}
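Wiring the HTTPEventSink from log/event.go follows the same pattern; note that Receive is defined on the pointer receiver, so the value returned by NewHTTPSink must be taken by address to satisfy EventSink. A sketch (the host and path are placeholder values, not anything the repository configures):

```go
package main

import (
    "os"

    gklog "github.com/go-kit/kit/log"
    flotillaLog "github.com/stitchfix/flotilla-os/log"
)

func main() {
    // Base logger matching the JSON setup used by the server's main.go.
    base := gklog.NewJSONLogger(gklog.NewSyncWriter(os.Stderr))

    // NewHTTPSink returns a value; Receive has a pointer receiver, so the
    // address is what implements the EventSink interface.
    httpSink := flotillaLog.NewHTTPSink("http://events.example.com", "/events", "POST")

    logger := flotillaLog.NewLogger(base, []flotillaLog.EventSink{&httpSink})
    _ = logger.Event("definition_id", "def-123", "action", "created")
}
```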
================================================
FILE: main.go
================================================
package main

import (
    "fmt"
    gklog "github.com/go-kit/kit/log"
    "github.com/pkg/errors"
    "github.com/stitchfix/flotilla-os/clients/cluster"
    "github.com/stitchfix/flotilla-os/clients/logs"
    "github.com/stitchfix/flotilla-os/clients/metrics"
    "github.com/stitchfix/flotilla-os/clients/middleware"
    "github.com/stitchfix/flotilla-os/config"
    "github.com/stitchfix/flotilla-os/execution/engine"
    "github.com/stitchfix/flotilla-os/flotilla"
    flotillaLog "github.com/stitchfix/flotilla-os/log"
    "github.com/stitchfix/flotilla-os/queue"
    "github.com/stitchfix/flotilla-os/state"
    "gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer"
    "log"
    "os"
)

func main() {
    tracer.Start()
    defer tracer.Stop()
    args := os.Args
    if len(args) < 2 {
        fmt.Println("Usage: flotilla-os <conf_dir>")
        os.Exit(1)
    }

    //
    // Use go-kit for structured logging (JSON format for DataDog compatibility)
    //
    l := gklog.NewJSONLogger(gklog.NewSyncWriter(os.Stderr))
    l = gklog.With(l, "ts", gklog.DefaultTimestampUTC)
    eventSinks := []flotillaLog.EventSink{flotillaLog.NewLocalEventSink()}
    logger := flotillaLog.NewLogger(l, eventSinks)

    //
    // Wrap viper for configuration
    //
    confDir := args[1]
    c, err := config.NewConfig(&confDir)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize config"))
        os.Exit(1)
    }

    //
    // Instantiate metrics client.
    //
    if err = metrics.InstantiateClient(c); err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize metrics client"))
        os.Exit(1)
    }

    //
    // Get state manager for reading and writing
    // state about definitions and runs
    //
    stateManager, err := state.NewStateManager(c, logger)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize state manager"))
        os.Exit(1)
    }

    //
    // Get cluster client for validating definitions
    // against execution clusters
    //
    eksClusterClient, err := cluster.NewClusterClient(c, state.EKSEngine)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize EKS cluster client"))
        //TODO
        //os.Exit(1)
    }

    eksLogsClient, err := logs.NewLogsClient(c, logger, state.EKSEngine)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize EKS logs client"))
        //TODO
        //os.Exit(1)
    }

    //
    // Get queue manager for queuing runs
    //
    eksQueueManager, err := queue.NewQueueManager(c, state.EKSEngine)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize eks queue manager"))
        os.Exit(1)
    }
    emrQueueManager, err := queue.NewQueueManager(c, state.EKSSparkEngine)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize emr queue manager"))
        os.Exit(1)
    }

    clusterManager, err := engine.NewDynamicClusterManager(
        c.GetString("aws_default_region"), logger, stateManager,
    )
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize dynamic cluster manager"))
        os.Exit(1)
    }

    //
    // Get execution engine for interacting with backend
    // execution management framework (eg. EKS)
    //
    eksExecutionEngine, err := engine.NewExecutionEngine(c, eksQueueManager, state.EKSEngine, logger, clusterManager, stateManager)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize EKS execution engine"))
        os.Exit(1)
    }
    emrExecutionEngine, err := engine.NewExecutionEngine(c, eksQueueManager, state.EKSSparkEngine, logger, clusterManager, stateManager)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize EMR execution engine"))
        os.Exit(1)
    }

    middlewareClient, err := middleware.NewClient()
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize middleware client"))
        os.Exit(1)
    }

    app, err := flotilla.NewApp(c, logger, eksLogsClient, eksExecutionEngine, stateManager, eksClusterClient,
        eksQueueManager, emrExecutionEngine, emrQueueManager, middlewareClient, clusterManager)
    if err != nil {
        fmt.Printf("%+v\n", errors.Wrap(err, "unable to initialize app"))
        os.Exit(1)
    }

    log.Fatal(app.Run())
}
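The bootstrap above can be pared down when only one subsystem is needed. A hedged sketch that stands up just the config and an SQS-backed queue manager (it assumes AWS credentials and a non-test flotilla_mode, since test mode leaves the SQS client unset; the queue name is illustrative):

```go
package main

import (
    "context"
    "fmt"

    "github.com/stitchfix/flotilla-os/config"
    "github.com/stitchfix/flotilla-os/queue"
    "github.com/stitchfix/flotilla-os/state"
)

func main() {
    confDir := "./conf"
    c, err := config.NewConfig(&confDir)
    if err != nil {
        panic(err)
    }

    // NewQueueManager initializes an SQSManager for the EKS engine using the
    // aws_default_region and queue_namespace values from config.
    qm, err := queue.NewQueueManager(c, state.EKSEngine)
    if err != nil {
        panic(err)
    }

    // QurlFor resolves (and lazily creates) the namespaced queue; the name
    // "example-runs" is illustrative.
    qURL, err := qm.QurlFor("example-runs", true)
    if err != nil {
        panic(err)
    }
    if err := qm.Enqueue(context.Background(), qURL, state.Run{RunID: "run-123"}); err != nil {
        panic(err)
    }
    fmt.Println("enqueued to", qURL)
}
```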
================================================
FILE: queue/manager.go
================================================
package queue

import (
    "context"
    "fmt"
    "github.com/pkg/errors"
    "github.com/stitchfix/flotilla-os/config"
    "github.com/stitchfix/flotilla-os/state"
)

// Manager wraps operations on a queue
type Manager interface {
    Name() string
    QurlFor(name string, prefixed bool) (string, error)
    Initialize(config.Config, string) error
    Enqueue(ctx context.Context, qURL string, run state.Run) error
    ReceiveRun(ctx context.Context, qURL string) (RunReceipt, error)
    ReceiveStatus(qURL string) (StatusReceipt, error)
    ReceiveCloudTrail(qURL string) (state.CloudTrailS3File, error)
    ReceiveKubernetesEvent(qURL string) (state.KubernetesEvent, error)
    ReceiveEMREvent(qURL string) (state.EmrEvent, error)
    ReceiveKubernetesRun(queue string) (string, error)
    List() ([]string, error)
}

// RunReceipt wraps a Run and a callback to use
// when Run is finished processing
type RunReceipt struct {
    Run              *state.Run
    Done             func() error
    TraceID          uint64
    ParentID         uint64
    SamplingPriority int
}

// StatusReceipt wraps a StatusUpdate and a callback to use
// when StatusUpdate is finished applying
type StatusReceipt struct {
    StatusUpdate *string
    Done         func() error
}

// NewQueueManager returns the Manager for the named execution engine
func NewQueueManager(conf config.Config, name string) (Manager, error) {
    switch name {
    case state.EKSEngine:
        sqsEKS := &SQSManager{}
        if err := sqsEKS.Initialize(conf, state.EKSEngine); err != nil {
            return nil, errors.Wrap(err, "problem initializing SQSManager")
        }
        return sqsEKS, nil
    case state.EKSSparkEngine:
        sqsEKSSpark := &SQSManager{}
        if err := sqsEKSSpark.Initialize(conf, state.EKSSparkEngine); err != nil {
            return nil, errors.Wrap(err, "problem initializing SQSManager")
        }
        return sqsEKSSpark, nil
    default:
        return nil, fmt.Errorf("no QueueManager named [%s] was found", name)
    }
}
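The Done callback is the acknowledgement half of the receipt contract: a consumer deletes the message only after the run is fully handled, so a crash mid-processing results in redelivery once the SQS visibility timeout lapses. A sketch of a consumer loop honoring that contract (process and the backoff-free loop are illustrative, not code from this repository):

```go
package worker

import (
    "context"
    "log"

    "github.com/stitchfix/flotilla-os/queue"
    "github.com/stitchfix/flotilla-os/state"
)

// process stands in for real run handling (submission, status updates, ...).
func process(r state.Run) {
    log.Println("handling run", r.RunID)
}

// drain polls a run queue and calls Done only after a run is fully handled;
// if the consumer crashes before Done, SQS redelivers the message once the
// visibility timeout lapses. A production loop would back off on empty polls.
func drain(ctx context.Context, qm queue.Manager, qURL string) error {
    for {
        receipt, err := qm.ReceiveRun(ctx, qURL)
        if err != nil {
            return err
        }
        if receipt.Run == nil {
            continue // empty poll
        }
        process(*receipt.Run)
        if err := receipt.Done(); err != nil {
            return err
        }
    }
}
```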
================================================
FILE: queue/sqs_manager.go
================================================
package queue

import (
    "context"
    "encoding/json"
    "fmt"
    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/sqs"
    "github.com/pkg/errors"
    "github.com/stitchfix/flotilla-os/config"
    "github.com/stitchfix/flotilla-os/state"
    "github.com/stitchfix/flotilla-os/utils"
    awstrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/aws/aws-sdk-go/aws"
    "strconv"
)

// SQSManager - queue manager implementation for sqs
type SQSManager struct {
    namespace         string
    retentionSeconds  string
    visibilityTimeout string
    qc                sqsClient
    qurlCache         map[string]string
}

type sqsClient interface {
    GetQueueUrl(input *sqs.GetQueueUrlInput) (*sqs.GetQueueUrlOutput, error)
    CreateQueue(input *sqs.CreateQueueInput) (*sqs.CreateQueueOutput, error)
    ListQueues(input *sqs.ListQueuesInput) (*sqs.ListQueuesOutput, error)
    SendMessage(input *sqs.SendMessageInput) (*sqs.SendMessageOutput, error)
    ReceiveMessage(input *sqs.ReceiveMessageInput) (*sqs.ReceiveMessageOutput, error)
    DeleteMessage(input *sqs.DeleteMessageInput) (*sqs.DeleteMessageOutput, error)
}

// Name of queue manager - matches value in configuration
func (qm *SQSManager) Name() string {
    return "sqs"
}

// Initialize new sqs queue manager
func (qm *SQSManager) Initialize(conf config.Config, engine string) error {
    if !conf.IsSet("aws_default_region") {
        return errors.Errorf("SQSManager needs [aws_default_region] set in config")
    }

    qm.retentionSeconds = "604800"
    if conf.IsSet("queue_retention_seconds") {
        qm.retentionSeconds = conf.GetString("queue_retention_seconds")
    }

    qm.visibilityTimeout = "45"
    if conf.IsSet("queue_process_time") {
        qm.visibilityTimeout = conf.GetString("queue_process_time")
    }

    if !conf.IsSet("queue_namespace") {
        return errors.Errorf("SQSManager needs [queue_namespace] set in config")
    }
    qm.namespace = conf.GetString("queue_namespace")

    flotillaMode := conf.GetString("flotilla_mode")
    if flotillaMode != "test" {
        sess := awstrace.WrapSession(session.Must(session.NewSession(&aws.Config{
            Region: aws.String(conf.GetString("aws_default_region"))})))
        qm.qc = sqs.New(sess)
    }
    qm.qurlCache = make(map[string]string)
    return nil
}

// QurlFor returns the queue url that corresponds to the given name
// * if the queue does not exist it is created
func (qm *SQSManager) QurlFor(name string, prefixed bool) (string, error) {
    key := fmt.Sprintf("%s-%t", name, prefixed)
    val, ok := qm.qurlCache[key]
    if ok {
        return val, nil
    }
    val, err := qm.getOrCreateQueue(name, prefixed)
    if err == nil {
        qm.qurlCache[key] = val
    }
    return val, err
}

func (qm *SQSManager) getOrCreateQueue(name string, prefixed bool) (string, error) {
    qname := name
    if prefixed {
        qname = fmt.Sprintf("%s-%s", qm.namespace, name)
    }
    res, err := qm.qc.GetQueueUrl(&sqs.GetQueueUrlInput{
        QueueName: &qname,
    })
    if err != nil || res.QueueUrl == nil {
        cqi := sqs.CreateQueueInput{
            Attributes: map[string]*string{
                "MessageRetentionPeriod": &qm.retentionSeconds,
                "VisibilityTimeout":      &qm.visibilityTimeout,
            },
            QueueName: &qname,
        }
        createQueueResponse, err := qm.qc.CreateQueue(&cqi)
        if err != nil {
            return "", errors.Wrapf(err, "problem trying to create sqs queue with name [%s]", qname)
        }
        return *createQueueResponse.QueueUrl, nil
    }
    return *res.QueueUrl, nil
}

func (qm *SQSManager) messageFromRun(run state.Run) (*string, error) {
    jsonized, err := json.Marshal(run)
    if err != nil {
        return nil, errors.Wrapf(err, "problem trying to serialize run with id [%s] as json", run.RunID)
    }
    asString := string(jsonized)
    return &asString, nil
}

func (qm *SQSManager) runFromMessage(message *sqs.Message) (state.Run, error) {
    var run state.Run
    if message == nil {
        return run, errors.Errorf("can't generate Run from nil message")
    }
    body := message.Body
    if body == nil {
        return run, errors.Errorf("can't generate Run from empty message")
    }
    if err := json.Unmarshal([]byte(*body), &run); err != nil {
        return run, errors.Wrapf(err, "problem trying to deserialize run from json [%s]", *body)
    }
    return run, nil
}

func (qm *SQSManager) statusFromMessage(message *sqs.Message) (string, error) {
    var statusUpdate string
    if message == nil {
        return statusUpdate, errors.Errorf("can't generate StatusUpdate from nil message")
    }
    body := message.Body
    if body == nil {
        return statusUpdate, errors.Errorf("can't generate StatusUpdate from empty message")
    }
    return *body, nil
}

// Enqueue queues run
func (qm *SQSManager) Enqueue(ctx context.Context, qURL string, run state.Run) error {
    if len(qURL) == 0 {
        return errors.Errorf("no queue url specified, can't enqueue")
    }

    ctx, span := utils.TraceJob(ctx, "flotilla.queue.sqs_enqueue", "")
    defer span.Finish()
    span.SetTag("job.run_id", run.RunID)
    span.SetTag("queue.url", qURL)

    message, err := qm.messageFromRun(run)
    if err != nil {
        span.SetTag("error", true)
        span.SetTag("error.msg", err.Error())
        return errors.WithStack(err)
    }

    sme := sqs.SendMessageInput{
        QueueUrl:    &qURL,
        MessageBody: message,
        MessageAttributes: map[string]*sqs.MessageAttributeValue{
            "dd-trace-id": {
                DataType:    aws.String("String"),
                StringValue: aws.String(fmt.Sprintf("%d", span.Context().TraceID())),
            },
            "dd-parent-id": {
                DataType:    aws.String("String"),
                StringValue: aws.String(fmt.Sprintf("%d", span.Context().SpanID())),
            },
            "dd-sampling-priority": {
                DataType:    aws.String("String"),
                StringValue: aws.String("1"),
            },
        },
    }
    _, err = qm.qc.SendMessage(&sme)
    if err != nil {
        span.SetTag("error", true)
        span.SetTag("error.msg", err.Error())
        return errors.Wrap(err, "problem sending sqs message")
    }
    return nil
}

// ReceiveRun receives a new run to operate on
func (qm *SQSManager) ReceiveRun(ctx context.Context, qURL string) (RunReceipt, error) {
    var receipt RunReceipt
    ctx, span := utils.TraceJob(ctx, "flotilla.queue.sqs_receive", "")
    defer span.Finish()
    span.SetTag("queue.url", qURL)
    if len(qURL) == 0 {
        return receipt, errors.Errorf("no queue url specified, can't dequeue")
    }

    maxMessages := int64(1)
    visibilityTimeout := int64(45)
    rmi := sqs.ReceiveMessageInput{
        QueueUrl:            &qURL,
        MaxNumberOfMessages: &maxMessages,
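        // This per-receive visibility timeout mirrors the queue-level default set
        // in Initialize: the message stays hidden while it is processed and
        // reappears for redelivery if the receipt's Done() is never called.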
VisibilityTimeout: &visibilityTimeout, MessageAttributeNames: []*string{ aws.String("dd-trace-id"), aws.String("dd-parent-id"), aws.String("dd-sampling-priority"), aws.String("All"), }, } var err error response, err := qm.qc.ReceiveMessage(&rmi) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return receipt, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL) } if len(response.Messages) == 0 { return receipt, nil } run, err := qm.runFromMessage(response.Messages[0]) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return receipt, errors.WithStack(err) } var traceID, parentID uint64 var samplingPriority int if attr, exists := response.Messages[0].MessageAttributes["dd-trace-id"]; exists && attr.StringValue != nil { traceID, _ = strconv.ParseUint(*attr.StringValue, 10, 64) } if attr, exists := response.Messages[0].MessageAttributes["dd-parent-id"]; exists && attr.StringValue != nil { parentID, _ = strconv.ParseUint(*attr.StringValue, 10, 64) } if attr, exists := response.Messages[0].MessageAttributes["dd-sampling-priority"]; exists && attr.StringValue != nil { sp, _ := strconv.Atoi(*attr.StringValue) samplingPriority = sp } receipt.Run = &run receipt.Done = func() error { return qm.ack(qURL, response.Messages[0].ReceiptHandle) } receipt.TraceID = traceID receipt.ParentID = parentID receipt.SamplingPriority = samplingPriority return receipt, nil } func (qm *SQSManager) ReceiveStatus(qURL string) (StatusReceipt, error) { var receipt StatusReceipt if len(qURL) == 0 { return receipt, errors.Errorf("no queue url specified, can't dequeue") } maxMessages := int64(1) visibilityTimeout := int64(45) rmi := sqs.ReceiveMessageInput{ QueueUrl: &qURL, MaxNumberOfMessages: &maxMessages, VisibilityTimeout: &visibilityTimeout, } var err error response, err := qm.qc.ReceiveMessage(&rmi) if err != nil { return receipt, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL) } if len(response.Messages) == 0 { return receipt, nil } statusUpdate, err := qm.statusFromMessage(response.Messages[0]) if err != nil { return receipt, errors.WithStack(err) } receipt.StatusUpdate = &statusUpdate receipt.Done = func() error { return qm.ack(qURL, response.Messages[0].ReceiptHandle) } return receipt, nil } func (qm *SQSManager) ReceiveCloudTrail(qURL string) (state.CloudTrailS3File, error) { var receipt state.CloudTrailS3File if len(qURL) == 0 { return receipt, errors.Errorf("no queue url specified, can't dequeue") } maxMessages := int64(1) visibilityTimeout := int64(45) rmi := sqs.ReceiveMessageInput{ QueueUrl: &qURL, MaxNumberOfMessages: &maxMessages, VisibilityTimeout: &visibilityTimeout, } var err error response, err := qm.qc.ReceiveMessage(&rmi) if err != nil { return receipt, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL) } if response != nil && response.Messages != nil && len(response.Messages) > 0 && response.Messages[0].Body != nil { body := response.Messages[0].Body err = json.Unmarshal([]byte(*body), &receipt) _ = qm.ack(qURL, response.Messages[0].ReceiptHandle) } return receipt, nil } func (qm *SQSManager) ReceiveEMREvent(qURL string) (state.EmrEvent, error) { var emrEvent state.EmrEvent if len(qURL) == 0 { return emrEvent, errors.Errorf("no queue url specified, can't dequeue") } maxMessages := int64(1) visibilityTimeout := int64(45) rmi := sqs.ReceiveMessageInput{ QueueUrl: &qURL, MaxNumberOfMessages: &maxMessages, VisibilityTimeout: &visibilityTimeout, } var err error 
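	// Unlike ReceiveCloudTrail above, which deletes the message immediately,
	// EMR event messages are acknowledged via the Done callback attached below,
	// so the caller controls when the message is removed from the queue.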
    response, err := qm.qc.ReceiveMessage(&rmi)
    if err != nil {
        return emrEvent, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL)
    }
    if response != nil && response.Messages != nil && len(response.Messages) > 0 && response.Messages[0].Body != nil {
        body := response.Messages[0].Body
        err = json.Unmarshal([]byte(*body), &emrEvent)
        emrEvent.Done = func() error {
            return qm.ack(qURL, response.Messages[0].ReceiptHandle)
        }
    }
    return emrEvent, nil
}

func (qm *SQSManager) ReceiveKubernetesEvent(qURL string) (state.KubernetesEvent, error) {
    var kubernetesEvent state.KubernetesEvent
    if len(qURL) == 0 {
        return kubernetesEvent, errors.Errorf("no queue url specified, can't dequeue")
    }
    maxMessages := int64(1)
    visibilityTimeout := int64(45)
    rmi := sqs.ReceiveMessageInput{
        QueueUrl:            &qURL,
        MaxNumberOfMessages: &maxMessages,
        VisibilityTimeout:   &visibilityTimeout,
    }
    var err error
    response, err := qm.qc.ReceiveMessage(&rmi)
    if err != nil {
        return kubernetesEvent, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL)
    }
    if response != nil && response.Messages != nil && len(response.Messages) > 0 && response.Messages[0].Body != nil {
        body := response.Messages[0].Body
        err = json.Unmarshal([]byte(*body), &kubernetesEvent)
        kubernetesEvent.Done = func() error {
            return qm.ack(qURL, response.Messages[0].ReceiptHandle)
        }
    }
    return kubernetesEvent, nil
}

func (qm *SQSManager) ReceiveKubernetesRun(queue string) (string, error) {
    var runId string
    qURL, err := qm.QurlFor(queue, false)
    if len(qURL) == 0 || err != nil {
        return runId, errors.Errorf("no queue url specified, can't dequeue")
    }
    maxMessages := int64(1)
    visibilityTimeout := int64(45)
    rmi := sqs.ReceiveMessageInput{
        QueueUrl:            &qURL,
        MaxNumberOfMessages: &maxMessages,
        VisibilityTimeout:   &visibilityTimeout,
    }
    response, err := qm.qc.ReceiveMessage(&rmi)
    if err != nil {
        return runId, errors.Wrapf(err, "problem receiving sqs message from queue url [%s]", qURL)
    }
    if response != nil && response.Messages != nil && len(response.Messages) > 0 && response.Messages[0].Body != nil {
        _ = qm.ack(qURL, response.Messages[0].ReceiptHandle)
        return *response.Messages[0].Body, nil
    }
    // errors.Wrapf(nil, ...) returns nil, which silently reported success here;
    // construct the error directly so an empty poll is visible to the caller.
    return runId, errors.Errorf("no message")
}

// ack acknowledges the receipt -AND- processing of the
// message referred to by handle
func (qm *SQSManager) ack(qURL string, handle *string) error {
    if handle == nil {
        return errors.Errorf("cannot acknowledge message with nil receipt")
    }
    if len(*handle) == 0 {
        return errors.Errorf("cannot acknowledge message with empty receipt")
    }
    dmi := sqs.DeleteMessageInput{
        QueueUrl:      &qURL,
        ReceiptHandle: handle,
    }
    if _, err := qm.qc.DeleteMessage(&dmi); err != nil {
        return errors.Wrapf(
            err, "problem deleting sqs message with handle [%s] from queue url [%s]", *handle, qURL)
    }
    return nil
}

// List lists all the queue URLs available
func (qm *SQSManager) List() ([]string, error) {
    response, err := qm.qc.ListQueues(
        &sqs.ListQueuesInput{QueueNamePrefix: &qm.namespace})
    if err != nil {
        return nil, errors.Wrap(err, "problem listing sqs queues")
    }
    listed := make([]string, len(response.QueueUrls))
    for i, qurl := range response.QueueUrls {
        listed[i] = *qurl
    }
    return listed, nil
}

================================================
FILE: queue/sqs_manager_test.go
================================================
package queue

import (
    "context"
    "encoding/json"
    "errors"
    "github.com/aws/aws-sdk-go/service/sqs"
    "github.com/stitchfix/flotilla-os/config"
    "github.com/stitchfix/flotilla-os/state"
    "testing"
)

type testSQSClient struct { t
*testing.T queues []*string calls []string } func (qc *testSQSClient) GetQueueUrl(input *sqs.GetQueueUrlInput) (*sqs.GetQueueUrlOutput, error) { qc.calls = append(qc.calls, "GetQueueUrl") if input.QueueName == nil || len(*input.QueueName) == 0 { qc.t.Errorf("Expected non-nil and non empty QueueName") } if *input.QueueName == "qtest-nope" { return nil, errors.New("No queue here") } qurl := "cupcake" return &sqs.GetQueueUrlOutput{QueueUrl: &qurl}, nil } func (qc *testSQSClient) CreateQueue(input *sqs.CreateQueueInput) (*sqs.CreateQueueOutput, error) { qc.calls = append(qc.calls, "CreateQueue") if input.QueueName == nil || len(*input.QueueName) == 0 { qc.t.Errorf("Expected non-nil and non empty QueueName") } if _, ok := input.Attributes["MessageRetentionPeriod"]; !ok { qc.t.Errorf("Expected MessageRetentionPeriod in attributes") } if _, ok := input.Attributes["VisibilityTimeout"]; !ok { qc.t.Errorf("Expected VisibilityTimeout in attributes") } qurl := "nope" return &sqs.CreateQueueOutput{QueueUrl: &qurl}, nil } func (qc *testSQSClient) ListQueues(input *sqs.ListQueuesInput) (*sqs.ListQueuesOutput, error) { qc.calls = append(qc.calls, "ListQueues") if input.QueueNamePrefix == nil { qc.t.Errorf("Expected non-nil QueueNamePrefix") } if len(*input.QueueNamePrefix) == 0 { qc.t.Errorf("Expected non-empty QueueNamePrefix") } response := sqs.ListQueuesOutput{QueueUrls: qc.queues} return &response, nil } func (qc *testSQSClient) SendMessage(input *sqs.SendMessageInput) (*sqs.SendMessageOutput, error) { qc.calls = append(qc.calls, "SendMessage") if input.QueueUrl == nil { qc.t.Errorf("Expected non-nil QueueUrl") } if len(*input.QueueUrl) == 0 { qc.t.Errorf("Expected non-empty QueueUrl") } body := input.MessageBody if body == nil { qc.t.Errorf("Expected non-nil MessageBody") } var run state.Run var smo sqs.SendMessageOutput err := json.Unmarshal([]byte(*body), &run) if err != nil { qc.t.Errorf("Error deserializing MessageBody to Run, [%v]", err) } if len(run.RunID) == 0 { qc.t.Errorf("RunID of deserialized Run should not be empty") } return &smo, nil } func (qc *testSQSClient) ReceiveMessage(input *sqs.ReceiveMessageInput) (*sqs.ReceiveMessageOutput, error) { qc.calls = append(qc.calls, "ReceiveMessage") if input.VisibilityTimeout == nil { qc.t.Errorf("Expected non-nil VisibilityTimeout") } if input.MaxNumberOfMessages == nil { qc.t.Errorf("Expected non-nil MaxNumberOfMessages") } if *input.MaxNumberOfMessages != 1 { qc.t.Errorf("Expected MaxNumberOfMessages to be 1, was %v", *input.MaxNumberOfMessages) } if input.QueueUrl == nil { qc.t.Errorf("Expected non-nil QueueUrl") } if len(*input.QueueUrl) == 0 { qc.t.Errorf("Expected non-empty QueueUrl") } handle := "handle" asString := "" if *input.QueueUrl == "statusQ" { asString = `{"detail":{"taskArn":"sometaskarn","lastStatus":"STOPPED","version":17, "overrides":{"containerOverrides":[{"environment":[{"name":"FLOTILLA_SERVER_MODE","value":"prod"}]}]}}}` } else { jsonRun, _ := json.Marshal(state.Run{RunID: "cupcake"}) asString = string(jsonRun) } msg := sqs.Message{ ReceiptHandle: &handle, Body: &asString, } rmo := sqs.ReceiveMessageOutput{ Messages: []*sqs.Message{&msg}, } return &rmo, nil } func (qc *testSQSClient) DeleteMessage(input *sqs.DeleteMessageInput) (*sqs.DeleteMessageOutput, error) { qc.calls = append(qc.calls, "DeleteMessage") if input.QueueUrl == nil { qc.t.Errorf("Expected non-nil QueueUrl") } if len(*input.QueueUrl) == 0 { qc.t.Errorf("Expected non-empty QueueUrl") } if input.ReceiptHandle == nil { qc.t.Errorf("Expected non-nil 
ReceiptHandle") } if len(*input.ReceiptHandle) == 0 { qc.t.Errorf("Expected non-empty ReceiptHandle") } return &sqs.DeleteMessageOutput{}, nil } func setUp(t *testing.T) SQSManager { confDir := "../conf" c, _ := config.NewConfig(&confDir) qm := SQSManager{} qm.Initialize(c, state.EKSEngine) qm.namespace = "qtest" qA := "A" qB := "B" qC := "C" qStatus := "statusQ" testClient := testSQSClient{ t: t, queues: []*string{&qA, &qB, &qC, &qStatus}, } qm.qc = &testClient return qm } func TestSQSManager_List(t *testing.T) { qm := setUp(t) listed, _ := qm.List() if len(listed) != 4 { t.Errorf("Expected listed queues to be [4] but was %v", len(listed)) } } func TestSQSManager_Enqueue(t *testing.T) { qm := setUp(t) var err error toQ := state.Run{ RunID: "cupcake", } qm.Enqueue(context.Background(), "A", toQ) err = qm.Enqueue(context.Background(), "", toQ) if err == nil { t.Errorf("Expected empty queue url to result in error") } } func TestSQSManager_QurlFor(t *testing.T) { qm := setUp(t) testClient := testSQSClient{t: t} qm.qc = &testClient expectedCalls := map[string]bool{ "GetQueueUrl": true, } qm.QurlFor("cupcake", true) if len(testClient.calls) != len(expectedCalls) { t.Errorf( "Expected exactly %v calls for existing queue, but was %v", len(expectedCalls), len(testClient.calls)) } for _, call := range testClient.calls { _, ok := expectedCalls[call] if !ok { t.Errorf("Unexpected call for existing queue [%v]", call) } } testClient = testSQSClient{t: t} qm.qc = &testClient expectedCalls = map[string]bool{ "GetQueueUrl": true, "CreateQueue": true, } qm.QurlFor("nope", true) if len(testClient.calls) != len(expectedCalls) { t.Errorf( "Expected exactly %v calls for non-existing queue, but was %v", len(expectedCalls), len(testClient.calls)) } for _, call := range testClient.calls { _, ok := expectedCalls[call] if !ok { t.Errorf("Unexpected call for non-existing queue [%v]", call) } } } func TestSQSManager_ReceiveRun(t *testing.T) { qm := setUp(t) receipt, _ := qm.ReceiveRun(context.Background(), "A") receipt.Done() } func TestSQSManager_ReceiveStatus(t *testing.T) { qm := setUp(t) receipt, _ := qm.ReceiveStatus("statusQ") receipt.Done() } ================================================ FILE: services/definition.go ================================================ package services import ( "context" "fmt" "github.com/stitchfix/flotilla-os/exceptions" "github.com/stitchfix/flotilla-os/state" "strings" ) // // DefinitionService defines an interface for operations involving // definitions // * Like the ExecutionService, is an intermediary layer between state and the execution engine // type DefinitionService interface { Create(ctx context.Context, definition *state.Definition) (state.Definition, error) Get(ctx context.Context, definitionID string) (state.Definition, error) GetByAlias(ctx context.Context, alias string) (state.Definition, error) List(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (state.DefinitionList, error) Update(ctx context.Context, definitionID string, updates state.Definition) (state.Definition, error) Delete(ctx context.Context, definitionID string) error // Metadata oriented ListGroups(ctx context.Context, limit int, offset int, name *string) (state.GroupsList, error) ListTags(ctx context.Context, limit int, offset int, name *string) (state.TagsList, error) } type definitionService struct { sm state.Manager } // // NewDefinitionService configures and returns a DefinitionService // func 
NewDefinitionService(stateManager state.Manager) (DefinitionService, error) {
	ds := definitionService{sm: stateManager}
	return &ds, nil
}

//
// Create fully initializes and saves the new definition
// * Allocates new definition id
// * Defines definition with execution engine
// * Stores definition using state manager
//
func (ds *definitionService) Create(ctx context.Context, definition *state.Definition) (state.Definition, error) {
	if valid, reasons := definition.IsValid(); !valid {
		return state.Definition{}, exceptions.MalformedInput{strings.Join(reasons, "\n")}
	}
	exists, err := ds.aliasExists(ctx, definition.Alias)
	if err != nil {
		return state.Definition{}, err
	}
	if exists {
		return state.Definition{}, exceptions.ConflictingResource{
			fmt.Sprintf("definition with alias [%s] already exists", definition.Alias)}
	}

	// Attach definition id here
	definitionID, err := state.NewDefinitionID(*definition)
	if err != nil {
		return state.Definition{}, err
	}
	definition.DefinitionID = definitionID
	return *definition, ds.sm.CreateDefinition(ctx, *definition)
}

func (ds *definitionService) aliasExists(ctx context.Context, alias string) (bool, error) {
	// Short circuit to check if the alias already exists
	dl, err := ds.sm.ListDefinitions(
		ctx, 1024, 0, "alias", "asc", map[string][]string{"alias": {alias}}, nil)
	if err != nil {
		return false, err
	}
	for _, def := range dl.Definitions {
		if def.Alias == alias {
			return true, nil
		}
	}
	return false, nil
}

//
// Get returns the definition specified by definitionID
//
func (ds *definitionService) Get(ctx context.Context, definitionID string) (state.Definition, error) {
	return ds.sm.GetDefinition(ctx, definitionID)
}

func (ds *definitionService) GetByAlias(ctx context.Context, alias string) (state.Definition, error) {
	return ds.sm.GetDefinitionByAlias(ctx, alias)
}

// List lists definitions
func (ds *definitionService) List(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (state.DefinitionList, error) {
	return ds.sm.ListDefinitions(ctx, limit, offset, sortBy, order, filters, envFilters)
}

// Update updates the definition specified by definitionID with the given updates
func (ds *definitionService) Update(ctx context.Context, definitionID string, updates state.Definition) (state.Definition, error) {
	definition, err := ds.sm.GetDefinition(ctx, definitionID)
	if err != nil {
		return definition, err
	}
	definition.UpdateWith(updates)
	return ds.sm.UpdateDefinition(ctx, definitionID, definition)
}

// Delete deletes and deregisters the definition specified by definitionID
func (ds *definitionService) Delete(ctx context.Context, definitionID string) error {
	return ds.sm.DeleteDefinition(ctx, definitionID)
}

func (ds *definitionService) ListGroups(ctx context.Context, limit int, offset int, name *string) (state.GroupsList, error) {
	return ds.sm.ListGroups(ctx, limit, offset, name)
}

func (ds *definitionService) ListTags(ctx context.Context, limit int, offset int, name *string) (state.TagsList, error) {
	return ds.sm.ListTags(ctx, limit, offset, name)
}

================================================
FILE: services/definition_test.go
================================================
package services

import (
	"context"
	"github.com/stitchfix/flotilla-os/state"
	"github.com/stitchfix/flotilla-os/testutils"
	"testing"
)

func setUpDefinitionServiceTest(t *testing.T) (DefinitionService, *testutils.ImplementsAllTheThings) {
	imp := testutils.ImplementsAllTheThings{
		T: t,
		Definitions: map[string]state.Definition{
			"A": {DefinitionID: "A"},
			"B": {DefinitionID: "B"},
			"C": {DefinitionID: "C", ExecutableResources: state.ExecutableResources{Image: "invalidimage"}},
		},
		Runs: map[string]state.Run{
			"runA": {DefinitionID: "A", ClusterName: "A", GroupName: "A", RunID: "runA"},
			"runB": {DefinitionID: "B", ClusterName: "B", GroupName: "B", RunID: "runB"},
		},
		Qurls: map[string]string{
			"A": "a/",
			"B": "b/",
		},
	}
	ds, _ := NewDefinitionService(&imp)
	return ds, &imp
}

func TestDefinitionService_Create(t *testing.T) {
	ds, imp := setUpDefinitionServiceTest(t)

	// Check that a new definition id is allocated
	// Check that define is called
	// Check that save is called and has the new definition id
	memory := int64(512)
	newValidDef := state.Definition{
		Alias:     "cupcake",
		GroupName: "group-cupcake",
		Command:   "echo 'hi'",
		ExecutableResources: state.ExecutableResources{
			Image:  "image:cupcake",
			Memory: &memory,
		},
	}
	created, _ := ds.Create(context.Background(), &newValidDef)
	if len(created.DefinitionID) == 0 {
		t.Errorf("Expected non-empty definition id")
	}

	// order matters
	expected := []string{"ListDefinitions", "CreateDefinition"}
	if len(imp.Calls) != len(expected) {
		t.Errorf("Unexpected number of create calls, expected %v but was %v", len(expected), len(imp.Calls))
	}
	for i, call := range imp.Calls {
		if expected[i] != call {
			t.Errorf("Expected call %v to be %s but was %s", i, expected[i], call)
		}
	}

	// Check that the saved definition is the one with the id
	_, ok := imp.Definitions[created.DefinitionID]
	if !ok {
		t.Errorf("Expected that definition with id %s would be saved in state manager", created.DefinitionID)
	}
}

func TestDefinitionService_Create2(t *testing.T) {
	// Check that invalid definitions return errors
	ds, _ := setUpDefinitionServiceTest(t)
	var err error

	memory := int64(512)
	invalid4 := state.Definition{
		Alias:               "cupcake",
		GroupName:           "group-cupcake",
		ExecutableResources: state.ExecutableResources{Memory: &memory},
	}
	_, err = ds.Create(context.Background(), &invalid4)
	if err == nil {
		t.Errorf("Expected invalid definition with no image to result in error")
	}
}

func TestDefinitionService_Update(t *testing.T) {
	ds, imp := setUpDefinitionServiceTest(t)

	memory := int64(512)
	d := state.Definition{
		ExecutableResources: state.ExecutableResources{Memory: &memory},
	}
	ds.Update(context.Background(), "A", d)

	// order matters
	expected := []string{"GetDefinition", "UpdateDefinition"}
	if len(imp.Calls) != len(expected) {
		t.Errorf("Unexpected number of update calls, expected %v but was %v", len(expected), len(imp.Calls))
	}
	for i, call := range imp.Calls {
		if expected[i] != call {
			t.Errorf("Expected call %v to be %s but was %s", i, expected[i], call)
		}
	}
}

func TestDefinitionService_Delete(t *testing.T) {
	ds, imp := setUpDefinitionServiceTest(t)

	ds.Delete(context.Background(), "A")

	// order matters
	expected := []string{"DeleteDefinition"}
	if len(imp.Calls) != len(expected) {
		t.Errorf("Unexpected number of delete calls, expected %v but was %v", len(expected), len(imp.Calls))
	}
	for i, call := range imp.Calls {
		if expected[i] != call {
			t.Errorf("Expected call %v to be %s but was %s", i, expected[i], call)
		}
	}
}

================================================
FILE: services/execution.go
================================================
package services

import (
	"context"
	"crypto/md5"
	"encoding/json"
	"errors"
	"fmt"
	"math/rand"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"time"

	"github.com/stitchfix/flotilla-os/utils"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/stitchfix/flotilla-os/clients/cluster"
	"github.com/stitchfix/flotilla-os/config"
"github.com/stitchfix/flotilla-os/exceptions" "github.com/stitchfix/flotilla-os/execution/engine" "github.com/stitchfix/flotilla-os/state" ) // ExecutionService interacts with the state manager and queue manager to queue runs, and perform // CRUD operations on them // * Acts as an intermediary layer between state and the execution engine type ExecutionService interface { CreateDefinitionRunByDefinitionID(ctx context.Context, definitionID string, req *state.DefinitionExecutionRequest) (state.Run, error) CreateDefinitionRunByAlias(ctx context.Context, alias string, req *state.DefinitionExecutionRequest) (state.Run, error) List( ctx context.Context, limit int, offset int, sortOrder string, sortField string, filters map[string][]string, envFilters map[string]string) (state.RunList, error) Get(ctx context.Context, runID string) (state.Run, error) UpdateStatus(ctx context.Context, runID string, status string, exitCode *int64, runExceptions *state.RunExceptions, exitReason *string) error Terminate(ctx context.Context, runID string, userInfo state.UserInfo) error ReservedVariables() []string ListClusters(ctx context.Context) ([]state.ClusterMetadata, error) GetDefaultCluster() string GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) CreateTemplateRunByTemplateID(ctx context.Context, templateID string, req *state.TemplateExecutionRequest) (state.Run, error) CreateTemplateRunByTemplateName(ctx context.Context, templateName string, templateVersion string, req *state.TemplateExecutionRequest) (state.Run, error) UpdateClusterMetadata(ctx context.Context, cluster state.ClusterMetadata) error DeleteClusterMetadata(ctx context.Context, clusterID string) error GetClusterByID(ctx context.Context, clusterID string) (state.ClusterMetadata, error) GetRunStatus(ctx context.Context, runID string) (state.RunStatus, error) } type executionService struct { stateManager state.Manager eksClusterClient cluster.Client eksExecutionEngine engine.Engine emrExecutionEngine engine.Engine reservedEnv map[string]func(run state.Run) string eksClusterOverride string eksClusterDefault string eksTierDefault string eksGPUClusterOverride string eksGPUClusterDefault string checkImageValidity bool baseUri string spotReAttemptOverride float32 eksSpotOverride bool spotThresholdMinutes float64 terminateJobChannel chan state.TerminateJob validEksClusters []string //validEksClusterTiers string } func (es *executionService) GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) { ctx, span := utils.TraceJob(ctx, "flotilla.get_events", run.RunID) defer span.Finish() utils.TagJobRun(span, run) return es.eksExecutionEngine.GetEvents(ctx, run) } // NewExecutionService configures and returns an ExecutionService func NewExecutionService(conf config.Config, eksExecutionEngine engine.Engine, sm state.Manager, eksClusterClient cluster.Client, emrExecutionEngine engine.Engine) (ExecutionService, error) { es := executionService{ stateManager: sm, eksClusterClient: eksClusterClient, eksExecutionEngine: eksExecutionEngine, emrExecutionEngine: emrExecutionEngine, } // // Reserved environment variables dynamically generated // per run ownerKey := conf.GetString("owner_id_var") if len(ownerKey) == 0 { ownerKey = "FLOTILLA_RUN_OWNER_ID" } es.validEksClusters = strings.Split(conf.GetString("eks_clusters"), ",") for k, _ := range es.validEksClusters { es.validEksClusters[k] = strings.TrimSpace(es.validEksClusters[k]) } es.eksClusterOverride = conf.GetString("eks_cluster_override") es.eksGPUClusterOverride = 
conf.GetString("eks_gpu_cluster_override") es.eksClusterDefault = conf.GetString("eks_cluster_default") es.eksGPUClusterDefault = conf.GetString("eks_gpu_cluster_default") es.eksTierDefault = conf.GetString("eks_tier_default") //es.validEksClusterTiers = conf.GetString("eks_cluster_tiers") if !slices.Contains(es.validEksClusters, es.eksClusterDefault) || !slices.Contains(es.validEksClusters, es.eksGPUClusterDefault) { return nil, fmt.Errorf("an invalid cluster has been set as a default\nvalid_clusters:%s\neks_cluster_default:%s\neks_gpu_cluster_default:%s", es.validEksClusters, es.eksClusterDefault, es.eksGPUClusterDefault) } if conf.IsSet("check_image_validity") { es.checkImageValidity = conf.GetBool("check_image_validity") } else { es.checkImageValidity = true } if conf.IsSet("base_uri") { es.baseUri = conf.GetString("base_uri") } if conf.IsSet("eks_spot_reattempt_override") { es.spotReAttemptOverride = float32(conf.GetFloat64("eks_spot_reattempt_override")) } else { // defaults to 5% override. es.spotReAttemptOverride = float32(0.05) } if conf.IsSet("eks_spot_override") { es.eksSpotOverride = conf.GetBool("eks_spot_override") } else { es.eksSpotOverride = false } if conf.IsSet("eks_spot_threshold_minutes") { es.spotThresholdMinutes = conf.GetFloat64("eks_spot_threshold_minutes") } else { es.spotThresholdMinutes = 30.0 } es.reservedEnv = map[string]func(run state.Run) string{ "FLOTILLA_SERVER_MODE": func(run state.Run) string { return conf.GetString("flotilla_mode") }, "FLOTILLA_RUN_ID": func(run state.Run) string { return run.RunID }, "AWS_ROLE_SESSION_NAME": func(run state.Run) string { return run.RunID }, ownerKey: func(run state.Run) string { return run.User }, } es.terminateJobChannel = make(chan state.TerminateJob, 100) return &es, nil } // ReservedVariables returns the list of reserved run environment variable // names func (es *executionService) ReservedVariables() []string { var keys []string for k := range es.reservedEnv { keys = append(keys, k) } return keys } // Create constructs and queues a new Run on the cluster specified. 
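// For orientation, a minimal sketch of calling this API. The definition ID "B",
// the command, and the owner are illustrative values borrowed from the tests in
// execution_test.go below, and `svc` and `ctx` are assumed to be a configured
// ExecutionService and context:
//
//	engine := state.DefaultEngine
//	cmd := "echo hello"
//	req := state.DefinitionExecutionRequest{
//		ExecutionRequestCommon: &state.ExecutionRequestCommon{
//			Command: &cmd,
//			OwnerID: "somebody",
//			Engine:  &engine,
//		},
//	}
//	run, err := svc.CreateDefinitionRunByDefinitionID(ctx, "B", &req)
//	// On success, run.Status is state.StatusQueued and run.RunID is populated.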
func (es *executionService) CreateDefinitionRunByDefinitionID(ctx context.Context, definitionID string, req *state.DefinitionExecutionRequest) (state.Run, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.definition.create_run", "")
	defer span.Finish()
	span.SetTag("definition_id", definitionID)
	// Ensure definition exists
	definition, err := es.stateManager.GetDefinition(ctx, definitionID)
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
		return state.Run{}, err
	}
	return es.createFromDefinition(ctx, definition, req)
}

// CreateDefinitionRunByAlias constructs and queues a new Run on the cluster specified, based on an alias
func (es *executionService) CreateDefinitionRunByAlias(ctx context.Context, alias string, req *state.DefinitionExecutionRequest) (state.Run, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.alias.create_run", "")
	defer span.Finish()
	span.SetTag("alias", alias)
	// Ensure definition exists
	definition, err := es.stateManager.GetDefinitionByAlias(ctx, alias)
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
		return state.Run{}, err
	}
	return es.createFromDefinition(ctx, definition, req)
}

func (es *executionService) createFromDefinition(ctx context.Context, definition state.Definition, req *state.DefinitionExecutionRequest) (state.Run, error) {
	var (
		run state.Run
		err error
	)
	ctx, span := utils.TraceJob(ctx, "flotilla.definition.create_run", run.RunID)
	defer span.Finish()
	fields := req.GetExecutionRequestCommon()
	rand.Seed(time.Now().Unix())
	/*
		The cluster is set based on the following precedence (low to high):
		1. Cluster is passed in from the request
		2. Cluster from cluster metadata and active
		3. Cluster from the task definition
		4. Default cluster from config
		The cluster is then checked for validity. If required, cluster overrides
		should be introduced and set here.
	*/
	clusterMetadata, err := es.ListClusters(ctx)
	var activeClusters []string
	if len(clusterMetadata) > 0 {
		for _, cluster := range clusterMetadata {
			if cluster.Status == state.StatusActive {
				if es.clusterSupportsTier(cluster, req.Tier) {
					activeClusters = append(activeClusters, cluster.Name)
				}
			}
		}
	}
	if req.ClusterName != "" {
		fields.ClusterName = req.ClusterName
	} else if len(activeClusters) > 0 {
		fields.ClusterName = activeClusters[rand.Intn(len(activeClusters))]
	} else if definition.TargetCluster != "" {
		fields.ClusterName = definition.TargetCluster
	} else if fields.Gpu != nil && *fields.Gpu > 0 {
		fields.ClusterName = es.eksGPUClusterDefault
	} else {
		fields.ClusterName = es.eksClusterDefault
	}
	for _, c := range clusterMetadata {
		es.validEksClusters = append(es.validEksClusters, c.Name)
	}
	if !es.isClusterValid(fields.ClusterName) {
		return run, fmt.Errorf("%s was not found in the list of valid clusters: %s", fields.ClusterName, es.validEksClusters)
	}
	span.SetTag("clusterName", fields.ClusterName)
	run.User = req.OwnerID
	es.sanitizeExecutionRequestCommonFields(fields)
	// Construct run object with StatusQueued and new UUID4 run id
	run, err = es.constructRunFromDefinition(ctx, definition, req)
	if err != nil {
		return run, err
	}
	return es.createAndEnqueueRun(ctx, run)
}

func (es *executionService) constructRunFromDefinition(ctx context.Context, definition state.Definition, req *state.DefinitionExecutionRequest) (state.Run, error) {
	run, err := es.constructBaseRunFromExecutable(ctx, definition, req)
	if err != nil {
		return run, err
	}
	run.DefinitionID = definition.DefinitionID
	run.Alias = definition.Alias
	queuedAt := time.Now()
	run.QueuedAt = &queuedAt
	run.GroupName = definition.GroupName
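	// The run inherits RequiresDocker from the definition below; optional
	// request-scoped metadata (description, idempotence key, arch, labels)
	// is copied onto the run only when the caller supplied it.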
run.RequiresDocker = definition.RequiresDocker if req.Description != nil { run.Description = req.Description } if req.IdempotenceKey != nil { run.IdempotenceKey = req.IdempotenceKey } if req.Arch != nil { run.Arch = req.Arch } if req.Labels != nil { run.Labels = *req.Labels } return run, nil } func (es *executionService) constructBaseRunFromExecutable(ctx context.Context, executable state.Executable, req state.ExecutionRequest) (state.Run, error) { resources := executable.GetExecutableResources() fields := req.GetExecutionRequestCommon() var ( run state.Run err error ) fields.Engine = req.GetExecutionRequestCommon().Engine fields.Tier = es.resolveRequestTier(req.GetExecutionRequestCommon().Tier) // Compute the executable command based on the execution request. If the // execution request did not specify an overriding command, use the computed // `executableCmd` as the Run's Command. runID, err := state.NewRunID(fields.Engine) if err != nil { return run, err } if *fields.Engine == state.EKSEngine { executableCmd, err := executable.GetExecutableCommand(req) if err != nil { return run, err } if (fields.Command == nil || len(*fields.Command) == 0) && (len(executableCmd) > 0) { fields.Command = aws.String(executableCmd) } executableID := executable.GetExecutableID() taskExecutionMinutes, _ := es.stateManager.GetTaskHistoricalRuntime(ctx, *executableID, runID) reAttemptRate, _ := es.stateManager.GetPodReAttemptRate(ctx) if reAttemptRate >= es.spotReAttemptOverride && fields.Engine != nil && fields.NodeLifecycle != nil && *fields.Engine == state.EKSEngine && *fields.NodeLifecycle == state.SpotLifecycle { fields.NodeLifecycle = &state.OndemandLifecycle } if taskExecutionMinutes > float32(es.spotThresholdMinutes) { fields.NodeLifecycle = &state.OndemandLifecycle } } if *fields.Engine == state.EKSSparkEngine { if req.GetExecutionRequestCommon().SparkExtension == nil { return run, errors.New("spark_extension can't be nil, when using eks-spark engine type") } fields.SparkExtension = req.GetExecutionRequestCommon().SparkExtension reAttemptRate, _ := es.stateManager.GetPodReAttemptRate(ctx) if reAttemptRate >= es.spotReAttemptOverride { fields.NodeLifecycle = &state.OndemandLifecycle } } if fields.NodeLifecycle == nil { fields.NodeLifecycle = &state.SpotLifecycle } // Calculate command_hash from actual command (FIX for ARA bug) // This ensures jobs with different commands have different hashes, // even if they share the same description. 
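	// A worked example (values taken from the tests in execution_test.go): with a
	// shared description "Daily processing job",
	//
	//	fmt.Sprintf("%x", md5.Sum([]byte("python process.py --date 2025-01-01")))
	//	fmt.Sprintf("%x", md5.Sum([]byte("python process.py --date 2025-01-02")))
	//
	// now yield two different hashes, whereas hashing the shared description
	// would have produced the same value for both runs.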
	if fields.Command != nil && len(*fields.Command) > 0 {
		// Regular EKS jobs: Hash the command
		fields.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*fields.Command))))
	} else if *fields.Engine == state.EKSSparkEngine && fields.Description != nil && len(*fields.Description) > 0 {
		// Spark jobs: Fall back to description (Spark jobs don't have commands)
		// The Spark "command" is in spark_extension, not the command field
		// Description uniquely identifies the Spark job type for ARA tracking
		fields.CommandHash = aws.String(fmt.Sprintf("%x", md5.Sum([]byte(*fields.Description))))
	}
	// If both command and description are NULL, command_hash remains NULL (malformed job)

	run = state.Run{
		RunID:                 runID,
		ClusterName:           fields.ClusterName,
		Image:                 resources.Image,
		Status:                state.StatusQueued,
		User:                  fields.OwnerID,
		Command:               fields.Command,
		Memory:                fields.Memory,
		Cpu:                   fields.Cpu,
		Gpu:                   fields.Gpu,
		Engine:                fields.Engine,
		NodeLifecycle:         fields.NodeLifecycle,
		EphemeralStorage:      fields.EphemeralStorage,
		ExecutableID:          executable.GetExecutableID(),
		ExecutableType:        executable.GetExecutableType(),
		ActiveDeadlineSeconds: fields.ActiveDeadlineSeconds,
		TaskType:              state.DefaultTaskType,
		SparkExtension:        fields.SparkExtension,
		CommandHash:           fields.CommandHash,
		ServiceAccount:        fields.ServiceAccount,
		Tier:                  fields.Tier,
	}
	if fields.Labels != nil {
		run.Labels = *fields.Labels
	}
	runEnv := es.constructEnviron(run, fields.Env)
	run.Env = &runEnv
	return run, nil
}

func (es *executionService) constructEnviron(run state.Run, env *state.EnvList) state.EnvList {
	size := len(es.reservedEnv)
	if env != nil {
		size += len(*env)
	}
	runEnv := make([]state.EnvVar, size)
	i := 0
	for k, f := range es.reservedEnv {
		runEnv[i] = state.EnvVar{
			Name:  k,
			Value: f(run),
		}
		i++
	}
	if env != nil {
		for j, e := range *env {
			runEnv[i+j] = e
		}
	}
	return state.EnvList(runEnv)
}

// List returns a list of Runs
// * validates definition_id and status filters
func (es *executionService) List(
	ctx context.Context,
	limit int,
	offset int,
	sortOrder string,
	sortField string,
	filters map[string][]string,
	envFilters map[string]string) (state.RunList, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.list_runs", "")
	defer span.Finish()
	span.SetTag("limit", limit)
	span.SetTag("offset", offset)

	// If definition_id is present in filters, validate its
	// existence first
	definitionID, ok := filters["definition_id"]
	if ok {
		_, err := es.stateManager.GetDefinition(ctx, definitionID[0])
		if err != nil {
			return state.RunList{}, err
		}
	}

	if statusFilters, ok := filters["status"]; ok {
		for _, status := range statusFilters {
			if !state.IsValidStatus(status) {
				// Status filter is invalid
				err := exceptions.MalformedInput{
					ErrorString: fmt.Sprintf("invalid status [%s]", status)}
				return state.RunList{}, err
			}
		}
	}
	return es.stateManager.ListRuns(ctx, limit, offset, sortField, sortOrder, filters, envFilters, []string{state.EKSEngine, state.EKSSparkEngine})
}

// Get returns the run with the given runID
func (es *executionService) Get(ctx context.Context, runID string) (state.Run, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.get_run", runID)
	defer span.Finish()
	span.SetTag("run_id", runID)
	run, err := es.stateManager.GetRun(ctx, runID)
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
	}
	return run, err
}

// UpdateStatus is for supporting some legacy runs that still manually update their status
func (es *executionService) UpdateStatus(ctx context.Context, runID string, status string, exitCode *int64, runExceptions *state.RunExceptions, exitReason *string) error {
	ctx, span := utils.TraceJob(ctx, "flotilla.update_status", runID)
	defer span.Finish()
	span.SetTag("run_id", runID)
	span.SetTag("status", status)
	if !state.IsValidStatus(status) {
		return exceptions.MalformedInput{ErrorString: fmt.Sprintf("status %s is invalid", status)}
	}
	run, err := es.stateManager.GetRun(ctx, runID)
	if err != nil {
		return err
	}
	var startedAt *time.Time
	if run.StartedAt == nil {
		startedAt = run.QueuedAt
	} else {
		startedAt = run.StartedAt
	}
	finishedAt := time.Now()
	if exitReason == nil {
		extractedExitReason := es.extractExitReason(runExceptions)
		exitReason = &extractedExitReason
	}
	_, err = es.stateManager.UpdateRun(ctx, runID, state.Run{Status: status, ExitCode: exitCode, ExitReason: exitReason, RunExceptions: runExceptions, FinishedAt: &finishedAt, StartedAt: startedAt})
	return err
}

func (es *executionService) extractExitReason(runExceptions *state.RunExceptions) string {
	connectionError := regexp.MustCompile(`(?i).*(timeout|gatewayerror|socketerror|\s503\s|\s502\s|\s500\s|\s504\s|connectionerror).*`)
	pipError := regexp.MustCompile(`(?i).*(could\snot\sfind\sa\sversion|package\snot\sfound|ModuleNotFoundError|No\smatching\sdistribution\sfound).*`)
	yumError := regexp.MustCompile(`(?i).*(Nothing\sto\sdo).*`)
	gitError := regexp.MustCompile(`(?i).*(Could\snot\sread\sfrom\sremote\srepository|correct\saccess\srights|Repository\snot\sfound).*`)
	argumentError := regexp.MustCompile(`(?i).*(404|400|keyerror|column\smissing|RuntimeError).*`)
	syntaxError := regexp.MustCompile(`(?i).*(syntaxerror|typeerror).*`)
	value, _ := json.Marshal(runExceptions)
	if value != nil {
		errorMsg := string(value)
		switch {
		case connectionError.MatchString(errorMsg):
			return "Connection error to downstream uri"
		case pipError.MatchString(errorMsg):
			return "Python pip package installation error"
		case yumError.MatchString(errorMsg):
			return "Yum installation error"
		case gitError.MatchString(errorMsg):
			return "Git clone error"
		case argumentError.MatchString(errorMsg):
			return "Data or argument error"
		case syntaxError.MatchString(errorMsg):
			return "Code or syntax error"
		default:
			return "Runtime exception encountered"
		}
	}
	return "Runtime exception encountered"
}

func (es *executionService) terminateWorker(jobChan <-chan state.TerminateJob) {
	ctx := context.Background()
	for job := range jobChan {
		runID := job.RunID
		userInfo := job.UserInfo
		ctx, span := utils.TraceJob(ctx, "flotilla.job.terminate_worker", runID)
		defer span.Finish()
		run, err := es.stateManager.GetRun(ctx, runID)
		if err != nil {
			span.SetTag("error", true)
			span.SetTag("error.msg", err.Error())
			break
		}
		utils.TagJobRun(span, run)
		subRuns, err := es.stateManager.ListRuns(ctx, 1000, 0, "status", "desc", nil, map[string]string{"PARENT_FLOTILLA_RUN_ID": run.RunID}, state.Engines)
		if err == nil && subRuns.Total > 0 {
			for _, subRun := range subRuns.Runs {
				es.terminateJobChannel <- state.TerminateJob{
					RunID:    subRun.RunID,
					UserInfo: job.UserInfo,
				}
			}
		}
		if run.Engine == nil {
			run.Engine = &state.EKSEngine
		}
		if run.Status != state.StatusStopped {
			if *run.Engine == state.EKSSparkEngine {
				err = es.emrExecutionEngine.Terminate(ctx, run)
			} else {
				err = es.eksExecutionEngine.Terminate(ctx, run)
			}
			exitReason := "Task terminated by user"
			if len(userInfo.Email) > 0 {
				exitReason = fmt.Sprintf("Task terminated by - %s", userInfo.Email)
			}
			exitCode := int64(1)
			finishedAt := time.Now()
			_, err = es.stateManager.UpdateRun(ctx, run.RunID, state.Run{
				Status:     state.StatusStopped,
				ExitReason: &exitReason,
				ExitCode:   &exitCode,
				FinishedAt: &finishedAt,
			})
			break
		}
		break
	}
}
// Terminate stops the run with the given runID
func (es *executionService) Terminate(ctx context.Context, runID string, userInfo state.UserInfo) error {
	ctx, span := utils.TraceJob(ctx, "flotilla.terminate_run", runID)
	defer span.Finish()
	span.SetTag("run_id", runID)
	if userInfo.Email != "" {
		span.SetTag("user.email", userInfo.Email)
	}
	es.terminateJobChannel <- state.TerminateJob{RunID: runID, UserInfo: userInfo}
	go es.terminateWorker(es.terminateJobChannel)
	return nil
}

// ListClusters returns a list of all execution clusters available with their metadata
func (es *executionService) ListClusters(ctx context.Context) ([]state.ClusterMetadata, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.list_clusters", "")
	defer span.Finish()
	clusters, err := es.stateManager.ListClusterStates(ctx)
	if err != nil {
		return nil, err
	}
	return clusters, nil
}

func (es *executionService) GetDefaultCluster() string {
	return es.eksClusterDefault
}

// sanitizeExecutionRequestCommonFields does what its name implies - sanitizes
func (es *executionService) sanitizeExecutionRequestCommonFields(fields *state.ExecutionRequestCommon) {
	if fields.Engine == nil {
		fields.Engine = &state.EKSEngine
	}
	if es.eksSpotOverride {
		fields.NodeLifecycle = &state.OndemandLifecycle
	}
	if fields.ActiveDeadlineSeconds == nil {
		if fields.NodeLifecycle != nil && *fields.NodeLifecycle == state.OndemandLifecycle {
			fields.ActiveDeadlineSeconds = &state.OndemandActiveDeadlineSeconds
		} else {
			fields.ActiveDeadlineSeconds = &state.SpotActiveDeadlineSeconds
		}
	}
}

// createAndEnqueueRun creates a run object in the DB, enqueues it, then
// updates the db's run object with a new `queued_at` field.
func (es *executionService) createAndEnqueueRun(ctx context.Context, run state.Run) (state.Run, error) {
	var err error
	ctx, span := utils.TraceJob(ctx, "flotilla.job.create_and_enqueue", "")
	defer span.Finish()
	span.SetTag("job.run_id", run.RunID)
	utils.TagJobRun(span, run)
	if run.IdempotenceKey != nil {
		priorRunId, err := es.stateManager.CheckIdempotenceKey(ctx, *run.IdempotenceKey)
		if err == nil && len(priorRunId) > 0 {
			priorRun, err := es.Get(ctx, priorRunId)
			if err == nil {
				return priorRun, nil
			}
		}
	}
	// Save run to source of state - it is *CRITICAL* to do this
	// -before- queuing to avoid processing unsaved runs
	if err = es.stateManager.CreateRun(ctx, run); err != nil {
		return run, err
	}
	if *run.Engine == state.EKSEngine {
		err = es.eksExecutionEngine.Enqueue(ctx, run)
	} else {
		err = es.emrExecutionEngine.Enqueue(ctx, run)
	}
	queuedAt := time.Now()
	if err != nil {
		return run, err
	}
	// Update the run's QueuedAt field
	if run, err = es.stateManager.UpdateRun(ctx, run.RunID, state.Run{QueuedAt: &queuedAt}); err != nil {
		return run, err
	}
	return run, nil
}

func (es *executionService) CreateTemplateRunByTemplateName(ctx context.Context, templateName string, templateVersion string, req *state.TemplateExecutionRequest) (state.Run, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.template.create_run_by_name", "")
	defer span.Finish()
	span.SetTag("template_name", templateName)
	span.SetTag("template_version", templateVersion)
	version, err := strconv.Atoi(templateVersion)
	if err != nil {
		// version is not an integer - fetch the "latest" template
		fetch, template, err := es.stateManager.GetLatestTemplateByTemplateName(ctx, templateName)
		if fetch && err == nil {
			return es.CreateTemplateRunByTemplateID(ctx, template.TemplateID, req)
		}
	} else {
		fetch, template, err := es.stateManager.GetTemplateByVersion(ctx, templateName, int64(version))
		if fetch && err == nil {
			return es.CreateTemplateRunByTemplateID(ctx, template.TemplateID, req)
		}
	}
	return state.Run{}, fmt.Errorf("invalid template name or version, template_name: %s, template_version: %s", templateName, templateVersion)
}

// CreateTemplateRunByTemplateID constructs and queues a new Run on the cluster specified.
func (es *executionService) CreateTemplateRunByTemplateID(ctx context.Context, templateID string, req *state.TemplateExecutionRequest) (state.Run, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.template.create_run_by_id", "")
	defer span.Finish()
	span.SetTag("template_id", templateID)
	// Ensure template exists
	template, err := es.stateManager.GetTemplateByID(ctx, templateID)
	if err != nil {
		return state.Run{}, err
	}
	return es.createFromTemplate(ctx, template, req)
}

func (es *executionService) createFromTemplate(ctx context.Context, template state.Template, req *state.TemplateExecutionRequest) (state.Run, error) {
	var (
		run state.Run
		err error
	)
	fields := req.GetExecutionRequestCommon()
	es.sanitizeExecutionRequestCommonFields(fields)
	// Construct run object with StatusQueued and new UUID4 run id
	run, err = es.constructRunFromTemplate(ctx, template, req)
	if err != nil {
		return run, err
	}
	if !req.DryRun {
		return es.createAndEnqueueRun(ctx, run)
	}
	return run, nil
}

func (es *executionService) constructRunFromTemplate(ctx context.Context, template state.Template, req *state.TemplateExecutionRequest) (state.Run, error) {
	run, err := es.constructBaseRunFromExecutable(ctx, template, req)
	if err != nil {
		return run, err
	}
	run.DefinitionID = template.TemplateID
	run.Alias = template.TemplateID
	run.GroupName = "template_group_name"
	run.ExecutionRequestCustom = req.GetExecutionRequestCustom()
	return run, nil
}

// resolveRequestTier returns the requested tier or default tier if empty
func (es *executionService) resolveRequestTier(requestedTier state.Tier) state.Tier {
	if requestedTier == "" {
		return state.Tier(es.eksTierDefault)
	}
	return requestedTier
}

// clusterSupportsTier checks if a cluster supports the specified tier
func (es *executionService) clusterSupportsTier(cluster state.ClusterMetadata, requestedTier state.Tier) bool {
	resolvedTier := es.resolveRequestTier(requestedTier)
	for _, allowedTier := range cluster.AllowedTiers {
		if allowedTier == string(resolvedTier) {
			return true
		}
	}
	return false
}

func (es *executionService) isClusterValid(clusterName string) bool {
	return slices.Contains(es.validEksClusters, clusterName)
}

func (es *executionService) UpdateClusterMetadata(ctx context.Context, cluster state.ClusterMetadata) error {
	ctx, span := utils.TraceJob(ctx, "flotilla.update_cluster_metadata", cluster.Name)
	defer span.Finish()
	span.SetTag("cluster_name", cluster.Name)
	return es.stateManager.UpdateClusterMetadata(ctx, cluster)
}

func (es *executionService) DeleteClusterMetadata(ctx context.Context, clusterID string) error {
	ctx, span := utils.TraceJob(ctx, "flotilla.delete_cluster_metadata", clusterID)
	defer span.Finish()
	span.SetTag("cluster_id", clusterID)
	return es.stateManager.DeleteClusterMetadata(ctx, clusterID)
}

func (es *executionService) GetClusterByID(ctx context.Context, clusterID string) (state.ClusterMetadata, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.get_cluster_by_id", clusterID)
	defer span.Finish()
	span.SetTag("cluster_id", clusterID)
	return es.stateManager.GetClusterByID(ctx, clusterID)
}

// GetRunStatus fetches only the essential status information for a run
func (es *executionService) GetRunStatus(ctx context.Context, runID string) (state.RunStatus, error) {
	ctx, span := utils.TraceJob(ctx, "flotilla.get_run_status", runID)
	defer span.Finish()
	span.SetTag("run_id", runID)
	return es.stateManager.GetRunStatus(ctx, runID)
}

================================================
FILE: services/execution_test.go
================================================
package services

import (
	"context"
	"crypto/md5"
	"fmt"
	"log"
	"testing"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/state"
	"github.com/stitchfix/flotilla-os/testutils"
)

func setUp(t *testing.T) (ExecutionService, *testutils.ImplementsAllTheThings) {
	confDir := "../conf"
	c, _ := config.NewConfig(&confDir)
	imp := testutils.ImplementsAllTheThings{
		T: t,
		Definitions: map[string]state.Definition{
			"A": {DefinitionID: "A", Alias: "aliasA"},
			"B": {DefinitionID: "B", Alias: "aliasB"},
			"C": {DefinitionID: "C", Alias: "aliasC", ExecutableResources: state.ExecutableResources{Image: "invalidimage"}},
		},
		Runs: map[string]state.Run{
			"runA": {DefinitionID: "A", ClusterName: "A", GroupName: "A", RunID: "runA"},
			"runB": {DefinitionID: "B", ClusterName: "B", GroupName: "B", RunID: "runB"},
		},
		Qurls: map[string]string{
			"A": "a/",
			"B": "b/",
		},
		ClusterStates: []state.ClusterMetadata{
			{Name: "cluster1", Status: state.StatusActive, StatusReason: "Active and healthy"},
			{Name: "cluster2", Status: state.StatusActive, StatusReason: "Active and healthy"},
		},
	}
	es, err := NewExecutionService(c, &imp, &imp, &imp, &imp)
	if err != nil {
		log.Fatalf("error setting up execution service: %s", err.Error())
	}
	return es, &imp
}

func TestExecutionService_CreateDefinitionRunByDefinitionID(t *testing.T) {
	ctx := context.Background()
	// Tests valid create
	es, imp := setUp(t)
	env := &state.EnvList{
		{Name: "K1", Value: "V1"},
	}
	expectedCalls := map[string]bool{
		"GetDefinition":            true,
		"CreateRun":                true,
		"UpdateRun":                true,
		"GetTaskHistoricalRuntime": true,
		"GetPodReAttemptRate":      true,
		"Enqueue":                  true,
		"ListClusterStates":        true,
	}
	cmd := "_test_cmd_"
	sa := "fooAccount"
	cpu := int64(512)
	engine := state.DefaultEngine
	req := state.DefinitionExecutionRequest{
		ExecutionRequestCommon: &state.ExecutionRequestCommon{
			ClusterName:      "clusta",
			Env:              env,
			OwnerID:          "somebody",
			Command:          &cmd,
			Memory:           nil,
			Cpu:              &cpu,
			Engine:           &engine,
			EphemeralStorage: nil,
			NodeLifecycle:    nil,
			IdempotenceKey:   nil,
			Arch:             nil,
			ServiceAccount:   &sa,
		},
	}
	run, err := es.CreateDefinitionRunByDefinitionID(ctx, "B", &req)
	if err != nil {
		t.Error(err.Error())
	}
	if len(imp.Calls) != len(expectedCalls) {
		t.Errorf("Expected exactly %v calls during run creation but was: %v", len(expectedCalls), len(imp.Calls))
	}
	for _, call := range imp.Calls {
		_, ok := expectedCalls[call]
		if !ok {
			t.Errorf("Unexpected call during run creation: %s", call)
		}
	}
	if len(run.RunID) == 0 {
		t.Errorf("Expected Create to populate run with non-empty RunID")
	}
	if run.DefinitionID != "B" {
		t.Errorf("Expected definitionID 'B' but was '%s'", run.DefinitionID)
	}
	if run.Status != state.StatusQueued {
		t.Errorf("Expected new run to have status '%s' but was '%s'", state.StatusQueued, run.Status)
	}
	if run.User != "somebody" {
		t.Errorf("Expected new run to have user 'somebody' but was '%s'", run.User)
	}
	if run.QueuedAt == nil {
		t.Errorf("Expected new run to have a 'queued_at' field but was nil.")
	}
	if run.Env == nil {
		t.Errorf("Expected non-nil environment")
	}
	if len(*run.Env) != (len(es.ReservedVariables()) + len(*env)) {
		t.Errorf("Unexpected number of environment variables; expected %v but was %v", len(es.ReservedVariables())+len(*env), len(*run.Env))
	}
	if run.Command == nil {
		t.Errorf("Expected non-nil command")
	} else {
		if *run.Command != cmd {
			t.Errorf("Unexpected command, found [%s], expecting [%s]", *run.Command, cmd)
		}
	}
	if run.Cpu == nil {
		t.Errorf("Expected non-nil cpu")
	} else {
		if *run.Cpu != cpu {
			t.Errorf("Unexpected cpu, found [%d], expecting [%d]", *run.Cpu, cpu)
		}
	}
	if run.ServiceAccount == nil {
		t.Errorf("Expected non-nil service account")
	} else {
		if *run.ServiceAccount != sa {
			t.Errorf("Unexpected service account, found [%s], expecting [%s]", *run.ServiceAccount, sa)
		}
	}
	includesExpected := false
	for _, e := range *run.Env {
		if e.Name == "K1" && e.Value == "V1" {
			includesExpected = true
		}
	}
	if !includesExpected {
		t.Errorf("Expected K1:V1 in run environment")
	}
}

func TestExecutionService_CreateDefinitionRunByAlias(t *testing.T) {
	ctx := context.Background()
	// Tests valid create
	es, imp := setUp(t)
	env := &state.EnvList{
		{Name: "K1", Value: "V1"},
	}
	expectedCalls := map[string]bool{
		"GetDefinitionByAlias":     true,
		"CreateRun":                true,
		"UpdateRun":                true,
		"GetTaskHistoricalRuntime": true,
		"GetPodReAttemptRate":      true,
		"Enqueue":                  true,
		"ListClusterStates":        true,
	}
	mem := int64(1024)
	engine := state.DefaultEngine
	req := state.DefinitionExecutionRequest{
		ExecutionRequestCommon: &state.ExecutionRequestCommon{
			ClusterName:      "",
			Env:              env,
			OwnerID:          "somebody",
			Command:          nil,
			Memory:           &mem,
			Cpu:              nil,
			Engine:           &engine,
			EphemeralStorage: nil,
			NodeLifecycle:    nil,
			IdempotenceKey:   nil,
			Arch:             nil,
		},
	}
	run, err := es.CreateDefinitionRunByAlias(ctx, "aliasB", &req)
	if err != nil {
		t.Error(err.Error())
	}
	if len(imp.Calls) != len(expectedCalls) {
		t.Errorf("Expected exactly %v calls during run creation but was: %v", len(expectedCalls), len(imp.Calls))
	}
	for _, call := range imp.Calls {
		_, ok := expectedCalls[call]
		if !ok {
			t.Errorf("Unexpected call during run creation: %s", call)
		}
	}
	if len(run.RunID) == 0 {
		t.Errorf("Expected Create to populate run with non-empty RunID")
	}
	if run.DefinitionID != "B" {
		t.Errorf("Expected definitionID 'B' but was '%s'", run.DefinitionID)
	}
	if run.Status != state.StatusQueued {
		t.Errorf("Expected new run to have status '%s' but was '%s'", state.StatusQueued, run.Status)
	}
	if run.User != "somebody" {
		t.Errorf("Expected new run to have user 'somebody' but was '%s'", run.User)
	}
	if run.QueuedAt == nil {
		t.Errorf("Expected new run to have a 'queued_at' field but was nil.")
	}
	if run.Env == nil {
		t.Errorf("Expected non-nil environment")
	}
	if len(*run.Env) != (len(es.ReservedVariables()) + len(*env)) {
		t.Errorf("Unexpected number of environment variables; expected %v but was %v", len(es.ReservedVariables())+len(*env), len(*run.Env))
	}
	if run.Memory == nil {
		t.Errorf("Expected non-nil memory")
	} else {
		if *run.Memory != mem {
			t.Errorf("Unexpected memory, found [%d], expecting [%d]", *run.Memory, mem)
		}
	}
	includesExpected := false
	for _, e := range *run.Env {
		if e.Name == "K1" && e.Value == "V1" {
			includesExpected = true
		}
	}
	if !includesExpected {
		t.Errorf("Expected K1:V1 in run environment")
	}
}

func TestExecutionService_List(t *testing.T) {
	ctx := context.Background()
	es, imp := setUp(t)
	es.List(ctx, 1, 0, "asc", "cluster_name", nil, nil)
	expectedCalls := map[string]bool{
		"ListRuns": true,
	}
	if len(imp.Calls) != len(expectedCalls) {
		t.Errorf("Expected exactly %v calls during run list with no filters but was: %v", len(expectedCalls), len(imp.Calls))
	}
	for _, call := range imp.Calls {
		_, ok := expectedCalls[call]
		if !ok {
t.Errorf("Unexpected call during run list with no filters: %s", call) } } } func TestExecutionService_List2(t *testing.T) { ctx := context.Background() es, imp := setUp(t) es.List( ctx, 1, 0, "asc", "cluster_name", map[string][]string{"definition_id": {"A"}}, nil) expectedCalls := map[string]bool{ "GetDefinition": true, "ListRuns": true, } if len(imp.Calls) != len(expectedCalls) { t.Errorf("Expected exactly %v calls during run list with no filters but was: %v", len(expectedCalls), len(imp.Calls)) } for _, call := range imp.Calls { _, ok := expectedCalls[call] if !ok { t.Errorf("Unexpected call during run list with no filters: %s", call) } } } func TestExecutionService_ListClusters(t *testing.T) { ctx := context.Background() es, imp := setUp(t) clusters, err := es.ListClusters(ctx) if err != nil { t.Errorf("Expected no error listing clusters, got: %v", err) } expectedCalls := map[string]bool{ "ListClusterStates": true, } for _, call := range imp.Calls { _, ok := expectedCalls[call] if !ok { t.Errorf("Unexpected call during cluster listing: %s", call) } } if len(clusters) != 2 { t.Errorf("Expected 2 clusters, got %d", len(clusters)) } } func TestExecutionService_CreateDefinitionRunWithTier(t *testing.T) { ctx := context.Background() // Set up test environment confDir := "../conf" c, _ := config.NewConfig(&confDir) // Create mock implementation with clusters supporting different tiers imp := testutils.ImplementsAllTheThings{ T: t, Definitions: map[string]state.Definition{ "A": {DefinitionID: "A", Alias: "aliasA"}, }, Runs: map[string]state.Run{}, Qurls: map[string]string{ "A": "a/", }, ClusterStates: []state.ClusterMetadata{ { Name: "prod-cluster", Status: state.StatusActive, StatusReason: "Active and healthy", AllowedTiers: []string{"1", "2"}, }, { Name: "staging-cluster", Status: state.StatusActive, StatusReason: "Active and healthy", AllowedTiers: []string{"3", "4"}, }, { Name: "string-cluster", Status: state.StatusActive, StatusReason: "Active and healthy", AllowedTiers: []string{"tier3", "tier4"}, }, { Name: "unrestricted-cluster", Status: state.StatusActive, StatusReason: "Active and healthy", // No tiers specified - should use default tier }, { Name: "maintenance-cluster", Status: state.StatusMaintenance, StatusReason: "In maintenance", AllowedTiers: []string{"1", "2", "3", "4"}, }, }, } imp.GetRandomClusterName = func(clusters []string) string { if len(clusters) > 0 { return clusters[0] } return "" } es, err := NewExecutionService(c, &imp, &imp, &imp, &imp) if err != nil { t.Fatalf("Error setting up execution service: %s", err.Error()) } // Test cases with different tiers testCases := []struct { name string tier string expectedCluster string }{ { name: "Production tier request", tier: "1", expectedCluster: "prod-cluster", }, { name: "Staging tier request", tier: "3", expectedCluster: "staging-cluster", }, { name: "No tier specified", tier: "", expectedCluster: "staging-cluster", }, { name: "String Tier", tier: "tier3", expectedCluster: "string-cluster", }, { name: "Invalid tier", tier: "nonexistent", expectedCluster: es.GetDefaultCluster(), }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { imp.Calls = make([]string, 0) cmd := "echo test" engine := state.DefaultEngine req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Tier: state.Tier(tc.tier), Command: &cmd, OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Errorf("Error creating run: %s", 
err.Error()) return } // Verify the selected cluster matches expectations if run.ClusterName != tc.expectedCluster { t.Errorf("Expected cluster %s for tier %s, but got %s", tc.expectedCluster, tc.tier, run.ClusterName) } // Verify tier was set correctly if string(run.Tier) != tc.tier && tc.tier != "" { t.Errorf("Expected tier %s, but got %s", tc.tier, string(run.Tier)) } }) } } func TestExecutionService_GetRunStatus(t *testing.T) { ctx := context.Background() es, imp := setUp(t) expectedCalls := map[string]bool{ "GetRunStatus": true, } status, err := es.GetRunStatus(ctx, "runA") if err != nil { t.Errorf("Expected no error when getting status of existing run, got: %s", err.Error()) } if len(imp.Calls) != len(expectedCalls) { t.Errorf("Expected exactly %v calls during status retrieval but was: %v", len(expectedCalls), len(imp.Calls)) } for _, call := range imp.Calls { _, ok := expectedCalls[call] if !ok { t.Errorf("Unexpected call during status retrieval: %s", call) } } if status.RunID != "runA" { t.Errorf("Expected run ID 'runA' but got '%s'", status.RunID) } if status.DefinitionID != "A" { t.Errorf("Expected definition ID 'A' but got '%s'", status.DefinitionID) } if status.ClusterName != "A" { t.Errorf("Expected cluster name 'A' but got '%s'", status.ClusterName) } imp.Calls = []string{} _, err = es.GetRunStatus(ctx, "nonexistent") if err == nil { t.Errorf("Expected error when getting status of non-existent run, got nil") } expectedErrorString := "No run with ID: nonexistent" if err != nil && err.Error() != expectedErrorString { t.Errorf("Expected error message '%s', got '%s'", expectedErrorString, err.Error()) } } func TestExecutionService_CommandHashCalculatedFromCommand(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that command_hash is MD5 of command, not description cmd := "python script.py --arg value" desc := "Different description" engine := state.DefaultEngine req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, Description: &desc, OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } // Verify command_hash is MD5 of command expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(cmd))) if run.CommandHash == nil { t.Errorf("Expected non-nil command_hash") } else if *run.CommandHash != expectedHash { t.Errorf("Expected command_hash to be MD5 of command '%s', got '%s'", expectedHash, *run.CommandHash) } // Verify it's NOT MD5 of description descHash := fmt.Sprintf("%x", md5.Sum([]byte(desc))) if run.CommandHash != nil && *run.CommandHash == descHash { t.Errorf("command_hash should NOT be MD5 of description (that was the bug!)") } } func TestExecutionService_CommandHashWithSameDescriptionDifferentCommands(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that different commands get different hashes even with same description description := "Daily processing job" cmd1 := "python process.py --date 2025-01-01" cmd2 := "python process.py --date 2025-01-02" engine := state.DefaultEngine req1 := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd1, Description: &description, OwnerID: "testuser", Engine: &engine, }, } req2 := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd2, Description: &description, OwnerID: "testuser", Engine: &engine, }, } run1, err := 
es.CreateDefinitionRunByDefinitionID(ctx, "A", &req1) if err != nil { t.Fatalf("Error creating run1: %s", err.Error()) } run2, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req2) if err != nil { t.Fatalf("Error creating run2: %s", err.Error()) } // Verify both have non-nil command_hash if run1.CommandHash == nil { t.Errorf("Expected run1 to have non-nil command_hash") } if run2.CommandHash == nil { t.Errorf("Expected run2 to have non-nil command_hash") } // Verify hashes are different (critical for ARA fix) if run1.CommandHash != nil && run2.CommandHash != nil { if *run1.CommandHash == *run2.CommandHash { t.Errorf("Different commands should have different hashes even with same description. "+ "Both got hash '%s'. This was the ARA bug!", *run1.CommandHash) } } // Verify they match expected hashes expectedHash1 := fmt.Sprintf("%x", md5.Sum([]byte(cmd1))) expectedHash2 := fmt.Sprintf("%x", md5.Sum([]byte(cmd2))) if run1.CommandHash != nil && *run1.CommandHash != expectedHash1 { t.Errorf("run1 command_hash mismatch: expected '%s', got '%s'", expectedHash1, *run1.CommandHash) } if run2.CommandHash != nil && *run2.CommandHash != expectedHash2 { t.Errorf("run2 command_hash mismatch: expected '%s', got '%s'", expectedHash2, *run2.CommandHash) } } func TestExecutionService_CommandHashNullWhenCommandNull(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that NULL command results in NULL command_hash // (This is a malformed job, but should not crash) engine := state.DefaultEngine desc := "A description without a command" req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: nil, // NULL command Description: &desc, OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } // Command should be set from definition's command (if any) // But if definition also has no command, command_hash should be NULL if run.Command == nil || len(*run.Command) == 0 { // Command is NULL/empty, so command_hash should also be NULL if run.CommandHash != nil { t.Errorf("Expected NULL command_hash when command is NULL, got '%s'", *run.CommandHash) } } // Even if command gets set from definition, command_hash should NOT be from description if run.CommandHash != nil { descHash := fmt.Sprintf("%x", md5.Sum([]byte(desc))) if *run.CommandHash == descHash { t.Errorf("command_hash should NOT be MD5 of description (that was the bug!)") } } } func TestExecutionService_CommandHashMatchesCommand(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test with various command strings to ensure consistent hashing testCases := []struct { name string command string }{ {"Simple command", "echo hello"}, {"Command with args", "python train.py --epochs 10 --lr 0.001"}, {"Multi-line command", "set -e\necho 'Starting'\npython script.py\necho 'Done'"}, {"Command with special chars", "grep -r 'pattern' /path/to/files | sort | uniq"}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { engine := state.DefaultEngine cmd := tc.command req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(tc.command))) if run.CommandHash == nil { t.Errorf("Expected non-nil 
command_hash for command: %s", tc.command) } else if *run.CommandHash != expectedHash { t.Errorf("command_hash mismatch for '%s': expected '%s', got '%s'", tc.command, expectedHash, *run.CommandHash) } }) } } func TestExecutionService_CommandHashStableAcrossRuns(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Verify same command always produces same hash (consistency check) cmd := "python train.py --model resnet50" engine := state.DefaultEngine req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, OwnerID: "testuser", Engine: &engine, }, } // Create multiple runs with same command run1, err1 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) run2, err2 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) run3, err3 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err1 != nil || err2 != nil || err3 != nil { t.Fatalf("Error creating runs") } // All should have same command_hash if run1.CommandHash == nil || run2.CommandHash == nil || run3.CommandHash == nil { t.Errorf("All runs should have non-nil command_hash") } if *run1.CommandHash != *run2.CommandHash || *run1.CommandHash != *run3.CommandHash { t.Errorf("Same command should always produce same hash. Got: '%s', '%s', '%s'", *run1.CommandHash, *run2.CommandHash, *run3.CommandHash) } // Verify it matches expected expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(cmd))) if *run1.CommandHash != expectedHash { t.Errorf("Expected hash '%s', got '%s'", expectedHash, *run1.CommandHash) } } func TestExecutionService_CommandHashNotSetInEndpoints(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that even if description is provided, command_hash comes from command // This verifies the endpoints.go fix (removal of description-based hashing) cmd := "python app.py" desc := "This is a description" engine := state.DefaultEngine req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, Description: &desc, CommandHash: nil, // Explicitly NULL to verify it gets calculated OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } // Should be MD5 of command, not description cmdHash := fmt.Sprintf("%x", md5.Sum([]byte(cmd))) descHash := fmt.Sprintf("%x", md5.Sum([]byte(desc))) if run.CommandHash == nil { t.Errorf("Expected command_hash to be calculated") } else { if *run.CommandHash == descHash { t.Errorf("BUG: command_hash is MD5 of description! 
This should have been fixed.") } if *run.CommandHash != cmdHash { t.Errorf("Expected command_hash to be MD5 of command '%s', got '%s'", cmdHash, *run.CommandHash) } } } func TestExecutionService_CommandHashWithOverride(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that if API client explicitly provides a command_hash, it gets overwritten // by the correct hash calculated from the command cmd := "python script.py" wrongHash := "this_is_wrong_hash" engine := state.DefaultEngine req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, CommandHash: aws.String(wrongHash), // Wrong hash provided by client OwnerID: "testuser", Engine: &engine, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } // Should be overwritten with correct hash expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(cmd))) if run.CommandHash == nil { t.Errorf("Expected non-nil command_hash") } else if *run.CommandHash == wrongHash { t.Errorf("BUG: Wrong hash was not overwritten! Still has '%s'", wrongHash) } else if *run.CommandHash != expectedHash { t.Errorf("Expected command_hash '%s', got '%s'", expectedHash, *run.CommandHash) } } func TestExecutionService_SparkCommandHashFromDescription(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that Spark jobs with NULL command get command_hash from description // Spark jobs don't have a command field - they store config in spark_extension desc := "Vmi Po Recon Data Extract / Run Snapshots" engine := state.EKSSparkEngine entryPoint := "s3://bucket/script.py" req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: nil, // Spark jobs have NULL command Description: &desc, OwnerID: "testuser", Engine: &engine, SparkExtension: &state.SparkExtension{ SparkSubmitJobDriver: &state.SparkSubmitJobDriver{ EntryPoint: &entryPoint, }, }, }, } run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err != nil { t.Fatalf("Error creating run: %s", err.Error()) } // Should have command_hash from description (for Spark jobs) expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(desc))) if run.CommandHash == nil { t.Errorf("Expected non-nil command_hash for Spark job with description") } else if *run.CommandHash != expectedHash { t.Errorf("Expected Spark command_hash to be MD5 of description '%s', got '%s'", expectedHash, *run.CommandHash) } } func TestExecutionService_SparkCommandHashConsistent(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that Spark jobs with same description get same hash (critical for ARA) desc := "Vmi Po Recon Data Extract / Run Snapshots" engine := state.EKSSparkEngine entryPoint := "s3://bucket/script.py" req := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: nil, Description: &desc, OwnerID: "testuser", Engine: &engine, SparkExtension: &state.SparkExtension{ SparkSubmitJobDriver: &state.SparkSubmitJobDriver{ EntryPoint: &entryPoint, }, }, }, } // Create multiple Spark runs with same description run1, err1 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) run2, err2 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) run3, err3 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req) if err1 != nil || err2 != nil || err3 != nil { t.Fatalf("Error creating Spark runs") } // All should have same command_hash for ARA tracking if run1.CommandHash == nil || 
run2.CommandHash == nil || run3.CommandHash == nil { t.Errorf("All Spark runs should have non-nil command_hash") } if *run1.CommandHash != *run2.CommandHash || *run1.CommandHash != *run3.CommandHash { t.Errorf("Spark jobs with same description should always produce same hash. Got: '%s', '%s', '%s'", *run1.CommandHash, *run2.CommandHash, *run3.CommandHash) } // Verify it matches expected expectedHash := fmt.Sprintf("%x", md5.Sum([]byte(desc))) if *run1.CommandHash != expectedHash { t.Errorf("Expected Spark hash '%s', got '%s'", expectedHash, *run1.CommandHash) } } func TestExecutionService_SparkVsRegularEKSHashing(t *testing.T) { ctx := context.Background() es, _ := setUp(t) // Test that Spark and regular EKS jobs use different hashing strategies // This ensures no cross-contamination between Spark and regular jobs description := "Process data files" cmd := "python process.py" entryPoint := "s3://bucket/script.py" // Regular EKS job regularEngine := state.DefaultEngine regularReq := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: &cmd, Description: &description, OwnerID: "testuser", Engine: ®ularEngine, }, } // Spark job sparkEngine := state.EKSSparkEngine sparkReq := state.DefinitionExecutionRequest{ ExecutionRequestCommon: &state.ExecutionRequestCommon{ Command: nil, // Spark has no command Description: &description, OwnerID: "testuser", Engine: &sparkEngine, SparkExtension: &state.SparkExtension{ SparkSubmitJobDriver: &state.SparkSubmitJobDriver{ EntryPoint: &entryPoint, }, }, }, } regularRun, err1 := es.CreateDefinitionRunByDefinitionID(ctx, "A", ®ularReq) sparkRun, err2 := es.CreateDefinitionRunByDefinitionID(ctx, "A", &sparkReq) if err1 != nil || err2 != nil { t.Fatalf("Error creating runs") } // Verify both have command_hash if regularRun.CommandHash == nil { t.Errorf("Regular EKS job should have command_hash") } if sparkRun.CommandHash == nil { t.Errorf("Spark job should have command_hash") } // Verify they use different hash sources cmdHash := fmt.Sprintf("%x", md5.Sum([]byte(cmd))) descHash := fmt.Sprintf("%x", md5.Sum([]byte(description))) if regularRun.CommandHash != nil && *regularRun.CommandHash != cmdHash { t.Errorf("Regular EKS job should hash from command, expected '%s', got '%s'", cmdHash, *regularRun.CommandHash) } if sparkRun.CommandHash != nil && *sparkRun.CommandHash != descHash { t.Errorf("Spark job should hash from description, expected '%s', got '%s'", descHash, *sparkRun.CommandHash) } // Most importantly: they should have DIFFERENT hashes (no cross-contamination) if regularRun.CommandHash != nil && sparkRun.CommandHash != nil { if *regularRun.CommandHash == *sparkRun.CommandHash { t.Errorf("Regular EKS and Spark jobs should have different hashes to prevent ARA cross-contamination. 
Both got '%s'", *regularRun.CommandHash)
		}
	}
}

func TestExecutionService_SparkNullDescriptionNullHash(t *testing.T) {
	ctx := context.Background()
	es, _ := setUp(t)

	// Test that Spark jobs with NULL command AND NULL description get a NULL
	// hash. (This is a malformed job, but it should not crash.)
	engine := state.EKSSparkEngine
	entryPoint := "s3://bucket/script.py"
	req := state.DefinitionExecutionRequest{
		ExecutionRequestCommon: &state.ExecutionRequestCommon{
			Command:     nil, // Spark has no command
			Description: nil, // Also no description (malformed)
			OwnerID:     "testuser",
			Engine:      &engine,
			SparkExtension: &state.SparkExtension{
				SparkSubmitJobDriver: &state.SparkSubmitJobDriver{
					EntryPoint: &entryPoint,
				},
			},
		},
	}

	run, err := es.CreateDefinitionRunByDefinitionID(ctx, "A", &req)
	if err != nil {
		t.Fatalf("Error creating run: %s", err.Error())
	}

	// Should have NULL command_hash (malformed job)
	if run.CommandHash != nil {
		t.Errorf("Expected NULL command_hash for Spark job with NULL description, got '%s'", *run.CommandHash)
	}
}
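The tests above pin down the command-hash contract: regular EKS runs hash the literal command with MD5, Spark runs (which carry no command) fall back to hashing the description so repeated runs of the same job share a stable hash for ARA lookups, and a Spark run with neither keeps a NULL hash. A minimal restatement of that rule (illustrative only; the helper name and shape are invented, the real logic lives in the execution service, and crypto/md5 and fmt are assumed to be imported):

// commandHashFor is a hypothetical helper restating the rule the tests assert.
func commandHashFor(engine string, command, description *string) *string {
	// Regular runs: hash the literal command.
	if command != nil {
		h := fmt.Sprintf("%x", md5.Sum([]byte(*command)))
		return &h
	}
	// Spark runs have no command; hash the description instead so repeated
	// runs of the same job share a stable hash for ARA.
	if engine == "eks-spark" && description != nil { // "eks-spark" == state.EKSSparkEngine
		h := fmt.Sprintf("%x", md5.Sum([]byte(*description)))
		return &h
	}
	// No command and no description (malformed job): leave the hash NULL.
	return nil
}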
================================================ FILE: services/logs.go ================================================

package services

import (
	"context"
	"net/http"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/stitchfix/flotilla-os/clients/logs"
	"github.com/stitchfix/flotilla-os/state"
)

type LogService interface {
	Logs(runID string, lastSeen *string, role *string, facility *string) (string, *string, error)
	LogsText(runID string, w http.ResponseWriter) error
}

type logService struct {
	sm state.Manager
	lc logs.Client
}

// NewLogService initializes a LogService.
func NewLogService(sm state.Manager, lc logs.Client) (LogService, error) {
	return &logService{sm: sm, lc: lc}, nil
}

// Logs returns logs associated with a runID.
func (ls *logService) Logs(runID string, lastSeen *string, role *string, facility *string) (string, *string, error) {
	run, err := ls.sm.GetRun(context.Background(), runID)
	if err != nil {
		return "", nil, err
	}
	if run.Status != state.StatusRunning && run.Status != state.StatusStopped {
		// Won't have logs yet
		return "", aws.String(""), nil
	}
	if run.ExecutableType == nil {
		defaultExecutableType := state.ExecutableTypeDefinition
		run.ExecutableType = &defaultExecutableType
	}
	if run.ExecutableID == nil {
		run.ExecutableID = &run.DefinitionID
	}
	executable, err := ls.sm.GetExecutableByTypeAndID(context.Background(), *run.ExecutableType, *run.ExecutableID)
	if err != nil {
		return "", nil, err
	}
	return ls.lc.Logs(executable, run, lastSeen, role, facility)
}

// LogsText writes all logs associated with a runID as text (supported only for S3 logs).
func (ls *logService) LogsText(runID string, w http.ResponseWriter) error {
	run, err := ls.sm.GetRun(context.Background(), runID)
	if err != nil {
		return err
	}
	if run.Status != state.StatusRunning && run.Status != state.StatusStopped {
		// Won't have logs yet
		return nil
	}
	if run.ExecutableType == nil {
		defaultExecutableType := state.ExecutableTypeDefinition
		run.ExecutableType = &defaultExecutableType
	}
	if run.ExecutableID == nil {
		run.ExecutableID = &run.DefinitionID
	}
	executable, err := ls.sm.GetExecutableByTypeAndID(context.Background(), *run.ExecutableType, *run.ExecutableID)
	if err != nil {
		return err
	}
	return ls.lc.LogsText(executable, run, w)
}

================================================ FILE: services/logs_test.go ================================================

package services

import (
	"testing"

	"github.com/stitchfix/flotilla-os/state"
	"github.com/stitchfix/flotilla-os/testutils"
)

func setUpLogServiceTest(t *testing.T) (LogService, *testutils.ImplementsAllTheThings) {
	imp := testutils.ImplementsAllTheThings{
		T: t,
		Definitions: map[string]state.Definition{
			"B": {DefinitionID: "{}"},
		},
		Runs: map[string]state.Run{
			"isQueued": {DefinitionID: "q", RunID: "isQueued", Status: state.StatusQueued},
			"running":  {DefinitionID: "B", RunID: "running", Status: state.StatusRunning},
		},
	}
	ls, _ := NewLogService(&imp, &imp)
	return ls, &imp
}

func TestLogService_Logs(t *testing.T) {
	ls, imp := setUpLogServiceTest(t)

	//
	// Check that we don't try to get logs for runs that won't have them yet
	//
	expectedCalls := map[string]bool{
		"GetRun": true,
	}
	_, _, err := ls.Logs("isQueued", nil, nil, nil)
	if err != nil {
		t.Error(err.Error())
	}
	if len(imp.Calls) != len(expectedCalls) {
		t.Errorf("Expected exactly %v calls for log query for queued run but was: %v", len(expectedCalls), len(imp.Calls))
	}
	for _, call := range imp.Calls {
		if _, ok := expectedCalls[call]; !ok {
			t.Errorf("Unexpected call during log query for queued run: %s", call)
		}
	}

	//
	// Check that we do get logs for runs that should have them
	//
	ls, imp = setUpLogServiceTest(t)
	expectedCalls = map[string]bool{
		"GetRun":                   true,
		"GetDefinition":            true,
		"Logs":                     true,
		"GetExecutableByTypeAndID": true,
	}
	_, _, err = ls.Logs("running", nil, nil, nil)
	if err != nil {
		t.Error(err.Error())
	}
	if len(imp.Calls) != len(expectedCalls) {
		t.Errorf("Expected exactly %v calls for log query for running run but was: %v", len(expectedCalls), len(imp.Calls))
	}
	for _, call := range imp.Calls {
		if _, ok := expectedCalls[call]; !ok {
			t.Errorf("Unexpected call during log query for running run: %s", call)
		}
	}
}
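The lastSeen value returned by Logs acts as a cursor: feeding the previous call's token back in returns only output produced since then. A hypothetical polling loop (wiring invented for illustration; assumes fmt and time are imported, and a real caller would also stop once the run reaches a terminal status):

// tailRunLogs is an illustrative sketch, not part of the repository.
func tailRunLogs(ls LogService, runID string) error {
	var lastSeen *string
	for i := 0; i < 30; i++ { // bounded for the sketch; a real caller polls until the run stops
		chunk, next, err := ls.Logs(runID, lastSeen, nil, nil)
		if err != nil {
			return err
		}
		if chunk != "" {
			fmt.Print(chunk)
		}
		lastSeen = next // thread the cursor through so each call returns only new output
		time.Sleep(2 * time.Second)
	}
	return nil
}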
================================================ FILE: services/template.go ================================================

package services

import (
	"context"
	"reflect"
	"strings"

	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/exceptions"
	"github.com/stitchfix/flotilla-os/state"
)

// TemplateService defines an interface for operations involving templates.
type TemplateService interface {
	GetByID(ctx context.Context, id string) (state.Template, error)
	GetLatestByName(ctx context.Context, templateName string) (bool, state.Template, error)
	List(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error)
	ListLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error)
	Create(ctx context.Context, tpl *state.CreateTemplateRequest) (state.CreateTemplateResponse, error)
}

type templateService struct {
	sm state.Manager
}

// NewTemplateService configures and returns a TemplateService.
func NewTemplateService(conf config.Config, sm state.Manager) (TemplateService, error) {
	ts := templateService{sm: sm}
	return &ts, nil
}

// Create fully initializes and saves a new template.
func (ts *templateService) Create(ctx context.Context, req *state.CreateTemplateRequest) (state.CreateTemplateResponse, error) {
	res := state.CreateTemplateResponse{
		DidCreate: false,
		Template:  state.Template{},
	}

	curr, err := ts.constructTemplateFromCreateTemplateRequest(req)
	if err != nil {
		return res, err
	}

	// 1. Check validity.
	if valid, reasons := curr.IsValid(); !valid {
		return res, exceptions.MalformedInput{ErrorString: strings.Join(reasons, "\n")}
	}

	// 2. Attach template id.
	templateID, err := state.NewTemplateID(curr)
	if err != nil {
		return res, err
	}
	curr.TemplateID = templateID

	// 3. Check if the template name exists. If it does NOT, insert it into
	// the DB with a version number of 1. If it does, and any fields have
	// changed, create a new row in the DB with the version incremented by 1.
	// If NO fields have changed, just return the latest version.
	doesExist, prev, err := ts.sm.GetLatestTemplateByTemplateName(ctx, curr.TemplateName)
	if err != nil {
		return res, err
	}

	// No previous template with the same name; write it.
	if !doesExist {
		curr.Version = 1
		res.Template = curr
		res.DidCreate = true
		return res, ts.sm.CreateTemplate(ctx, curr)
	}

	// If prev and curr differ, write curr to the DB with the version number
	// incremented by 1. Otherwise, return prev.
	if ts.diff(prev, curr) {
		curr.Version = prev.Version + 1
		res.Template = curr
		res.DidCreate = true
		return res, ts.sm.CreateTemplate(ctx, curr)
	}

	res.Template = prev
	return res, nil
}

// GetByID returns the template specified by id.
func (ts *templateService) GetByID(ctx context.Context, id string) (state.Template, error) {
	return ts.sm.GetTemplateByID(ctx, id)
}

// GetLatestByName returns the latest version of the template with the given name.
func (ts *templateService) GetLatestByName(ctx context.Context, templateName string) (bool, state.Template, error) {
	return ts.sm.GetLatestTemplateByTemplateName(ctx, templateName)
}

// List lists all template versions.
func (ts *templateService) List(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) {
	return ts.sm.ListTemplates(ctx, limit, offset, sortBy, order)
}

// ListLatestOnly lists only the latest version of each template.
func (ts *templateService) ListLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) {
	return ts.sm.ListTemplatesLatestOnly(ctx, limit, offset, sortBy, order)
}
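// exampleTemplateVersioning is an illustrative sketch, not part of the
// original file: Create only writes a new row when something actually
// changed, so re-submitting an identical request is a no-op and any field
// change bumps the version. The template name and values are invented.
func exampleTemplateVersioning(ctx context.Context, ts TemplateService) {
	req := &state.CreateTemplateRequest{
		TemplateName:        "etl-job",
		CommandTemplate:     "python run.py --date {{.date}}",
		Schema:              state.TemplateJSONSchema{"type": "object"},
		ExecutableResources: state.ExecutableResources{Image: "etl:latest"},
	}
	first, _ := ts.Create(ctx, req)  // DidCreate == true, Version == 1
	second, _ := ts.Create(ctx, req) // DidCreate == false, Version still 1
	req.Image = "etl:v2"             // any changed field triggers a new version
	third, _ := ts.Create(ctx, req)  // DidCreate == true, Version == 2
	_, _, _ = first, second, third
}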
// diff performs a field-by-field comparison of two templates; Version is
// excluded, since it is exactly what gets bumped when the fields differ.
func (ts *templateService) diff(prev state.Template, curr state.Template) bool {
	if prev.TemplateName != curr.TemplateName {
		return true
	}
	if prev.CommandTemplate != curr.CommandTemplate {
		return true
	}
	if prev.Image != curr.Image {
		return true
	}
	// Pointer-valued resource fields are compared nil-safely, treating nil as
	// "unset".
	if int64PtrDiff(prev.Memory, curr.Memory) {
		return true
	}
	if int64PtrDiff(prev.Gpu, curr.Gpu) {
		return true
	}
	if int64PtrDiff(prev.Cpu, curr.Cpu) {
		return true
	}
	if (prev.Env == nil) != (curr.Env == nil) {
		return true
	}
	if prev.Env != nil && curr.Env != nil {
		prevEnv := *prev.Env
		currEnv := *curr.Env
		if len(prevEnv) != len(currEnv) {
			return true
		}
		for i, e := range prevEnv {
			if e != currEnv[i] {
				return true
			}
		}
	}
	if boolPtrDiff(prev.AdaptiveResourceAllocation, curr.AdaptiveResourceAllocation) {
		return true
	}
	if !reflect.DeepEqual(prev.Defaults, curr.Defaults) {
		return true
	}
	if prev.AvatarURI != curr.AvatarURI {
		return true
	}
	if (prev.Ports == nil) != (curr.Ports == nil) {
		return true
	}
	if prev.Ports != nil && curr.Ports != nil {
		prevPorts := *prev.Ports
		currPorts := *curr.Ports
		if len(prevPorts) != len(currPorts) {
			return true
		}
		for i, e := range prevPorts {
			if e != currPorts[i] {
				return true
			}
		}
	}
	if (prev.Tags == nil) != (curr.Tags == nil) {
		return true
	}
	if prev.Tags != nil && curr.Tags != nil {
		prevTags := *prev.Tags
		currTags := *curr.Tags
		if len(prevTags) != len(currTags) {
			return true
		}
		for i, e := range prevTags {
			if e != currTags[i] {
				return true
			}
		}
	}
	if !reflect.DeepEqual(prev.Schema, curr.Schema) {
		return true
	}
	return false
}

// int64PtrDiff and boolPtrDiff report whether two optional fields differ,
// treating nil as "unset" rather than dereferencing it.
func int64PtrDiff(a, b *int64) bool {
	if a == nil || b == nil {
		return a != b
	}
	return *a != *b
}

func boolPtrDiff(a, b *bool) bool {
	if a == nil || b == nil {
		return a != b
	}
	return *a != *b
}

// constructTemplateFromCreateTemplateRequest takes a CreateTemplateRequest and
// dumps the requisite fields into a Template.
func (ts *templateService) constructTemplateFromCreateTemplateRequest(req *state.CreateTemplateRequest) (state.Template, error) {
	tpl := state.Template{}
	if len(req.TemplateName) > 0 {
		tpl.TemplateName = req.TemplateName
	}
	if req.Schema != nil {
		tpl.Schema = req.Schema
	}
	if len(req.CommandTemplate) > 0 {
		tpl.CommandTemplate = req.CommandTemplate
	}
	if len(req.Image) > 0 {
		tpl.Image = req.Image
	}
	if req.Memory != nil {
		tpl.Memory = req.Memory
	} else {
		tpl.Memory = &state.MinMem
	}
	if req.Gpu != nil {
		tpl.Gpu = req.Gpu
	}
	if req.Cpu != nil {
		tpl.Cpu = req.Cpu
	} else {
		tpl.Cpu = &state.MinCPU
	}
	if req.Env != nil {
		tpl.Env = req.Env
	}
	if req.AdaptiveResourceAllocation != nil {
		tpl.AdaptiveResourceAllocation = req.AdaptiveResourceAllocation
	} else {
		// Default to true; assigning through the nil pointer would panic.
		defaultARA := true
		tpl.AdaptiveResourceAllocation = &defaultARA
	}
	if req.Ports != nil {
		tpl.Ports = req.Ports
	}
	if req.Tags != nil {
		tpl.Tags = req.Tags
	}
	if req.Defaults != nil {
		tpl.Defaults = req.Defaults
	} else {
		tpl.Defaults = state.TemplatePayload{}
	}
	if len(req.AvatarURI) > 0 {
		tpl.AvatarURI = req.AvatarURI
	} else {
		tpl.AvatarURI = ""
	}
	return tpl, nil
}

================================================ FILE: services/worker.go ================================================

package services

import (
	"context"
	"fmt"

	"github.com/stitchfix/flotilla-os/config"
	"github.com/stitchfix/flotilla-os/exceptions"
	"github.com/stitchfix/flotilla-os/state"
)

//
// WorkerService defines an interface for operations involving workers
//
type WorkerService interface {
	List(ctx context.Context, engine string) (state.WorkersList, error)
	Get(ctx context.Context, workerType string, engine string) (state.Worker, error)
	Update(ctx context.Context, workerType string, updates state.Worker) (state.Worker, error)
	BatchUpdate(ctx context.Context, updates []state.Worker) (state.WorkersList, error)
}

type workerService struct {
	sm state.Manager
}

//
// NewWorkerService configures and returns a WorkerService
//
func NewWorkerService(conf config.Config, sm state.Manager) (WorkerService, error) {
	ws := workerService{sm: sm}
	return &ws, nil
}

func (ws *workerService) List(ctx context.Context, engine
string) (state.WorkersList, error) { return ws.sm.ListWorkers(ctx, engine) } func (ws *workerService) Get(ctx context.Context, workerType string, engine string) (state.Worker, error) { var w state.Worker if err := ws.validate(workerType); err != nil { return w, err } return ws.sm.GetWorker(ctx, workerType, engine) } func (ws *workerService) Update(ctx context.Context, workerType string, updates state.Worker) (state.Worker, error) { var w state.Worker if err := ws.validate(workerType); err != nil { return w, err } return ws.sm.UpdateWorker(ctx, workerType, updates) } func (ws *workerService) BatchUpdate(ctx context.Context, updates []state.Worker) (state.WorkersList, error) { var wl state.WorkersList for _, update := range updates { if err := ws.validate(update.WorkerType); err != nil { return wl, err } } return ws.sm.BatchUpdateWorkers(ctx, updates) } func (ws *workerService) validate(workerType string) error { if !state.IsValidWorkerType(workerType) { var validTypesList []string for validType := range state.WorkerTypes { validTypesList = append(validTypesList, validType) } return exceptions.MalformedInput{ ErrorString: fmt.Sprintf( "Worker type: [%s] is not a valid worker type; valid types: %s", workerType, validTypesList)} } return nil } ================================================ FILE: state/manager.go ================================================ package state import ( "context" "github.com/pkg/errors" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/log" ) // Manager interface for CRUD operations // on definitions and runs type Manager interface { Name() string Initialize(conf config.Config) error Cleanup() error ListDefinitions( ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (DefinitionList, error) GetDefinition(ctx context.Context, definitionID string) (Definition, error) GetDefinitionByAlias(ctx context.Context, alias string) (Definition, error) UpdateDefinition(ctx context.Context, definitionID string, updates Definition) (Definition, error) CreateDefinition(ctx context.Context, d Definition) error DeleteDefinition(ctx context.Context, definitionID string) error ListRuns(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string, engines []string) (RunList, error) EstimateRunResources(ctx context.Context, executableID string, commandHash string) (TaskResources, error) EstimateExecutorCount(ctx context.Context, executableID string, commandHash string) (int64, error) ExecutorOOM(ctx context.Context, executableID string, commandHash string) (bool, error) DriverOOM(ctx context.Context, executableID string, commandHash string) (bool, error) GetRun(ctx context.Context, runID string) (Run, error) CreateRun(ctx context.Context, r Run) error UpdateRun(ctx context.Context, runID string, updates Run) (Run, error) ListGroups(ctx context.Context, limit int, offset int, name *string) (GroupsList, error) ListTags(ctx context.Context, limit int, offset int, name *string) (TagsList, error) ListWorkers(ctx context.Context, engine string) (WorkersList, error) BatchUpdateWorkers(ctx context.Context, updates []Worker) (WorkersList, error) GetWorker(ctx context.Context, workerType string, engine string) (Worker, error) UpdateWorker(ctx context.Context, workerType string, updates Worker) (Worker, error) GetExecutableByTypeAndID(ctx context.Context, executableType ExecutableType, executableID string) (Executable, 
error) GetTemplateByID(ctx context.Context, templateID string) (Template, error) GetLatestTemplateByTemplateName(ctx context.Context, templateName string) (bool, Template, error) GetTemplateByVersion(ctx context.Context, templateName string, templateVersion int64) (bool, Template, error) ListTemplates(ctx context.Context, limit int, offset int, sortBy string, order string) (TemplateList, error) ListTemplatesLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (TemplateList, error) CreateTemplate(ctx context.Context, t Template) error ListFailingNodes(ctx context.Context) (NodeList, error) GetPodReAttemptRate(ctx context.Context) (float32, error) GetNodeLifecycle(ctx context.Context, executableID string, commandHash string) (string, error) GetTaskHistoricalRuntime(ctx context.Context, executableID string, runId string) (float32, error) CheckIdempotenceKey(ctx context.Context, idempotenceKey string) (string, error) GetRunByEMRJobId(ctx context.Context, emrJobId string) (Run, error) GetResources(ctx context.Context, runID string) (Run, error) ListClusterStates(ctx context.Context) ([]ClusterMetadata, error) UpdateClusterMetadata(ctx context.Context, cluster ClusterMetadata) error DeleteClusterMetadata(ctx context.Context, clusterID string) error GetClusterByID(ctx context.Context, clusterID string) (ClusterMetadata, error) GetRunStatus(ctx context.Context, runID string) (RunStatus, error) } // NewStateManager sets up and configures a new statemanager // - if no `state_manager` is configured, will use postgres func NewStateManager(conf config.Config, logger log.Logger) (Manager, error) { name := "postgres" if conf.IsSet("state_manager") { name = conf.GetString("state_manager") } switch name { case "postgres": pgm := &SQLStateManager{log: logger} err := pgm.Initialize(conf) if err != nil { return nil, errors.Wrap(err, "problem initializing SQLStateManager") } return pgm, nil default: return nil, errors.Errorf("state.Manager named [%s] not found", name) } } ================================================ FILE: state/models.go ================================================ package state import ( "bytes" "database/sql" "encoding/json" "fmt" "os" "reflect" "regexp" "sort" "strconv" "strings" "text/template" "time" "github.com/Masterminds/sprig" "github.com/aws/aws-sdk-go/aws" uuid "github.com/nu7hatch/gouuid" "github.com/pkg/errors" "github.com/xeipuuv/gojsonschema" ) var EKSEngine = "eks" var EKSSparkEngine = "eks-spark" var DefaultEngine = EKSEngine var DefaultTaskType = "task" var MinCPU = int64(256) var MaxCPU = int64(60000) var MaxGPUCPU = int64(94000) var MinMem = int64(512) // var MaxMem = int64(248000) var MaxMem = int64(350000) // increasing to 350 GB for #incident-616 var MaxGPUMem = int64(376000) var MaxEphemeralStorage = int64(5000) var TTLSecondsAfterFinished = int32(3600) var SpotActiveDeadlineSeconds = int64(172800) var OndemandActiveDeadlineSeconds = int64(604800) var SpotLifecycle = "spot" var OndemandLifecycle = "ondemand" var DefaultLifecycle = SpotLifecycle var NodeLifeCycles = []string{OndemandLifecycle, SpotLifecycle} var Engines = []string{EKSEngine, EKSSparkEngine} // StatusRunning indicates the run is running var StatusRunning = "RUNNING" // StatusQueued indicates the run is queued var StatusQueued = "QUEUED" // StatusNeedsRetry indicates the run failed for infra reasons and needs retried var StatusNeedsRetry = "NEEDS_RETRY" // StatusPending indicates the run has been allocated to a host and is in the process of launching var StatusPending 
= "PENDING" // StatusStopped means the run is finished var StatusStopped = "STOPPED" var MaxLogLines = int64(256) var EKSBackoffLimit = int32(0) var GPUNodeTypes = []string{"p3.2xlarge", "p3.8xlarge", "p3.16xlarge", "g5.xlarge", "g5.2xlarge", "g5.4xlarge", "g5.8xlarge", "g5.12xlarge", "g5.16xlarge", "g5.24xlarge", "g5.48xlarge"} var WorkerTypes = map[string]bool{ "retry": true, "submit": true, "status": true, } func IsValidWorkerType(workerType string) bool { return WorkerTypes[workerType] } // IsValidStatus checks that the given status // string is one of the valid statuses func IsValidStatus(status string) bool { return status == StatusRunning || status == StatusQueued || status == StatusNeedsRetry || status == StatusPending || status == StatusStopped } // NewRunID returns a new uuid for a Run func NewRunID(engine *string) (string, error) { s, err := newUUIDv4() return fmt.Sprintf("%s-%s", *engine, s[len(*engine)+1:]), err } // NewDefinitionID returns a new uuid for a Definition func NewDefinitionID(definition Definition) (string, error) { uuid4, err := newUUIDv4() if err != nil { return "", err } return fmt.Sprintf("%s-%s", definition.GroupName, uuid4), nil } func newUUIDv4() (string, error) { u, err := uuid.NewV4() if err != nil { return "", err } return u.String(), nil } // EnvList wraps a list of EnvVar // - abstraction to make it easier to read // and write to db type EnvList []EnvVar // PortsList wraps a list of int // - abstraction to make it easier to read // and write to db type PortsList []int // EnvVar represents a single environment variable // for either a definition or a run type EnvVar struct { Name string `json:"name"` Value string `json:"value"` } type NodeList []string // Tags wraps a list of strings // - abstraction to make it easier to read // and write to db type Tags []string // ExecutableResources define the resources and flags required to run an // executable. type ExecutableResources struct { Image string `json:"image"` Memory *int64 `json:"memory,omitempty"` Gpu *int64 `json:"gpu,omitempty"` Cpu *int64 `json:"cpu,omitempty"` EphemeralStorage *int64 `json:"ephemeral_storage,omitempty" db:"ephemeral_storage"` Env *EnvList `json:"env"` AdaptiveResourceAllocation *bool `json:"adaptive_resource_allocation,omitempty"` Ports *PortsList `json:"ports,omitempty"` Tags *Tags `json:"tags,omitempty"` } type ExecutableType string const ( ExecutableTypeDefinition ExecutableType = "task_definition" ExecutableTypeTemplate ExecutableType = "template" ) type Executable interface { GetExecutableID() *string GetExecutableType() *ExecutableType GetExecutableResources() *ExecutableResources GetExecutableCommand(req ExecutionRequest) (string, error) GetExecutableResourceName() string // This will typically be an ARN. 
} func UnmarshalSparkExtension(data []byte) (SparkExtension, error) { var r SparkExtension err := json.Unmarshal(data, &r) return r, err } func (r *SparkExtension) Marshal() ([]byte, error) { return json.Marshal(r) } type SparkExtension struct { SparkSubmitJobDriver *SparkSubmitJobDriver `json:"spark_submit_job_driver,omitempty"` ApplicationConf []Conf `json:"application_conf,omitempty"` HiveConf []Conf `json:"hive_conf,omitempty"` EMRJobId *string `json:"emr_job_id,omitempty"` SparkAppId *string `json:"spark_app_id,omitempty"` EMRJobManifest *string `json:"emr_job_manifest,omitempty"` HistoryUri *string `json:"history_uri,omitempty"` MetricsUri *string `json:"metrics_uri,omitempty"` VirtualClusterId *string `json:"virtual_cluster_id,omitempty"` EMRReleaseLabel *string `json:"emr_release_label,omitempty"` ExecutorInitCommand *string `json:"executor_init_command,omitempty"` DriverInitCommand *string `json:"driver_init_command,omitempty"` SparkServerURI *string `json:"spark_server_uri,omitempty"` AppUri *string `json:"app_uri,omitempty"` Executors []string `json:"executors,omitempty"` ExecutorOOM *bool `json:"executor_oom,omitempty"` DriverOOM *bool `json:"driver_oom,omitempty"` } type Conf struct { Name *string `json:"name,omitempty"` Value *string `json:"value,omitempty"` } type SparkSubmitJobDriver struct { EntryPoint *string `json:"entry_point,omitempty"` EntryPointArguments []*string `json:"entry_point_arguments,omitempty"` SparkSubmitConf []Conf `json:"spark_submit_conf,omitempty"` Files []string `json:"files,omitempty"` PyFiles []string `json:"py_files,omitempty"` Jars []string `json:"jars,omitempty"` Class *string `json:"class,omitempty"` WorkingDir *string `json:"working_dir,omitempty"` NumExecutors *int64 `json:"num_executors,omitempty"` ExecutorMemory *int64 `json:"executor_memory,omitempty"` } type Labels map[string]string // Common fields required to execute any Executable. type ExecutionRequestCommon struct { ClusterName string `json:"cluster_name"` Tier Tier `json:"tier"` Env *EnvList `json:"env"` OwnerID string `json:"owner_id"` Command *string `json:"command"` Memory *int64 `json:"memory"` Cpu *int64 `json:"cpu"` Gpu *int64 `json:"gpu"` Engine *string `json:"engine"` EphemeralStorage *int64 `json:"ephemeral_storage"` NodeLifecycle *string `json:"node_lifecycle"` ActiveDeadlineSeconds *int64 `json:"active_deadline_seconds,omitempty"` SparkExtension *SparkExtension `json:"spark_extension,omitempty"` Description *string `json:"description,omitempty"` CommandHash *string `json:"command_hash,omitempty"` IdempotenceKey *string `json:"idempotence_key,omitempty"` Arch *string `json:"arch,omitempty"` Labels *Labels `json:"labels,omitempty"` ServiceAccount *string `json:"service_account,omitempty"` } type ExecutionRequestCustom map[string]interface{} type ExecutionRequest interface { GetExecutionRequestCommon() *ExecutionRequestCommon GetExecutionRequestCustom() *ExecutionRequestCustom } type DefinitionExecutionRequest struct { *ExecutionRequestCommon } // Returns ExecutionRequestCommon, common between Template and Definition types func (d *DefinitionExecutionRequest) GetExecutionRequestCommon() *ExecutionRequestCommon { return d.ExecutionRequestCommon } // Only relevant to the template type func (d *DefinitionExecutionRequest) GetExecutionRequestCustom() *ExecutionRequestCustom { return nil } type TerminateJob struct { RunID string UserInfo UserInfo } // task definition. It implements the `Executable` interface. 
type Definition struct { DefinitionID string `json:"definition_id"` GroupName string `json:"group_name,omitempty"` Alias string `json:"alias"` Command string `json:"command,omitempty"` TaskType string `json:"task_type,omitempty"` RequiresDocker bool `json:"requires_docker,omitempty" db:"requires_docker"` TargetCluster string `json:"target_cluster,omitempty" db:"target_cluster"` ExecutableResources } // Return definition or template id func (d Definition) GetExecutableID() *string { return &d.DefinitionID } // Returns definition or template func (d Definition) GetExecutableType() *ExecutableType { t := ExecutableTypeDefinition return &t } func (d Definition) GetExecutableResources() *ExecutableResources { return &d.ExecutableResources } func (d Definition) GetExecutableCommand(req ExecutionRequest) (string, error) { return d.Command, nil } func (d Definition) GetExecutableResourceName() string { return d.DefinitionID } var commandWrapper = ` set -e set -x {{.Command}} ` var CommandTemplate, _ = template.New("command").Parse(commandWrapper) // WrappedCommand returns the wrapped command for the definition // * wrapping ensures lines are logged and exit code is set func (d *Definition) WrappedCommand() (string, error) { var result bytes.Buffer if err := CommandTemplate.Execute(&result, d); err != nil { return "", err } return result.String(), nil } type validationCondition struct { condition bool reason string } // IsValid returns true only if this is a valid definition with all // required information func (d *Definition) IsValid() (bool, []string) { conditions := []validationCondition{ {len(d.Image) == 0, "string [image] must be specified"}, {len(d.Alias) == 0, "string [alias] must be specified"}, } valid := true var reasons []string for _, cond := range conditions { if cond.condition { valid = false reasons = append(reasons, cond.reason) } } return valid, reasons } // UpdateWith updates this definition with information from another func (d *Definition) UpdateWith(other Definition) { if len(other.DefinitionID) > 0 { d.DefinitionID = other.DefinitionID } if len(other.Image) > 0 { d.Image = other.Image } if len(other.GroupName) > 0 { d.GroupName = other.GroupName } if len(other.Alias) > 0 { d.Alias = other.Alias } if other.Memory != nil { d.Memory = other.Memory } if other.Gpu != nil { d.Gpu = other.Gpu } if other.Cpu != nil { d.Cpu = other.Cpu } if other.EphemeralStorage != nil { d.EphemeralStorage = other.EphemeralStorage } if other.AdaptiveResourceAllocation != nil { d.AdaptiveResourceAllocation = other.AdaptiveResourceAllocation } if len(other.Command) > 0 { d.Command = other.Command } if len(other.TaskType) > 0 { d.TaskType = other.TaskType } if other.Env != nil { d.Env = other.Env } if other.Ports != nil { d.Ports = other.Ports } if other.Tags != nil { d.Tags = other.Tags } } func (d Definition) MarshalJSON() ([]byte, error) { type Alias Definition env := d.Env if env == nil { env = &EnvList{} } return json.Marshal(&struct { Env *EnvList `json:"env"` Alias }{ Env: env, Alias: (Alias)(d), }) } // DefinitionList wraps a list of Definitions type DefinitionList struct { Total int `json:"total"` Definitions []Definition `json:"definitions"` } func (dl *DefinitionList) MarshalJSON() ([]byte, error) { type Alias DefinitionList l := dl.Definitions if l == nil { l = []Definition{} } return json.Marshal(&struct { Definitions []Definition `json:"definitions"` *Alias }{ Definitions: l, Alias: (*Alias)(dl), }) } // Run represents a single run of a Definition // // TODO: // // Runs need to -copy- the 
run relevant information // from their associated definition when they are // created so they always have correct info. Currently // the definition can change during or after the run // is created and launched meaning the run is acting // on information that is no longer accessible. type Run struct { RunID string `json:"run_id"` DefinitionID string `json:"definition_id"` Alias string `json:"alias"` Image string `json:"image"` ClusterName string `json:"cluster"` ExitCode *int64 `json:"exit_code,omitempty"` Status string `json:"status"` QueuedAt *time.Time `json:"queued_at,omitempty"` StartedAt *time.Time `json:"started_at,omitempty"` FinishedAt *time.Time `json:"finished_at,omitempty"` InstanceID string `json:"-"` InstanceDNSName string `json:"-"` GroupName string `json:"group_name"` User string `json:"user,omitempty"` TaskType string `json:"task_type,omitempty"` Env *EnvList `json:"env,omitempty"` Command *string `json:"command,omitempty"` CommandHash *string `json:"command_hash,omitempty"` Memory *int64 `json:"memory,omitempty"` MemoryLimit *int64 `json:"memory_limit,omitempty"` Cpu *int64 `json:"cpu,omitempty"` CpuLimit *int64 `json:"cpu_limit,omitempty"` Gpu *int64 `json:"gpu,omitempty"` ExitReason *string `json:"exit_reason,omitempty"` Engine *string `json:"engine,omitempty"` NodeLifecycle *string `json:"node_lifecycle,omitempty"` EphemeralStorage *int64 `json:"ephemeral_storage,omitempty" db:"ephemeral_storage"` PodName *string `json:"pod_name,omitempty"` Namespace *string `json:"namespace,omitempty"` MaxMemoryUsed *int64 `json:"max_memory_used,omitempty"` MaxCpuUsed *int64 `json:"max_cpu_used,omitempty"` PodEvents *PodEvents `json:"pod_events,omitempty"` CloudTrailNotifications *CloudTrailNotifications `json:"cloudtrail_notifications,omitempty"` ExecutableID *string `json:"executable_id,omitempty"` ExecutableType *ExecutableType `json:"executable_type,omitempty"` ExecutionRequestCustom *ExecutionRequestCustom `json:"execution_request_custom,omitempty"` AttemptCount *int64 `json:"attempt_count,omitempty"` SpawnedRuns *SpawnedRuns `json:"spawned_runs,omitempty"` RunExceptions *RunExceptions `json:"run_exceptions,omitempty"` ActiveDeadlineSeconds *int64 `json:"active_deadline_seconds,omitempty"` SparkExtension *SparkExtension `json:"spark_extension,omitempty"` MetricsUri *string `json:"metrics_uri,omitempty"` Description *string `json:"description,omitempty"` IdempotenceKey *string `json:"idempotence_key,omitempty"` Arch *string `json:"arch,omitempty"` Labels Labels `json:"labels,omitempty"` RequiresDocker bool `json:"requires_docker,omitempty" db:"requires_docker"` ServiceAccount *string `json:"service_account,omitempty" db:"service_account"` Tier Tier `json:"tier,omitempty"` } // UpdateWith updates this run with information from another func (d *Run) UpdateWith(other Run) { if len(other.RunID) > 0 { d.RunID = other.RunID } if len(other.DefinitionID) > 0 { d.DefinitionID = other.DefinitionID } if other.Tier != "" { d.Tier = other.Tier } if len(other.Alias) > 0 { d.Alias = other.Alias } if len(other.Image) > 0 { d.Image = other.Image } if len(other.ClusterName) > 0 { d.ClusterName = other.ClusterName } if other.ExitCode != nil { d.ExitCode = other.ExitCode } if other.QueuedAt != nil { d.QueuedAt = other.QueuedAt } if other.StartedAt != nil { d.StartedAt = other.StartedAt } if other.FinishedAt != nil { d.FinishedAt = other.FinishedAt } if len(other.InstanceID) > 0 { d.InstanceID = other.InstanceID } if len(other.InstanceDNSName) > 0 { d.InstanceDNSName = other.InstanceDNSName } if 
len(other.GroupName) > 0 { d.GroupName = other.GroupName } if len(other.User) > 0 { d.User = other.User } if len(other.TaskType) > 0 { d.TaskType = other.TaskType } if other.Env != nil { d.Env = other.Env } if other.ExitReason != nil { d.ExitReason = other.ExitReason } if other.Command != nil && len(*other.Command) > 0 { d.Command = other.Command } if other.CommandHash != nil && len(*other.CommandHash) > 0 { d.CommandHash = other.CommandHash } if other.Memory != nil { d.Memory = other.Memory } if other.Cpu != nil { d.Cpu = other.Cpu } if other.Gpu != nil { d.Gpu = other.Gpu } if other.MaxMemoryUsed != nil { d.MaxMemoryUsed = other.MaxMemoryUsed } if other.MaxCpuUsed != nil { d.MaxCpuUsed = other.MaxCpuUsed } if other.Engine != nil { d.Engine = other.Engine } if other.EphemeralStorage != nil { d.EphemeralStorage = other.EphemeralStorage } if other.NodeLifecycle != nil { d.NodeLifecycle = other.NodeLifecycle } if other.PodName != nil { d.PodName = other.PodName } if other.Namespace != nil { d.Namespace = other.Namespace } if other.PodEvents != nil { d.PodEvents = other.PodEvents } if other.SpawnedRuns != nil { d.SpawnedRuns = other.SpawnedRuns } if other.RunExceptions != nil { d.RunExceptions = other.RunExceptions } if other.ExecutableID != nil { d.ExecutableID = other.ExecutableID } if other.ExecutableType != nil { d.ExecutableType = other.ExecutableType } if other.SparkExtension != nil { d.SparkExtension = other.SparkExtension } if other.CloudTrailNotifications != nil && len((*other.CloudTrailNotifications).Records) > 0 { d.CloudTrailNotifications = other.CloudTrailNotifications } if other.ExecutionRequestCustom != nil { d.ExecutionRequestCustom = other.ExecutionRequestCustom } if other.CpuLimit != nil { d.CpuLimit = other.CpuLimit } if other.MetricsUri != nil { d.MetricsUri = other.MetricsUri } if other.Description != nil { d.Description = other.Description } if other.IdempotenceKey != nil { d.IdempotenceKey = other.IdempotenceKey } if other.Arch != nil { d.Arch = other.Arch } if other.MemoryLimit != nil { d.MemoryLimit = other.MemoryLimit } if other.AttemptCount != nil { d.AttemptCount = other.AttemptCount } if other.Labels != nil { d.Labels = other.Labels } // // Runs have a deterministic lifecycle // // QUEUED --> PENDING --> RUNNING --> STOPPED // QUEUED --> PENDING --> NEEDS_RETRY --> QUEUED ... // QUEUED --> PENDING --> STOPPED ... 
	//
	statusPrecedence := map[string]int{
		StatusNeedsRetry: -1,
		StatusQueued:     0,
		StatusPending:    1,
		StatusRunning:    2,
		StatusStopped:    3,
	}

	if other.Status == StatusNeedsRetry {
		d.Status = StatusNeedsRetry
	} else {
		if runStatus, ok := statusPrecedence[d.Status]; ok {
			if newStatus, ok := statusPrecedence[other.Status]; ok {
				if newStatus > runStatus {
					d.Status = other.Status
				}
			}
		}
	}
}
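// exampleStatusPrecedence is an illustrative sketch, not part of the original
// file: it shows how the precedence map above behaves. A run's status never
// moves backwards through the lifecycle, but NEEDS_RETRY always wins so a
// failed run can be re-queued.
func exampleStatusPrecedence() {
	run := Run{Status: StatusRunning}
	run.UpdateWith(Run{Status: StatusPending})    // ignored: PENDING ranks below RUNNING
	run.UpdateWith(Run{Status: StatusStopped})    // applied: STOPPED outranks RUNNING
	run.UpdateWith(Run{Status: StatusNeedsRetry}) // always applied, regardless of rank
	fmt.Println(run.Status)                       // NEEDS_RETRY
}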
func removeDuplicateStr(strSlice []string) []string {
	allKeys := make(map[string]bool)
	var list []string
	for _, item := range strSlice {
		if _, value := allKeys[item]; !value {
			allKeys[item] = true
			list = append(list, item)
		}
	}
	return list
}

type byExecutorName []string

type RunStatus struct {
	RunID        string     `json:"run_id"`
	Status       string     `json:"status"`
	QueuedAt     *time.Time `json:"queued_at,omitempty"`
	StartedAt    *time.Time `json:"started_at,omitempty"`
	FinishedAt   *time.Time `json:"finished_at,omitempty"`
	ExitCode     *int64     `json:"exit_code,omitempty"`
	ExitReason   *string    `json:"exit_reason,omitempty"`
	Engine       *string    `json:"engine,omitempty"`
	DefinitionID string     `json:"definition_id"`
	Alias        string     `json:"alias"`
	ClusterName  string     `json:"cluster_name"`
}

func (s byExecutorName) Len() int {
	return len(s)
}

func (s byExecutorName) Key(i int) int {
	r, _ := regexp.Compile("-exec-(\\d+)")
	matches := r.FindStringSubmatch(s[i])
	if matches == nil || len(matches) < 2 {
		return 0
	}
	key, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0
	}
	return key
}

func (s byExecutorName) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

func (s byExecutorName) Less(i, j int) bool {
	return s.Key(i) < s.Key(j)
}

func (r Run) MarshalJSON() ([]byte, error) {
	type Alias Run
	instance := map[string]string{
		"instance_id": r.InstanceID,
		"dns_name":    r.InstanceDNSName,
	}
	podEvents := r.PodEvents
	if podEvents == nil {
		podEvents = &PodEvents{}
	}
	var executors []string
	for _, podEvent := range *podEvents {
		if strings.Contains(podEvent.SourceObject, "-exec-") {
			executors = append(executors, podEvent.SourceObject)
		}
	}
	// Guard the Engine and SparkExtension pointers before dereferencing them.
	if len(executors) > 0 && r.Engine != nil && *r.Engine != EKSEngine && r.SparkExtension != nil {
		executors = removeDuplicateStr(executors)
		sort.Sort(byExecutorName(executors))
		r.SparkExtension.Executors = executors
	}
	cloudTrailNotifications := r.CloudTrailNotifications
	if cloudTrailNotifications == nil {
		cloudTrailNotifications = &CloudTrailNotifications{}
	}
	executionRequestCustom := r.ExecutionRequestCustom
	if executionRequestCustom == nil {
		executionRequestCustom = &ExecutionRequestCustom{}
	}
	if r.Description == nil {
		r.Description = aws.String(r.Alias)
	}
	sparkExtension := r.SparkExtension
	if sparkExtension == nil {
		sparkExtension = &SparkExtension{}
	} else {
		// Mask Hive connection passwords in the marshaled output; take each
		// element by index so the overwrite sticks on the slice itself.
		for i := range sparkExtension.HiveConf {
			c := &sparkExtension.HiveConf[i]
			if c.Name != nil && strings.Contains(*c.Name, "ConnectionPassword") {
				c.Value = aws.String("****")
			}
		}
		if r.Status != StatusStopped && sparkExtension.AppUri != nil {
			sparkExtension.HistoryUri = sparkExtension.AppUri
		}
	}
	return json.Marshal(&struct {
		Instance                map[string]string        `json:"instance"`
		PodEvents               *PodEvents               `json:"pod_events"`
		CloudTrailNotifications *CloudTrailNotifications `json:"cloudtrail_notifications"`
		SparkExtension          *SparkExtension          `json:"spark_extension"`
		Alias
	}{
		Instance:                instance,
		PodEvents:               podEvents,
		CloudTrailNotifications: cloudTrailNotifications,
		SparkExtension:          sparkExtension,
		Alias:                   (Alias)(r),
	})
}

// RunList wraps a list of Runs
type RunList struct {
	Total int   `json:"total"`
	Runs  []Run `json:"history"`
}

type PodEvents []PodEvent

type PodEventList struct {
	Total     int       `json:"total"`
	PodEvents PodEvents `json:"pod_events"`
}

type SpawnedRun struct {
	RunID string `json:"run_id"`
}

type SpawnedRuns []SpawnedRun

type RunExceptions []string

// Equal treats two nil timestamps as equal; otherwise both timestamps must be
// present and matching.
func (w *PodEvent) Equal(other PodEvent) bool {
	sameTimestamp := (w.Timestamp == nil && other.Timestamp == nil) ||
		(w.Timestamp != nil && other.Timestamp != nil && w.Timestamp.Equal(*other.Timestamp))
	return w.Reason == other.Reason &&
		sameTimestamp &&
		w.SourceObject == other.SourceObject &&
		w.Message == other.Message &&
		w.EventType == other.EventType
}

type PodEvent struct {
	Timestamp    *time.Time `json:"timestamp,omitempty"`
	EventType    string     `json:"event_type"`
	Reason       string     `json:"reason"`
	SourceObject string     `json:"source_object"`
	Message      string     `json:"message"`
}

// GroupsList wraps a list of group names
type GroupsList struct {
	Groups []string
	Total  int
}

// TagsList wraps a list of tag names
type TagsList struct {
	Tags  []string
	Total int
}

// Worker represents a Flotilla Worker
type Worker struct {
	WorkerType       string `json:"worker_type"`
	CountPerInstance int    `json:"count_per_instance"`
	Engine           string `json:"engine"`
}

// UpdateWith updates this worker with information from another
func (w *Worker) UpdateWith(other Worker) {
	if other.CountPerInstance >= 0 {
		w.CountPerInstance = other.CountPerInstance
	}
}

// WorkersList wraps a list of Workers
type WorkersList struct {
	Total   int      `json:"total"`
	Workers []Worker `json:"workers"`
}

// UserInfo identifies the user making the API calls.
type UserInfo struct {
	Name  string `json:"name"`
	Email string `json:"email"`
}

// TaskResources is an internal object for tracking cpu / memory resources.
type TaskResources struct {
	Cpu    sql.NullInt64 `json:"cpu" db:"cpu"`
	Memory sql.NullInt64 `json:"memory" db:"memory"`
}

// CloudTrailS3File is the SQS notification object for CloudTrail S3 files.
type CloudTrailS3File struct {
	S3Bucket    string   `json:"s3Bucket"`
	S3ObjectKey []string `json:"s3ObjectKey"`
	Done        func() error
}

// Marshal method for CloudTrail SQS notifications.
func (e *CloudTrailNotifications) Marshal() ([]byte, error) {
	return json.Marshal(e)
}

// CloudTrailNotifications is the CloudTrail notification object that is
// persisted into the DB.
type CloudTrailNotifications struct {
	Records []Record `json:"Records"`
}

// Record is a single CloudTrail notification record.
type Record struct {
	UserIdentity UserIdentity `json:"userIdentity"`
	EventSource  string       `json:"eventSource"`
	EventName    string       `json:"eventName"`
}

// UserIdentity is the ARN of the user who performed the AWS api action.
type UserIdentity struct {
	Arn string `json:"arn"`
}

// Equal helper method for Record.
func (w *Record) Equal(other Record) bool {
	return w.EventName == other.EventName && w.EventSource == other.EventSource
}

// String helper method for Record.
func (w *Record) String() string {
	return fmt.Sprintf("%s-%s", w.EventSource, w.EventName)
}

const TemplatePayloadKey = "template_payload"

type TemplatePayload map[string]interface{}

type TemplateExecutionRequest struct {
	*ExecutionRequestCommon
	TemplatePayload TemplatePayload `json:"template_payload"`
	DryRun          bool            `json:"dry_run,omitempty"`
}

// GetExecutionRequestCommon returns the ExecutionRequestCommon associated with a Template type.
func (t TemplateExecutionRequest) GetExecutionRequestCommon() *ExecutionRequestCommon {
	return t.ExecutionRequestCommon
}

// GetExecutionRequestCustom returns the ExecutionRequestCustom associated with a Template type.
func (t TemplateExecutionRequest) GetExecutionRequestCustom() *ExecutionRequestCustom {
	return &ExecutionRequestCustom{
		TemplatePayloadKey: t.TemplatePayload,
	}
}

// TemplateJSONSchema holds the JSON Schema types used by templates.
type TemplateJSONSchema map[string]interface{}

// Template Object Type. The CommandTemplate is a Go Template type.
type Template struct {
	TemplateID      string             `json:"template_id"`
	TemplateName    string             `json:"template_name"`
	Version         int64              `json:"version"`
	Schema          TemplateJSONSchema `json:"schema"`
	CommandTemplate string             `json:"command_template"`
	Defaults        TemplatePayload    `json:"defaults"`
	AvatarURI       string             `json:"avatar_uri"`
	ExecutableResources
}

type CreateTemplateRequest struct {
	TemplateName    string             `json:"template_name"`
	Schema          TemplateJSONSchema `json:"schema"`
	CommandTemplate string             `json:"command_template"`
	Defaults        TemplatePayload    `json:"defaults"`
	AvatarURI       string             `json:"avatar_uri"`
	ExecutableResources
}

type CreateTemplateResponse struct {
	DidCreate bool     `json:"did_create"`
	Template  Template `json:"template,omitempty"`
}

// GetExecutableID returns the Template ID.
func (t Template) GetExecutableID() *string {
	return &t.TemplateID
}

// GetExecutableType returns the Template type.
func (t Template) GetExecutableType() *ExecutableType {
	et := ExecutableTypeTemplate
	return &et
}

// GetExecutableResources returns the default resources associated with the Template.
func (t Template) GetExecutableResources() *ExecutableResources {
	return &t.ExecutableResources
}

// GetExecutableCommand renders the command for the Template.
func (t Template) GetExecutableCommand(req ExecutionRequest) (string, error) {
	var result bytes.Buffer

	// Get the request's custom fields.
	customFields := *req.GetExecutionRequestCustom()
	executionPayload, ok := customFields[TemplatePayloadKey]
	if !ok || executionPayload == nil {
		return "", nil
	}

	executionPayload, err := t.compositeUserAndDefaults(executionPayload)
	if err != nil {
		return "", err
	}

	// Perform JSON schema validation to ensure that the request's template
	// payload conforms to the template's JSON schema.
	schemaLoader := gojsonschema.NewGoLoader(t.Schema)
	documentLoader := gojsonschema.NewGoLoader(executionPayload)
	validationResult, err := gojsonschema.Validate(schemaLoader, documentLoader)
	if err != nil {
		return "", err
	}
	if validationResult != nil && !validationResult.Valid() {
		var res []string
		for _, resultError := range validationResult.Errors() {
			res = append(res, resultError.String())
		}
		return "", errors.New(strings.Join(res, "\n"))
	}

	// Create a new template string based on the template.Template.
	textTemplate, err := template.New("command").Funcs(sprig.TxtFuncMap()).Parse(t.CommandTemplate)
	if err != nil {
		return "", err
	}

	// Dump payload into the template string.
	if err = textTemplate.Execute(&result, executionPayload); err != nil {
		return "", err
	}
	return result.String(), nil
}

// GetExecutableResourceName returns the Template ID.
func (t Template) GetExecutableResourceName() string {
	return t.TemplateID
}

func (t Template) compositeUserAndDefaults(userPayload interface{}) (TemplatePayload, error) {
	var (
		final map[string]interface{}
		ok    bool
	)
	final, ok = userPayload.(TemplatePayload)
	if !ok {
		return final, errors.New("unable to cast request payload to TemplatePayload struct")
	}
	if err := MergeMaps(&final, t.Defaults); err != nil {
		return final, err
	}
	return final, nil
}

// NewTemplateID returns a new uuid for a Template.
func NewTemplateID(t Template) (string, error) {
	uuid4, err := newUUIDv4()
	if err != nil {
		return "", err
	}
	return fmt.Sprintf("tpl-%s", uuid4[4:]), nil
}
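// exampleRenderTemplate is an illustrative sketch, not part of the original
// file: the user payload and the template defaults are merged (user values
// win), validated against the JSON schema, and then rendered through the
// text/template CommandTemplate. The template values here are invented.
func exampleRenderTemplate() (string, error) {
	tpl := Template{
		TemplateName:    "greeter",
		CommandTemplate: "echo {{.greeting}} {{.name}}",
		Defaults:        TemplatePayload{"greeting": "hello"},
		Schema: TemplateJSONSchema{
			"type":     "object",
			"required": []interface{}{"name"},
		},
	}
	req := TemplateExecutionRequest{
		TemplatePayload: TemplatePayload{"name": "flotilla"},
	}
	// Renders "echo hello flotilla": "name" comes from the request and
	// "greeting" is filled in from the template defaults.
	return tpl.GetExecutableCommand(req)
}

// Checks validity of a template.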
func (t *Template) IsValid() (bool, []string) { conditions := []validationCondition{ {len(t.TemplateName) == 0, "string [template_name] must be specified"}, {len(t.Schema) == 0, "schema must be specified"}, {len(t.CommandTemplate) == 0, "string [command_template] must be specified"}, {len(t.Image) == 0, "string [image] must be specified"}, {t.Memory == nil, "int [memory] must be specified"}, } valid := true var reasons []string for _, cond := range conditions { if cond.condition { valid = false reasons = append(reasons, cond.reason) } } return valid, reasons } // TemplateList wraps a list of Templates type TemplateList struct { Total int `json:"total"` Templates []Template `json:"templates"` } // Template Marshal method. func (tl *TemplateList) MarshalJSON() ([]byte, error) { type Alias TemplateList l := tl.Templates if l == nil { l = []Template{} } return json.Marshal(&struct { Templates []Template `json:"templates"` *Alias }{ Templates: l, Alias: (*Alias)(tl), }) } func (r *KubernetesEvent) Marshal() ([]byte, error) { return json.Marshal(r) } type KubernetesEvent struct { Metadata Metadata `json:"metadata,omitempty"` Reason string `json:"reason,omitempty"` Message string `json:"message,omitempty"` Source Source `json:"source,omitempty"` FirstTimestamp string `json:"firstTimestamp,omitempty"` LastTimestamp string `json:"lastTimestamp,omitempty"` Count int64 `json:"count,omitempty"` Type string `json:"type,omitempty"` EventTime interface{} `json:"eventTime,omitempty"` ReportingComponent string `json:"reportingComponent,omitempty"` ReportingInstance string `json:"reportingInstance,omitempty"` InvolvedObject InvolvedObject `json:"involvedObject,omitempty"` Done func() error } type InvolvedObject struct { Kind string `json:"kind,omitempty"` Namespace string `json:"namespace,omitempty"` Name string `json:"name,omitempty"` Uid string `json:"uid,omitempty"` APIVersion string `json:"apiVersion,omitempty"` ResourceVersion string `json:"resourceVersion,omitempty"` FieldPath string `json:"fieldPath,omitempty"` Labels EventLabels `json:"labels,omitempty"` } type EventLabels struct { ControllerUid string `json:"controller-uid,omitempty"` JobName string `json:"job-name,omitempty"` ClusterName string `json:"cluster-name,omitempty"` } type Metadata struct { Name string `json:"name,omitempty"` Namespace string `json:"namespace,omitempty"` SelfLink string `json:"selfLink,omitempty"` Uid string `json:"uid,omitempty"` ResourceVersion string `json:"resourceVersion,omitempty"` CreationTimestamp string `json:"creationTimestamp,omitempty"` } type Source struct { Component string `json:"component,omitempty"` Host string `json:"host,omitempty"` } func UnmarshalEmrEvents(data []byte) (EmrEvent, error) { var r EmrEvent err := json.Unmarshal(data, &r) return r, err } func (r *EmrEvent) Marshal() ([]byte, error) { return json.Marshal(r) } type EmrEvent struct { Version *string `json:"version,omitempty"` ID *string `json:"id,omitempty"` DetailType *string `json:"detail-type,omitempty"` Source *string `json:"source,omitempty"` Account *string `json:"account,omitempty"` Time *string `json:"time,omitempty"` Region *string `json:"region,omitempty"` Resources []interface{} `json:"resources,omitempty"` Detail *Detail `json:"detail,omitempty"` Done func() error } type Detail struct { Severity *string `json:"severity,omitempty"` Name *string `json:"name,omitempty"` ID *string `json:"id,omitempty"` Arn *string `json:"arn,omitempty"` VirtualClusterID *string `json:"virtualClusterId,omitempty"` State *string 
`json:"state,omitempty"`
	CreatedBy        *string `json:"createdBy,omitempty"`
	ReleaseLabel     *string `json:"releaseLabel,omitempty"`
	ExecutionRoleArn *string `json:"executionRoleArn,omitempty"`
	FailureReason    *string `json:"failureReason,omitempty"`
	StateDetails     *string `json:"stateDetails,omitempty"`
	Message          *string `json:"message,omitempty"`
}

type LaunchRequest struct {
	ClusterName *string  `json:"cluster,omitempty"`
	Env         *EnvList `json:"env,omitempty"`
	Tier        Tier     `json:"tier"`
}

type LaunchRequestV2 struct {
	Tier                  Tier            `json:"tier"`
	RunTags               RunTags         `json:"run_tags"`
	Command               *string         `json:"command,omitempty"`
	Memory                *int64          `json:"memory,omitempty"`
	Cpu                   *int64          `json:"cpu,omitempty"`
	Gpu                   *int64          `json:"gpu,omitempty"`
	EphemeralStorage      *int64          `json:"ephemeral_storage,omitempty"`
	Engine                *string         `json:"engine,omitempty"`
	NodeLifecycle         *string         `json:"node_lifecycle,omitempty"`
	ActiveDeadlineSeconds *int64          `json:"active_deadline_seconds,omitempty"`
	SparkExtension        *SparkExtension `json:"spark_extension,omitempty"`
	ClusterName           *string         `json:"cluster,omitempty"`
	Env                   *EnvList        `json:"env,omitempty"`
	Description           *string         `json:"description,omitempty"`
	CommandHash           *string         `json:"command_hash,omitempty"`
	IdempotenceKey        *string         `json:"idempotence_key,omitempty"`
	Arch                  *string         `json:"arch,omitempty"`
	Labels                *Labels         `json:"labels,omitempty"`
	ServiceAccount        *string         `json:"service_account,omitempty"`
}

// RunTags represents which user is responsible for a task run
type RunTags struct {
	OwnerEmail string `json:"owner_email"`
	TeamName   string `json:"team_name"`
	OwnerID    string `json:"owner_id"`
}

type ClusterStatus string
type Tier string
type Tiers []string
type Capability string
type Capabilities []string

const (
	StatusActive      ClusterStatus = "active"
	StatusMaintenance ClusterStatus = "maintenance"
	StatusOffline     ClusterStatus = "offline"
)

type ClusterMetadata struct {
	ID                string        `json:"id" db:"id"`
	Name              string        `json:"name" db:"name"`
	ClusterVersion    string        `json:"cluster_version" db:"cluster_version"`
	Status            ClusterStatus `json:"status" db:"status"`
	StatusReason      string        `json:"status_reason" db:"status_reason"`
	StatusSince       time.Time     `json:"status_since" db:"status_since"`
	AllowedTiers      Tiers         `json:"allowed_tiers" db:"allowed_tiers"`
	Capabilities      Capabilities  `json:"capabilities" db:"capabilities"`
	UpdatedAt         time.Time     `json:"updated_at" db:"updated_at"`
	Namespace         string        `json:"namespace" db:"namespace"`
	Region            string        `json:"region" db:"region"`
	EMRVirtualCluster string        `json:"emr_virtual_cluster" db:"emr_virtual_cluster"`
	SparkServerURI    string        `json:"spark_server_uri" db:"spark_server_uri"`
}

// MergeMaps takes a pointer to a map (first arg) and a map containing default
// values (second arg) and recursively sets values that exist in `b` but are
// not set in `a`. For existing values, it does not override those of `a` with
// those of `b`.
func MergeMaps(a *map[string]interface{}, b map[string]interface{}) error {
	return mergeMapsRecursive(a, b)
}
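// exampleMergeMaps is an illustrative sketch, not part of the original file:
// defaults only fill gaps, they never overwrite values the caller already
// set, and the same rule applies recursively to nested maps.
func exampleMergeMaps() {
	payload := map[string]interface{}{"date": "2024-01-01"}
	defaults := map[string]interface{}{"date": "1970-01-01", "retries": 3}
	_ = MergeMaps(&payload, defaults)
	fmt.Println(payload) // map[date:2024-01-01 retries:3]
}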
func mergeMapsRecursive(a *map[string]interface{}, b map[string]interface{}) error {
	for k, v := range b {
		// Skip explicit nulls: there is nothing to merge, and
		// reflect.TypeOf(nil) has no Kind to inspect.
		if v == nil {
			continue
		}
		// If the value is a map, check recursively.
		if reflect.TypeOf(v).Kind() == reflect.Map {
			if _, ok := (*a)[k]; !ok {
				(*a)[k] = v
			} else {
				aVal, aOK := (*a)[k].(map[string]interface{})
				bVal, bOK := v.(map[string]interface{})
				if !aOK || !bOK {
					return errors.New("unable to cast interface{} to map[string]interface{}")
				}
				if err := mergeMapsRecursive(&aVal, bVal); err != nil {
					return err
				}
			}
		} else {
			if _, ok := (*a)[k]; !ok {
				(*a)[k] = v
			}
		}
	}
	return nil
}

func GetLabels(run Run) map[string]string {
	var labels = make(map[string]string)
	if run.ClusterName != "" {
		labels["cluster-name"] = run.ClusterName
	}
	if run.RunID != "" {
		labels["flotilla-run-id"] = SanitizeLabel(run.RunID)
		labels["flotilla-run-mode"] = SanitizeLabel(os.Getenv("FLOTILLA_MODE"))
	}
	if run.User != "" {
		labels["owner"] = SanitizeLabel(run.User)
	}
	if run.Tier != "" {
		labels["tier"] = SanitizeLabel(string(run.Tier))
	}
	if _, workflowExists := run.Labels["kube_workflow"]; !workflowExists {
		if _, taskNameExists := run.Labels["kube_task_name"]; taskNameExists {
			labels["kube_workflow"] = SanitizeLabel(run.Labels["kube_task_name"])
		}
	}
	for k, v := range run.Labels {
		labels[k] = SanitizeLabel(v)
	}
	return labels
}

func SanitizeLabel(key string) string {
	key = strings.TrimSpace(key)
	key = regexp.MustCompile(`[^-a-z0-9A-Z_.]+`).ReplaceAllString(key, "_")
	key = strings.TrimPrefix(key, "_")
	key = strings.ToLower(key)
	if len(key) > 63 {
		key = key[:63]
	}
	for {
		tempKey := strings.TrimSuffix(key, "_")
		if tempKey == key {
			break
		}
		key = tempKey
	}
	return key
}

================================================ FILE: state/models_test.go ================================================

package state

import (
	"os"
	"reflect"
	"strings"
	"testing"
)

func TestMergeMaps_Simple(t *testing.T) {
	mapA := map[string]interface{}{
		"A": "aaa",
		"B": "bbb",
		"C": "ccc",
	}
	mapB := map[string]interface{}{
		"B": "xxx",
		"D": "ddd",
	}
	expectedMapA := map[string]interface{}{
		"A": "aaa",
		"B": "bbb",
		"C": "ccc",
		"D": "ddd",
	}
	err := MergeMaps(&mapA, mapB)
	if err != nil {
		t.Error("unable to merge maps")
	}
	if !reflect.DeepEqual(mapA, expectedMapA) {
		t.Error("map merge unsuccessful")
	}
}

func TestMergeMaps_Nested(t *testing.T) {
	nestedAValue := "aaa"
	nestedCValue := "ccc"
	overrideNestedBVal := "zzzzzz"
	nestedD1Value := "d1"
	overrideNestedD1Value := "override_d1"
	overrideNestedD2Value := "override_d2"
	mapA := map[string]interface{}{
		"Nested": map[string]interface{}{
			"A": nestedAValue,
			"C": nestedCValue,
			"D": map[string]interface{}{
				"D1": nestedD1Value,
			},
		},
	}
	mapB := map[string]interface{}{
		"Nested": map[string]interface{}{
			"B": overrideNestedBVal,
			"D": map[string]interface{}{
				"D1": overrideNestedD1Value,
				"D2": overrideNestedD2Value,
			},
		},
	}
	// After merging, mapA should have its `B` value set. Additionally, mapA[D]
	// should have its D2 value set BUT its D1 value should not be overridden.
expectedMapA := map[string]interface{}{ "Nested": map[string]interface{}{ "A": nestedAValue, "B": overrideNestedBVal, "C": nestedCValue, "D": map[string]interface{}{ "D1": nestedD1Value, "D2": overrideNestedD2Value, }, }, } err := MergeMaps(&mapA, mapB) if err != nil { t.Error("unable to merge maps") } if reflect.DeepEqual(mapA, expectedMapA) == false { t.Error("map merge unsuccessful") } } func TestSanitizeLabel(t *testing.T) { tests := []struct { name string input string expected string }{ { name: "should truncate", input: strings.Repeat("a", 64), expected: strings.Repeat("a", 63), }, { name: "leaves lowercase alone", input: "lowercasealphanumeric11", expected: "lowercasealphanumeric11", }, { name: "lowercases stuff", input: "UPPERCASEALPHANUMERIC11", expected: "uppercasealphanumeric11", }, { name: "replaces special chars", input: "a*s", expected: "a_s", }, { name: "trims spaces", input: " foo ", expected: "foo", }, { name: "removes leading _'s", input: "_a", expected: "a", }, { name: "removes trailing _'s", input: "a_", expected: "a", }, { name: "removes repeated trailing _'s", input: "a_____", expected: "a", }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { result := SanitizeLabel(test.input) if result != test.expected { t.Errorf("expected %s, got %s", test.expected, result) } }) } } func TestGetLabels(t *testing.T) { type args struct { run Run } var tests []struct { name string args args want map[string]string } os.Setenv("FLOTILLA_MODE", "test") tests = []struct { name string args args want map[string]string }{ { name: "should return labels for run with definition", args: args{ run: Run{ DefinitionID: "A", ClusterName: "A", GroupName: "groupA", RunID: "runA", User: "userA", Tier: "tierA", Labels: map[string]string{ "kube_foo": "bar", "team": "awesomeness", "kube_task_name": "foo", }, }, }, want: map[string]string{ "cluster-name": "A", "flotilla-run-id": "runa", "kube_workflow": "foo", "kube_foo": "bar", "kube_task_name": "foo", "team": "awesomeness", "tier": "tiera", "owner": "usera", "flotilla-run-mode": "test", }, }, { name: "should return empty labels for run with no definition", args: args{ run: Run{}, }, want: map[string]string{}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := GetLabels(tt.args.run); !reflect.DeepEqual(got, tt.want) { t.Errorf("GetLabels() = %v, want %v", got, tt.want) } }) } } ================================================ FILE: state/pg_queries.go ================================================ package state // DefinitionSelect postgres specific query for definitions const DefinitionSelect = ` select td.definition_id as definitionid, td.adaptive_resource_allocation as adaptiveresourceallocation, td.image as image, td.group_name as groupname, td.alias as alias, td.memory as memory, coalesce(td.command, '') as command, coalesce(td.task_type, '') as tasktype, env::TEXT as env, td.cpu as cpu, td.gpu as gpu, td.ephemeral_storage as ephemeral_storage, coalesce(td.requires_docker, false) as requires_docker, coalesce(td.target_cluster, '') as target_cluster, array_to_json('{""}'::TEXT[])::TEXT as tags, array_to_json('{}'::INT[])::TEXT as ports from (select * from task_def) td ` // ListDefinitionsSQL postgres specific query for listing definitions const ListDefinitionsSQL = DefinitionSelect + "\n%s %s limit $1 offset $2" // ListClusterStatesSQL postgres query for listing cluster status const ( ListClusterStatesSQL = ` SELECT id, name, cluster_version, status, status_reason, status_since, capabilities, 
allowed_tiers, region, updated_at, namespace, emr_virtual_cluster, spark_server_uri FROM cluster_state ORDER BY name ASC` ) // GetDefinitionSQL postgres specific query for getting a single definition const GetDefinitionSQL = DefinitionSelect + "\nwhere definition_id = $1" // GetDefinitionByAliasSQL get definition by alias const GetDefinitionByAliasSQL = DefinitionSelect + "\nwhere alias = $1" const TaskResourcesSelectCommandSQL = ` SELECT cast((percentile_disc(0.99) within GROUP (ORDER BY A.max_memory_used)) * 1.75 as int) as memory, cast((percentile_disc(0.99) within GROUP (ORDER BY A.max_cpu_used)) * 1.25 as int) as cpu FROM (SELECT memory as max_memory_used, cpu as max_cpu_used FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '3 days' AND (exit_code = 137 or exit_reason = 'OOMKilled') AND engine = 'eks' AND definition_id = $1 AND command_hash = $2 LIMIT 30) A ` const TaskResourcesExecutorCountSQL = ` SELECT least(coalesce(cast((percentile_disc(0.99) within GROUP (ORDER BY A.executor_count)) as int), 25), 100) as executor_count FROM (SELECT CASE WHEN (exit_reason like '%Exception%') THEN (spark_extension -> 'spark_submit_job_driver' -> 'num_executors')::int * 1.75 ELSE (spark_extension -> 'spark_submit_job_driver' -> 'num_executors')::int * 1 END as executor_count FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '24 hours' AND engine = 'eks-spark' AND definition_id = $1 AND command_hash = $2 LIMIT 30) A ` const TaskResourcesDriverOOMSQL = ` SELECT (spark_extension -> 'driver_oom')::boolean AS driver_oom FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' AND engine = 'eks-spark' AND definition_id = $1 AND command_hash = $2 AND exit_code = 137 AND spark_extension ? 'driver_oom' GROUP BY 1 ` const TaskIdempotenceKeyCheckSQL = ` WITH runs as ( SELECT run_id FROM task WHERE idempotence_key = $1 and (exit_code = 0 or exit_code is null) and queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days') SELECT run_id FROM runs LIMIT 1; ` const TaskResourcesExecutorOOMSQL = ` SELECT CASE WHEN A.c >= 1 THEN true::boolean ELSE false::boolean END FROM (SELECT count(*) as c FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' AND definition_id = $1 AND command_hash = $2 AND engine = 'eks-spark' AND exit_code !=0 LIMIT 30) A ` const TaskResourcesExecutorNodeLifecycleSQL = ` SELECT CASE WHEN A.c >= 1 THEN 'ondemand' ELSE 'spot' END FROM (SELECT count(*) as c FROM TASK WHERE queued_at >= CURRENT_TIMESTAMP - INTERVAL '12 hour' AND definition_id = $1 AND command_hash = $2 AND exit_code !=0 LIMIT 30) A ` const TaskExecutionRuntimeCommandSQL = ` SELECT percentile_disc(0.95) within GROUP (ORDER BY A.minutes) as minutes FROM (SELECT EXTRACT(epoch from finished_at - started_at) / 60 as minutes FROM TASK WHERE definition_id = $1 AND exit_code = 0 AND engine = 'eks' AND queued_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' AND command_hash = (SELECT command_hash FROM task WHERE run_id = $2) LIMIT 30) A ` const ListFailingNodesSQL = ` SELECT instance_dns_name FROM ( SELECT instance_dns_name, count(*) as c FROM TASK WHERE (exit_code = 128 OR pod_events @> '[{"reason": "Failed"}]' OR pod_events @> '[{"reason": "FailedSync"}]' OR pod_events @> '[{"reason": "FailedCreatePodSandBox"}]' OR pod_events @> '[{"reason": "OutOfmemory"}]') AND engine = 'eks' AND queued_at >= NOW() - INTERVAL '1 HOURS' AND instance_dns_name like 'ip-%' GROUP BY 1 order by 2 desc) AS all_nodes WHERE c >= 5 ` const PodReAttemptRate = ` SELECT (multiple_attempts / (CASE WHEN single_attempts = 0 THEN 1 ELSE 
single_attempts END)) AS attempts FROM ( SELECT COUNT(CASE WHEN attempt_count <= 1 THEN 1 END) * 1.0 AS single_attempts, COUNT(CASE WHEN attempt_count > 1 THEN 1 END) * 1.0 AS multiple_attempts FROM task WHERE engine = 'eks' AND queued_at >= NOW() - INTERVAL '18 MINUTES' AND node_lifecycle = 'spot') A ` // RunSelect postgres specific query for runs const RunSelect = ` select t.run_id as runid, coalesce(t.definition_id, '') as definitionid, coalesce(t.alias, '') as alias, coalesce(t.image, '') as image, coalesce(t.cluster_name, '') as clustername, t.exit_code as exitcode, t.exit_reason as exitreason, coalesce(t.status, '') as status, queued_at as queuedat, started_at as startedat, finished_at as finishedat, coalesce(t.instance_id, '') as instanceid, coalesce(t.instance_dns_name, '') as instancednsname, coalesce(t.group_name, '') as groupname, coalesce(t.task_type, '') as tasktype, env::TEXT as env, command, memory, cpu, gpu, engine, ephemeral_storage as ephemeral_storage, node_lifecycle as nodelifecycle, pod_name as podname, namespace, max_cpu_used as maxcpuused, max_memory_used as maxmemoryused, pod_events::TEXT as podevents, command_hash as commandhash, cloudtrail_notifications::TEXT as cloudtrailnotifications, coalesce(executable_id, '') as executableid, coalesce(executable_type, '') as executabletype, execution_request_custom::TEXT as executionrequestcustom, cpu_limit as cpulimit, memory_limit as memorylimit, attempt_count as attemptcount, spawned_runs::TEXT as spawnedruns, run_exceptions::TEXT as runexceptions, active_deadline_seconds as activedeadlineseconds, spark_extension::TEXT as sparkextension, metrics_uri as metricsuri, description as description, idempotence_key as idempotencekey, coalesce("user", '') as user, coalesce(arch, '') as arch, labels::TEXT as labels, coalesce(requires_docker,false) as requires_docker, service_account as service_account, coalesce(tier::text, 'Tier4') as tier from task t ` const GetRunStatusSQL = ` SELECT run_id, definition_id, alias, cluster_name, status, queued_at, started_at, finished_at, exit_code, exit_reason, engine FROM task WHERE run_id = $1 ` // ListRunsSQL postgres specific query for listing runs const ListRunsSQL = RunSelect + "\n%s %s limit $1 offset $2" // GetRunSQL postgres specific query for getting a single run const GetRunSQL = RunSelect + "\nwhere run_id = $1" const GetRunSQLByEMRJobId = RunSelect + "\nwhere spark_extension->>'emr_job_id' = $1" // GetRunSQLForUpdate postgres specific query for getting a single run // for update const GetRunSQLForUpdate = GetRunSQL + " for update" // GroupsSelect postgres specific query for getting existing definition // group_names const GroupsSelect = ` select distinct group_name from task_def ` // TagsSelect postgres specific query for getting existing definition tags const TagsSelect = ` select distinct text from tags ` // ListGroupsSQL postgres specific query for listing definition group_names const ListGroupsSQL = GroupsSelect + "\n%s order by group_name asc limit $1 offset $2" // ListTagsSQL postgres specific query for listing definition tags const ListTagsSQL = TagsSelect + "\n%s order by text asc limit $1 offset $2" // WorkerSelect postgres specific query for workers const WorkerSelect = ` select worker_type as workertype, count_per_instance as countperinstance, engine from worker ` // ListWorkersSQL postgres specific query for listing workers const ListWorkersSQL = WorkerSelect const GetWorkerEngine = WorkerSelect + "\nwhere engine = $1" // GetWorkerSQL postgres specific query for retrieving 
data for a specific // worker type. const GetWorkerSQL = WorkerSelect + "\nwhere worker_type = $1 and engine = $2" // GetWorkerSQLForUpdate postgres specific query for retrieving data for a specific // worker type; locks the row. const GetWorkerSQLForUpdate = GetWorkerSQL + " for update" // TemplateSelect selects a template const TemplateSelect = ` SELECT template_id as templateid, template_name as templatename, version, schema, command_template as commandtemplate, adaptive_resource_allocation as adaptiveresourceallocation, image, memory, env::TEXT as env, privileged, cpu, gpu, defaults, coalesce(avatar_uri, '') as avataruri FROM template ` // ListTemplatesSQL postgres specific query for listing templates const ListTemplatesSQL = TemplateSelect + "\n%s limit $1 offset $2" // GetTemplateByIDSQL postgres specific query for getting a single template const GetTemplateByIDSQL = TemplateSelect + "\nwhere template_id = $1" // ListTemplatesLatestOnlySQL lists the latest version of each distinct // template name. const ListTemplatesLatestOnlySQL = ` SELECT DISTINCT ON (template_name) template_id as templateid, template_name as templatename, version, schema, command_template as commandtemplate, adaptive_resource_allocation as adaptiveresourceallocation, image, memory, env::TEXT as env, privileged, cpu, gpu, defaults, coalesce(avatar_uri, '') as avataruri FROM template ORDER BY template_name, version DESC, template_id LIMIT $1 OFFSET $2 ` // GetTemplateLatestOnlySQL get the latest version of a specific template name. const GetTemplateLatestOnlySQL = TemplateSelect + "\nWHERE template_name = $1 ORDER BY version DESC LIMIT 1;" const GetTemplateByVersionSQL = TemplateSelect + "\nWHERE template_name = $1 AND version = $2 ORDER BY version DESC LIMIT 1;" ================================================ FILE: state/pg_state_manager.go ================================================ package state import ( "context" "database/sql/driver" "encoding/json" "fmt" "time" "github.com/stitchfix/flotilla-os/clients/metrics" "github.com/stitchfix/flotilla-os/log" "github.com/stitchfix/flotilla-os/tracing" "github.com/jmoiron/sqlx" // Pull in postgres specific drivers "database/sql" "math" "strings" "github.com/lib/pq" _ "github.com/lib/pq" "github.com/pkg/errors" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/exceptions" "go.uber.org/multierr" sqltrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/database/sql" sqlxtrace "gopkg.in/DataDog/dd-trace-go.v1/contrib/jmoiron/sqlx" ) // SQLStateManager uses postgresql to manage state type SQLStateManager struct { db *sqlx.DB readonlyDB *sqlx.DB log log.Logger } func (sm *SQLStateManager) ListFailingNodes(ctx context.Context) (NodeList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_failing_nodes", "") defer span.Finish() var err error var nodeList NodeList err = sm.readonlyDB.SelectContext(ctx, &nodeList, ListFailingNodesSQL) if err != nil { if err == sql.ErrNoRows { return nodeList, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Error fetching node list")} } else { return nodeList, errors.Wrapf(err, "Error fetching node list") } } return nodeList, err } func (sm *SQLStateManager) GetPodReAttemptRate(ctx context.Context) (float32, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_pod_reattempt_rate", "") defer span.Finish() var err error attemptRate := float32(1.0) err = sm.readonlyDB.GetContext(ctx, &attemptRate, PodReAttemptRate) if err != nil { if err == sql.ErrNoRows { return attemptRate, 
exceptions.MissingResource{ ErrorString: fmt.Sprintf("Error fetching attempt rate")} } else { return attemptRate, errors.Wrapf(err, "Error fetching attempt rate") } } return attemptRate, err } func (sm *SQLStateManager) GetNodeLifecycle(ctx context.Context, executableID string, commandHash string) (string, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_node_lifecycle", "") defer span.Finish() //span.SetTag("command_hash", commandHash) var err error nodeType := "spot" err = sm.readonlyDB.GetContext(ctx, &nodeType, TaskResourcesExecutorNodeLifecycleSQL, executableID, commandHash) if err != nil { if err == sql.ErrNoRows { return nodeType, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Error fetching node type")} } else { return nodeType, errors.Wrapf(err, "Error fetching node type") } } return nodeType, err } func (sm *SQLStateManager) GetTaskHistoricalRuntime(ctx context.Context, executableID string, runID string) (float32, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_task_historical_runtime", "") defer span.Finish() span.SetTag("job.run_id", runID) var err error minutes := float32(1.0) err = sm.readonlyDB.GetContext(ctx, &minutes, TaskExecutionRuntimeCommandSQL, executableID, runID) if err != nil { if err == sql.ErrNoRows { return minutes, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Error fetching TaskRuntime rate")} } else { return minutes, errors.Wrapf(err, "Error fetching attempt rate") } } return minutes, err } func (sm *SQLStateManager) EstimateRunResources(ctx context.Context, executableID string, commandHash string) (TaskResources, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.estimate_run_resources", "") defer span.Finish() //span.SetTag("command_hash", commandHash) var err error var taskResources TaskResources err = sm.readonlyDB.GetContext(ctx, &taskResources, TaskResourcesSelectCommandSQL, executableID, commandHash) if err != nil { if err == sql.ErrNoRows { // No historical data found - this is expected for new jobs or jobs that haven't OOM'd if sm.log != nil { _ = sm.log.Log( "level", "info", "message", "ARA: No historical resource data found", "definition_id", executableID, "command_hash", commandHash, ) } return taskResources, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource usage with executable %s not found", executableID)} } else { // Check if this is a PostgreSQL recovery conflict (expected on read replicas) errMsg := err.Error() isRecoveryConflict := strings.Contains(errMsg, "conflict with recovery") || strings.Contains(errMsg, "canceling statement due to conflict") if isRecoveryConflict { // Recovery conflicts are expected on read replicas - treat as missing data // Log at info level since this is expected behavior, not an error if sm.log != nil { _ = sm.log.Log( "level", "info", "message", "ARA: Query canceled due to recovery conflict on read replica (using defaults)", "definition_id", executableID, "command_hash", commandHash, ) } return taskResources, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource usage with executable %s not found (recovery conflict)", executableID)} } // Unexpected error querying historical data if sm.log != nil { _ = sm.log.Log( "level", "error", "message", "ARA: Error querying historical resource data", "definition_id", executableID, "command_hash", commandHash, "error", err.Error(), ) } return taskResources, errors.Wrapf(err, "issue getting resources with executable [%s]", executableID) } } // Check if the query returned NULL values (can happen when 
percentile_disc has no valid data) if !taskResources.Memory.Valid || !taskResources.Cpu.Valid { // NULL values mean no valid historical data - treat as missing resource if sm.log != nil { _ = sm.log.Log( "level", "info", "message", "ARA: No historical resource data found (NULL values returned)", "definition_id", executableID, "command_hash", commandHash, ) } return taskResources, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource usage with executable %s not found (NULL values)", executableID)} } // Successfully found historical data - log the values being returned if sm.log != nil { _ = sm.log.Log( "level", "info", "message", "ARA: Historical resource data found", "definition_id", executableID, "command_hash", commandHash, "estimated_memory_mb", taskResources.Memory.Int64, "estimated_cpu_millicores", taskResources.Cpu.Int64, ) } return taskResources, err } func (sm *SQLStateManager) EstimateExecutorCount(ctx context.Context, executableID string, commandHash string) (int64, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.estimate_executor_count", "") defer span.Finish() //span.SetTag("command_hash", commandHash) var err error executorCount := int64(25) err = sm.readonlyDB.GetContext(ctx, &executorCount, TaskResourcesExecutorCountSQL, executableID, commandHash) if err != nil { if err == sql.ErrNoRows { return executorCount, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource usage with executable %s not found", executableID)} } else { return executorCount, errors.Wrapf(err, "issue getting resources with executable [%s]", executableID) } } return executorCount, err } func (sm *SQLStateManager) CheckIdempotenceKey(ctx context.Context, idempotenceKey string) (string, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.check_idempotence_key", "") defer span.Finish() var err error runId := "" err = sm.readonlyDB.GetContext(ctx, &runId, TaskIdempotenceKeyCheckSQL, idempotenceKey) if err != nil || len(runId) == 0 { err = errors.New("no run_id found for idempotence key") } return runId, err } func (sm *SQLStateManager) ExecutorOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.executor_oom", "") defer span.Finish() //span.SetTag("command_hash", commandHash) var err error executorOOM := false err = sm.readonlyDB.GetContext(ctx, &executorOOM, TaskResourcesExecutorOOMSQL, executableID, commandHash) if err != nil { if err == sql.ErrNoRows { return executorOOM, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource oom for executable %s not found", executableID)} } else { return executorOOM, errors.Wrapf(err, "issue getting resources with executable [%s]", executableID) } } return executorOOM, err } func (sm *SQLStateManager) DriverOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.driver_oom", "") defer span.Finish() //span.SetTag("command_hash", commandHash) var err error driverOOM := false err = sm.readonlyDB.GetContext(ctx, &driverOOM, TaskResourcesDriverOOMSQL, executableID, commandHash) if err != nil { if err == sql.ErrNoRows { return driverOOM, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Resource oom for driver %s not found", executableID)} } else { return driverOOM, errors.Wrapf(err, "issue getting resources with executable [%s]", executableID) } } return driverOOM, err } // Name is the name of the state manager - matches value in configuration func (sm *SQLStateManager) 
Name() string { return "postgres" } // likeFields are the set of fields // that are filtered using a `like` clause var likeFields = map[string]bool{ "image": true, "alias": true, "group_name": true, "command": true, "text": true, "exit_reason": true, } // Initialize creates tables if they do not exist func (sm *SQLStateManager) Initialize(conf config.Config) error { dburl := conf.GetString("database_url") readonlyDbUrl := conf.GetString("readonly_database_url") createSchema := conf.GetBool("create_database_schema") fmt.Printf("create_database_schema: %t\ncreating schema...\n", createSchema) sqltrace.Register("postgres", &pq.Driver{}, sqltrace.WithServiceName("flotilla")) var err error if sm.db, err = sqlxtrace.Open("postgres", dburl); err != nil { return errors.Wrap(err, "unable to open postgres db") } sqltrace.Register("postgres", &pq.Driver{}, sqltrace.WithServiceName("flotilla")) if sm.readonlyDB, err = sqlxtrace.Open("postgres", readonlyDbUrl); err != nil { return errors.Wrap(err, "unable to open readonly postgres db") } if conf.IsSet("database_max_idle_connections") { sm.db.SetMaxIdleConns(conf.GetInt("database_max_idle_connections")) sm.readonlyDB.SetMaxIdleConns(conf.GetInt("database_max_idle_connections")) } if createSchema { // Since this happens at initialization we // could encounter racy conditions waiting for pg // to become available. Wait for it a bit if err = sm.db.Ping(); err != nil { // Try 3 more times // 5, 10, 20 for i := 0; i < 3 && err != nil; i++ { time.Sleep(time.Duration(5*math.Pow(2, float64(i))) * time.Second) err = sm.db.Ping() } if err != nil { return errors.Wrap(err, "error trying to connect to postgres db, retries exhausted") } } // Populate worker table if err = sm.initWorkerTable(conf); err != nil { return errors.Wrap(err, "problem populating worker table sql") } } return nil } func (sm *SQLStateManager) makeWhereClause(filters map[string][]string) []string { // These will be joined with "AND" wc := []string{} for k, v := range filters { if len(v) > 1 { // No like queries for multiple filters with same key quoted := make([]string, len(v)) for i, filterVal := range v { quoted[i] = fmt.Sprintf("'%s'", filterVal) } wc = append(wc, fmt.Sprintf("%s in (%s)", k, strings.Join(quoted, ","))) } else if len(v) == 1 { fmtString := "%s='%s'" fieldName := k if likeFields[k] { fmtString = "%s like '%%%s%%'" } else if strings.HasSuffix(k, "_since") { fieldName = strings.Replace(k, "_since", "", -1) fmtString = "%s > '%s'" } else if strings.HasSuffix(k, "_until") { fieldName = strings.Replace(k, "_until", "", -1) fmtString = "%s < '%s'" } wc = append(wc, fmt.Sprintf(fmtString, fieldName, v[0])) } } return wc } func (sm *SQLStateManager) makeEnvWhereClause(filters map[string]string) []string { wc := make([]string, len(filters)) i := 0 for k, v := range filters { fmtString := `env @> '[{"name":"%s","value":"%s"}]'` wc[i] = fmt.Sprintf(fmtString, k, v) i++ } return wc } func (sm *SQLStateManager) orderBy(obj IOrderable, field string, order string) (string, error) { if order == "asc" || order == "desc" { if obj.ValidOrderField(field) { return fmt.Sprintf("order by %s %s NULLS LAST", field, order), nil } return "", errors.Errorf("Invalid field to order by [%s], must be one of [%s]", field, strings.Join(obj.ValidOrderFields(), ", ")) } return "", errors.Errorf("Invalid order string, must be one of ('asc', 'desc'), was %s", order) } // ListDefinitions returns a DefinitionList // limit: limit the result to this many definitions // offset: start the results at this offset // 
sortBy: sort by this field // order: 'asc' or 'desc' // filters: map of field filters on Definition - joined with AND // envFilters: map of environment variable filters - joined with AND func (sm *SQLStateManager) ListDefinitions( ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (DefinitionList, error) { // Use "list" as an identifier since there's no specific runID for a list operation ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_definitions", "") defer span.Finish() var err error var result DefinitionList var whereClause, orderQuery string where := append(sm.makeWhereClause(filters), sm.makeEnvWhereClause(envFilters)...) if len(where) > 0 { whereClause = fmt.Sprintf("where %s", strings.Join(where, " and ")) } orderQuery, err = sm.orderBy(&Definition{}, sortBy, order) if err != nil { return result, errors.WithStack(err) } sql := fmt.Sprintf(ListDefinitionsSQL, whereClause, orderQuery) countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", sql) err = sm.db.Select(&result.Definitions, sql, limit, offset) if err != nil { return result, errors.Wrap(err, "issue running list definitions sql") } err = sm.db.Get(&result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list definitions count sql") } return result, nil } // GetDefinition returns a single definition by id func (sm *SQLStateManager) GetDefinition(ctx context.Context, definitionID string) (Definition, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_definition", "") defer span.Finish() var err error var definition Definition err = sm.db.GetContext(ctx, &definition, GetDefinitionSQL, definitionID) if err != nil { if err == sql.ErrNoRows { return definition, exceptions.MissingResource{ fmt.Sprintf("Definition with ID %s not found", definitionID)} } else { return definition, errors.Wrapf(err, "issue getting definition with id [%s]", definitionID) } } return definition, nil } // GetDefinitionByAlias returns a single definition by id func (sm *SQLStateManager) GetDefinitionByAlias(ctx context.Context, alias string) (Definition, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_definition_by_alias", "") defer span.Finish() //span.SetTag("alias", alias) var err error var definition Definition err = sm.db.GetContext(ctx, &definition, GetDefinitionByAliasSQL, alias) if err != nil { if err == sql.ErrNoRows { return definition, exceptions.MissingResource{ fmt.Sprintf("Definition with alias %s not found", alias)} } else { return definition, errors.Wrapf(err, "issue getting definition with alias [%s]", alias) } } return definition, err } // UpdateDefinition updates a definition // - updates can be partial func (sm *SQLStateManager) UpdateDefinition(ctx context.Context, definitionID string, updates Definition) (Definition, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.update_definition", "") defer span.Finish() var ( err error existing Definition ) existing, err = sm.GetDefinition(ctx, definitionID) if err != nil { return existing, errors.WithStack(err) } existing.UpdateWith(updates) selectForUpdate := `SELECT * FROM task_def WHERE definition_id = $1 FOR UPDATE;` deletePorts := `DELETE FROM task_def_ports WHERE task_def_id = $1;` deleteTags := `DELETE FROM task_def_tags WHERE task_def_id = $1` insertPorts := ` INSERT INTO task_def_ports( task_def_id, port ) VALUES ($1, $2); ` insertDefTags := ` INSERT INTO task_def_tags( task_def_id, tag_id ) VALUES ($1, $2); ` insertTags 
:= ` INSERT INTO tags(text) SELECT $1 WHERE NOT EXISTS (SELECT text from tags where text = $2) ` tx, err := sm.db.Begin() if err != nil { return existing, errors.WithStack(err) } if _, err = tx.Exec(selectForUpdate, definitionID); err != nil { tx.Rollback() return existing, errors.WithStack(err) } if _, err = tx.Exec(deletePorts, definitionID); err != nil { tx.Rollback() return existing, errors.WithStack(err) } if _, err = tx.Exec(deleteTags, definitionID); err != nil { tx.Rollback() return existing, errors.WithStack(err) } update := ` UPDATE task_def SET image = $2, alias = $3, memory = $4, command = $5, env = $6, cpu = $7, gpu = $8, adaptive_resource_allocation = $9, ephemeral_storage = $10, requires_docker = $11, target_cluster = $12 WHERE definition_id = $1; ` if _, err = tx.Exec( update, definitionID, existing.Image, existing.Alias, existing.Memory, existing.Command, existing.Env, existing.Cpu, existing.Gpu, existing.AdaptiveResourceAllocation, existing.EphemeralStorage, existing.RequiresDocker, existing.TargetCluster); err != nil { tx.Rollback() return existing, errors.Wrapf(err, "issue updating definition [%s]", definitionID) } if existing.Ports != nil { for _, p := range *existing.Ports { if _, err = tx.Exec(insertPorts, definitionID, p); err != nil { tx.Rollback() return existing, errors.WithStack(err) } } } if existing.Tags != nil { for _, t := range *existing.Tags { if _, err = tx.Exec(insertTags, t, t); err != nil { tx.Rollback() return existing, errors.WithStack(err) } if _, err = tx.Exec(insertDefTags, definitionID, t); err != nil { tx.Rollback() return existing, errors.WithStack(err) } } } err = tx.Commit() if err != nil { return existing, errors.WithStack(err) } return existing, nil } // CreateDefinition creates the passed in definition object // - error if definition already exists func (sm *SQLStateManager) CreateDefinition(ctx context.Context, d Definition) error { ctx, span := tracing.TraceJob(ctx, "flotilla.state.create_definition", "") defer span.Finish() var err error insertPorts := ` INSERT INTO task_def_ports( task_def_id, port ) VALUES ($1, $2); ` insertDefTags := ` INSERT INTO task_def_tags( task_def_id, tag_id ) VALUES ($1, $2); ` insertTags := ` INSERT INTO tags(text) SELECT $1 WHERE NOT EXISTS (SELECT text from tags where text = $2) ` tx, err := sm.db.Begin() if err != nil { return errors.WithStack(err) } insert := ` INSERT INTO task_def( definition_id, image, group_name, alias, memory, command, env, cpu, gpu, adaptive_resource_allocation, ephemeral_storage, requires_docker, target_cluster ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13); ` if _, err = tx.Exec(insert, d.DefinitionID, d.Image, d.GroupName, d.Alias, d.Memory, d.Command, d.Env, d.Cpu, d.Gpu, d.AdaptiveResourceAllocation, d.EphemeralStorage, d.RequiresDocker, d.TargetCluster); err != nil { tx.Rollback() return errors.Wrapf( err, "issue creating new task definition with id [%s] and alias [%s]", d.DefinitionID, d.Alias) } if d.Ports != nil { for _, p := range *d.Ports { if _, err = tx.Exec(insertPorts, d.DefinitionID, p); err != nil { tx.Rollback() return errors.WithStack(err) } } } if d.Tags != nil { for _, t := range *d.Tags { if _, err = tx.Exec(insertTags, t, t); err != nil { tx.Rollback() return errors.WithStack(err) } if _, err = tx.Exec(insertDefTags, d.DefinitionID, t); err != nil { tx.Rollback() return errors.WithStack(err) } } } err = tx.Commit() if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return errors.WithStack(err) } return nil } // DeleteDefinition deletes definition and associated runs
and environment variables func (sm *SQLStateManager) DeleteDefinition(ctx context.Context, definitionID string) error { ctx, span := tracing.TraceJob(ctx, "flotilla.state.delete_definition", "") defer span.Finish() var err error statements := []string{ "DELETE FROM task_def_ports WHERE task_def_id = $1", "DELETE FROM task_def_tags WHERE task_def_id = $1", "DELETE FROM task WHERE definition_id = $1", "DELETE FROM task_def WHERE definition_id = $1", } tx, err := sm.db.Begin() if err != nil { return errors.WithStack(err) } for _, stmt := range statements { if _, err = tx.Exec(stmt, definitionID); err != nil { tx.Rollback() return errors.Wrapf(err, "issue deleting definition with id [%s]", definitionID) } } err = tx.Commit() if err != nil { return errors.WithStack(err) } return nil } // ListRuns returns a RunList // limit: limit the result to this many runs // offset: start the results at this offset // sortBy: sort by this field // order: 'asc' or 'desc' // filters: map of field filters on Run - joined with AND // envFilters: map of environment variable filters - joined with AND func (sm *SQLStateManager) ListRuns(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string, engines []string) (RunList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_runs", "") defer span.Finish() var err error var result RunList var whereClause, orderQuery string if filters == nil { filters = make(map[string][]string) } if engines != nil { filters["engine"] = engines } else { filters["engine"] = []string{DefaultEngine} } where := append(sm.makeWhereClause(filters), sm.makeEnvWhereClause(envFilters)...) if len(where) > 0 { whereClause = fmt.Sprintf("where %s", strings.Join(where, " and ")) } orderQuery, err = sm.orderBy(&Run{}, sortBy, order) if err != nil { return result, errors.WithStack(err) } sql := fmt.Sprintf(ListRunsSQL, whereClause, orderQuery) countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", sql) err = sm.db.Select(&result.Runs, sql, limit, offset) if err != nil { return result, errors.Wrap(err, "issue running list runs sql") } err = sm.db.Get(&result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list runs count sql") } return result, nil } // GetRun gets run by id func (sm *SQLStateManager) GetRun(ctx context.Context, runID string) (Run, error) { // Create a span for this database operation using the utils.TraceJob function ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_run", "") defer span.Finish() span.SetTag("job.run_id", runID) var r Run err := sm.db.GetContext(ctx, &r, GetRunSQL, runID) if err != nil { // Tag error for easier debugging span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { return r, exceptions.MissingResource{ fmt.Sprintf("Run with id %s not found", runID)} } else { return r, errors.Wrapf(err, "issue getting run with id [%s]", runID) } } // Tag the span with run metadata tracing.TagRunInfo(span, r.RunID, r.DefinitionID, r.Alias, r.Status, r.ClusterName, r.QueuedAt, r.StartedAt, r.FinishedAt, r.PodName, r.Namespace, r.ExitReason, r.ExitCode, string(r.Tier)) return r, nil } func (sm *SQLStateManager) GetRunByEMRJobId(ctx context.Context, emrJobId string) (Run, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_run_by_emr_job_id", "") defer span.Finish() span.SetTag("job.emr_job_id", emrJobId) var err error var r Run err = sm.db.GetContext(ctx, &r, GetRunSQLByEMRJobId, emrJobId) if err != 
nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { return r, exceptions.MissingResource{ fmt.Sprintf("Run with emrjobid %s not found", emrJobId)} } else { return r, errors.Wrapf(err, "issue getting run with emrjobid [%s]", emrJobId) } } // Tag the span with run metadata tracing.TagRunInfo(span, r.RunID, r.DefinitionID, r.Alias, r.Status, r.ClusterName, r.QueuedAt, r.StartedAt, r.FinishedAt, r.PodName, r.Namespace, r.ExitReason, r.ExitCode, string(r.Tier)) return r, nil } func (sm *SQLStateManager) GetResources(ctx context.Context, runID string) (Run, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_resources", "") defer span.Finish() span.SetTag("job.run_id", runID) var err error var r Run err = sm.db.GetContext(ctx, &r, GetRunSQL, runID) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { return r, exceptions.MissingResource{ fmt.Sprintf("Run with id %s not found", runID)} } else { return r, errors.Wrapf(err, "issue getting run with id [%s]", runID) } } // Tag the span with run metadata tracing.TagRunInfo(span, r.RunID, r.DefinitionID, r.Alias, r.Status, r.ClusterName, r.QueuedAt, r.StartedAt, r.FinishedAt, r.PodName, r.Namespace, r.ExitReason, r.ExitCode, string(r.Tier)) return r, nil } // UpdateRun updates run with updates - can be partial func (sm *SQLStateManager) UpdateRun(ctx context.Context, runID string, updates Run) (Run, error) { start := time.Now() ctx, span := tracing.TraceJob(ctx, "flotilla.state.update_run", "") defer span.Finish() span.SetTag("job.run_id", runID) span.SetTag("status", updates.Status) var ( err error existing Run ) tx, err := sm.db.BeginTx(ctx, nil) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) span.SetTag("error.type", "begin_transaction") return existing, errors.WithStack(err) } rows, err := tx.QueryContext(ctx, GetRunSQLForUpdate, runID) if err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) span.SetTag("error.type", "query") return existing, errors.WithStack(err) } for rows.Next() { err = rows.Scan( &existing.RunID, &existing.DefinitionID, &existing.Alias, &existing.Image, &existing.ClusterName, &existing.ExitCode, &existing.ExitReason, &existing.Status, &existing.QueuedAt, &existing.StartedAt, &existing.FinishedAt, &existing.InstanceID, &existing.InstanceDNSName, &existing.GroupName, &existing.TaskType, &existing.Env, &existing.Command, &existing.Memory, &existing.Cpu, &existing.Gpu, &existing.Engine, &existing.EphemeralStorage, &existing.NodeLifecycle, &existing.PodName, &existing.Namespace, &existing.MaxCpuUsed, &existing.MaxMemoryUsed, &existing.PodEvents, &existing.CommandHash, &existing.CloudTrailNotifications, &existing.ExecutableID, &existing.ExecutableType, &existing.ExecutionRequestCustom, &existing.CpuLimit, &existing.MemoryLimit, &existing.AttemptCount, &existing.SpawnedRuns, &existing.RunExceptions, &existing.ActiveDeadlineSeconds, &existing.SparkExtension, &existing.MetricsUri, &existing.Description, &existing.IdempotenceKey, &existing.User, &existing.Arch, &existing.Labels, &existing.RequiresDocker, &existing.ServiceAccount, &existing.Tier, ) } if err != nil { return existing, errors.WithStack(err) } existing.UpdateWith(updates) update := ` UPDATE task SET definition_id = $2, alias = $3, image = $4, cluster_name = $5, exit_code = $6, exit_reason = $7, status = $8, queued_at = $9, started_at = $10, finished_at = $11, instance_id = $12, instance_dns_name 
= $13, group_name = $14, env = $15, command = $16, memory = $17, cpu = $18, gpu = $19, engine = $20, ephemeral_storage = $21, node_lifecycle = $22, pod_name = $23, namespace = $24, max_cpu_used = $25, max_memory_used = $26, pod_events = $27, cloudtrail_notifications = $28, executable_id = $29, executable_type = $30, execution_request_custom = $31, cpu_limit = $32, memory_limit = $33, attempt_count = $34, spawned_runs = $35, run_exceptions = $36, active_deadline_seconds = $37, spark_extension = $38, metrics_uri = $39, description = $40, idempotence_key = $41, "user" = $42, arch = $43, labels = $44, requires_docker = $45, service_account = $46, tier = $47 WHERE run_id = $1; ` if _, err = tx.Exec( update, runID, existing.DefinitionID, existing.Alias, existing.Image, existing.ClusterName, existing.ExitCode, existing.ExitReason, existing.Status, existing.QueuedAt, existing.StartedAt, existing.FinishedAt, existing.InstanceID, existing.InstanceDNSName, existing.GroupName, existing.Env, existing.Command, existing.Memory, existing.Cpu, existing.Gpu, existing.Engine, existing.EphemeralStorage, existing.NodeLifecycle, existing.PodName, existing.Namespace, existing.MaxCpuUsed, existing.MaxMemoryUsed, existing.PodEvents, existing.CloudTrailNotifications, existing.ExecutableID, existing.ExecutableType, existing.ExecutionRequestCustom, existing.CpuLimit, existing.MemoryLimit, existing.AttemptCount, existing.SpawnedRuns, existing.RunExceptions, existing.ActiveDeadlineSeconds, existing.SparkExtension, existing.MetricsUri, existing.Description, existing.IdempotenceKey, existing.User, existing.Arch, existing.Labels, existing.RequiresDocker, existing.ServiceAccount, existing.Tier); err != nil { tx.Rollback() return existing, errors.WithStack(err) } if err = tx.Commit(); err != nil { return existing, errors.WithStack(err) } _ = metrics.Timing(metrics.EngineUpdateRun, time.Since(start), []string{existing.ClusterName}, 1) go sm.logStatusUpdate(existing) return existing, nil } // CreateRun creates the passed in run func (sm *SQLStateManager) CreateRun(ctx context.Context, r Run) error { ctx, span := tracing.TraceJob(ctx, "flotilla.state.create_run", "") defer span.Finish() span.SetTag("job.run_id", r.RunID) // Now utils.TraceJob already sets the run_id tag var err error insert := ` INSERT INTO task ( run_id, definition_id, alias, image, cluster_name, exit_code, exit_reason, status, queued_at, started_at, finished_at, instance_id, instance_dns_name, group_name, env, command, memory, cpu, gpu, engine, node_lifecycle, ephemeral_storage, pod_name, namespace, max_cpu_used, max_memory_used, pod_events, executable_id, executable_type, execution_request_custom, cpu_limit, memory_limit, attempt_count, spawned_runs, run_exceptions, active_deadline_seconds, task_type, command_hash, spark_extension, metrics_uri, description, idempotence_key, "user", arch, labels, requires_docker, service_account, tier ) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48 ); ` tx, err := sm.db.BeginTx(ctx, nil) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return errors.WithStack(err) } if _, err = tx.ExecContext(ctx, insert, r.RunID, r.DefinitionID, r.Alias, r.Image, r.ClusterName, r.ExitCode, r.ExitReason, r.Status, r.QueuedAt, r.StartedAt, r.FinishedAt, r.InstanceID, r.InstanceDNSName, r.GroupName, r.Env, r.Command, 
r.Memory, r.Cpu, r.Gpu, r.Engine, r.NodeLifecycle, r.EphemeralStorage, r.PodName, r.Namespace, r.MaxCpuUsed, r.MaxMemoryUsed, r.PodEvents, r.ExecutableID, r.ExecutableType, r.ExecutionRequestCustom, r.CpuLimit, r.MemoryLimit, r.AttemptCount, r.SpawnedRuns, r.RunExceptions, r.ActiveDeadlineSeconds, r.TaskType, r.CommandHash, r.SparkExtension, r.MetricsUri, r.Description, r.IdempotenceKey, r.User, r.Arch, r.Labels, r.RequiresDocker, r.ServiceAccount, r.Tier); err != nil { tx.Rollback() return errors.Wrapf(err, "issue creating new task run with id [%s]", r.RunID) } if err = tx.Commit(); err != nil { return errors.WithStack(err) } go sm.logStatusUpdate(r) return nil } // ListGroups returns a list of the existing group names. func (sm *SQLStateManager) ListGroups(ctx context.Context, limit int, offset int, name *string) (GroupsList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_groups", "") defer span.Finish() var ( err error result GroupsList whereClause string ) if name != nil && len(*name) > 0 { whereClause = fmt.Sprintf("where %s", strings.Join( sm.makeWhereClause(map[string][]string{"group_name": {*name}}), " and ")) } sql := fmt.Sprintf(ListGroupsSQL, whereClause) countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", sql) err = sm.db.Select(&result.Groups, sql, limit, offset) if err != nil { return result, errors.Wrap(err, "issue running list groups sql") } err = sm.db.Get(&result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list groups count sql") } return result, nil } // ListTags returns a list of the existing tags. func (sm *SQLStateManager) ListTags(ctx context.Context, limit int, offset int, name *string) (TagsList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_tags", "") defer span.Finish() var ( err error result TagsList whereClause string ) if name != nil && len(*name) > 0 { whereClause = fmt.Sprintf("where %s", strings.Join( sm.makeWhereClause(map[string][]string{"text": {*name}}), " and ")) } sql := fmt.Sprintf(ListTagsSQL, whereClause) countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", sql) err = sm.db.SelectContext(ctx, &result.Tags, sql, limit, offset) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return result, errors.Wrap(err, "issue running list tags sql") } err = sm.db.GetContext(ctx, &result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list tags count sql") } return result, nil } // initWorkerTable initializes the `worker` table with values from the config func (sm *SQLStateManager) initWorkerTable(c config.Config) error { // Get worker count from configuration (set to 1 as default); each count is read // from the engine-specific key that was just checked with IsSet for _, engine := range Engines { fmt.Printf("init worker table for %s engine\n", engine) retryCount := int64(1) if c.IsSet(fmt.Sprintf("worker.%s.retry_worker_count_per_instance", engine)) { retryCount = int64(c.GetInt(fmt.Sprintf("worker.%s.retry_worker_count_per_instance", engine))) } submitCount := int64(1) if c.IsSet(fmt.Sprintf("worker.%s.submit_worker_count_per_instance", engine)) { submitCount = int64(c.GetInt(fmt.Sprintf("worker.%s.submit_worker_count_per_instance", engine))) } statusCount := int64(1) if c.IsSet(fmt.Sprintf("worker.%s.status_worker_count_per_instance", engine)) { statusCount = int64(c.GetInt(fmt.Sprintf("worker.%s.status_worker_count_per_instance", engine))) } var err error insert := ` INSERT INTO worker (worker_type, count_per_instance, engine) VALUES ('retry', $1, $4), ('submit', $2, $4), ('status', $3, $4); ` tx, err := sm.db.Begin() if err != nil { return
errors.WithStack(err) } if _, err = tx.Exec(insert, retryCount, submitCount, statusCount, engine); err != nil { tx.Rollback() return errors.Wrapf(err, "issue populating worker table") } err = tx.Commit() if err != nil { return errors.WithStack(err) } } return nil } // ListWorkers returns list of workers func (sm *SQLStateManager) ListWorkers(ctx context.Context, engine string) (WorkersList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_workers", "") defer span.Finish() var err error var result WorkersList countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", GetWorkerEngine) err = sm.readonlyDB.SelectContext(ctx, &result.Workers, GetWorkerEngine, engine) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return result, errors.Wrap(err, "issue running list workers sql") } err = sm.readonlyDB.GetContext(ctx, &result.Total, countSQL, engine) if err != nil { return result, errors.Wrap(err, "issue running list workers count sql") } return result, nil } // GetWorker returns data for a single worker. func (sm *SQLStateManager) GetWorker(ctx context.Context, workerType string, engine string) (w Worker, err error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_worker", "") defer span.Finish() //span.SetTag("engine", engine) if err = sm.readonlyDB.GetContext(ctx, &w, GetWorkerSQL, workerType, engine); err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { err = exceptions.MissingResource{ ErrorString: fmt.Sprintf("Worker of type %s not found", workerType)} } else { err = errors.Wrapf(err, "issue getting worker of type [%s]", workerType) } } return } // UpdateWorker updates a single worker. func (sm *SQLStateManager) UpdateWorker(ctx context.Context, workerType string, updates Worker) (Worker, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.update_worker", "") defer span.Finish() var ( err error existing Worker ) engine := DefaultEngine tx, err := sm.db.BeginTx(ctx, nil) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return existing, errors.WithStack(err) } rows, err := tx.QueryContext(ctx, GetWorkerSQLForUpdate, workerType, engine) if err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return existing, errors.WithStack(err) } for rows.Next() { err = rows.Scan(&existing.WorkerType, &existing.CountPerInstance, &existing.Engine) } rows.Close() if err != nil { tx.Rollback() return existing, errors.WithStack(err) } existing.UpdateWith(updates) update := ` UPDATE worker SET count_per_instance = $2 WHERE worker_type = $1; ` if _, err = tx.ExecContext(ctx, update, workerType, existing.CountPerInstance); err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return existing, errors.WithStack(err) } if err = tx.Commit(); err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return existing, errors.WithStack(err) } return existing, nil } // BatchUpdateWorkers updates multiple workers.
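// A minimal usage sketch (hypothetical caller, for illustration only):
//
//	workers, err := sm.BatchUpdateWorkers(ctx, []Worker{{WorkerType: "submit", CountPerInstance: 4}})
//
// Each element is applied through UpdateWorker in its own transaction, so a
// failure partway through leaves earlier updates committed.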
func (sm *SQLStateManager) BatchUpdateWorkers(ctx context.Context, updates []Worker) (WorkersList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.batch_update_workers", "") defer span.Finish() var existing WorkersList for _, w := range updates { _, err := sm.UpdateWorker(ctx, w.WorkerType, w) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return existing, err } } return sm.ListWorkers(ctx, DefaultEngine) } // Cleanup close any open resources func (sm *SQLStateManager) Cleanup() error { return multierr.Combine(sm.db.Close(), sm.readonlyDB.Close()) } type IOrderable interface { ValidOrderField(field string) bool ValidOrderFields() []string DefaultOrderField() string } func (d *Definition) ValidOrderField(field string) bool { for _, f := range d.ValidOrderFields() { if field == f { return true } } return false } func (d *Definition) ValidOrderFields() []string { return []string{"alias", "image", "group_name", "memory"} } func (d *Definition) DefaultOrderField() string { return "group_name" } func (r *Run) ValidOrderField(field string) bool { for _, f := range r.ValidOrderFields() { if field == f { return true } } return false } func (r *Run) ValidOrderFields() []string { return []string{"run_id", "cluster_name", "status", "started_at", "finished_at", "group_name"} } func (r *Run) DefaultOrderField() string { return "group_name" } func (t *Template) ValidOrderField(field string) bool { for _, f := range t.ValidOrderFields() { if field == f { return true } } return false } func (t *Template) ValidOrderFields() []string { // @TODO: figure what fields should be orderable. return []string{"template_name", "version"} } func (t *Template) DefaultOrderField() string { return "template_name" } // Scan from db func (e *EnvList) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e *EnvList) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan from db func (e *PodEvents) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e SpawnedRuns) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } func (e *SpawnedRuns) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e SparkExtension) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } func (e *SparkExtension) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e RunExceptions) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } func (e *RunExceptions) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e PodEvents) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan from db func (e *PortsList) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e PortsList) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan from db func (e *Tags) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e Tags) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan 
from db func (e *CloudTrailNotifications) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e CloudTrailNotifications) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan from db func (e *ExecutionRequestCustom) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // Value to db func (e ExecutionRequestCustom) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } // Scan from db func (tjs *TemplateJSONSchema) Scan(value interface{}) error { if value != nil { s := []byte(value.([]uint8)) json.Unmarshal(s, &tjs) } return nil } // Value to db func (tjs TemplateJSONSchema) Value() (driver.Value, error) { res, _ := json.Marshal(tjs) return res, nil } // Scan from db func (tjs *TemplatePayload) Scan(value interface{}) error { if value != nil { s := []byte(value.([]uint8)) json.Unmarshal(s, &tjs) } return nil } // Value to db func (tjs TemplatePayload) Value() (driver.Value, error) { res, _ := json.Marshal(tjs) return res, nil } // Value to db func (e Labels) Value() (driver.Value, error) { res, _ := json.Marshal(e) return res, nil } func (e *Labels) Scan(value interface{}) error { if value != nil { s := []byte(value.(string)) json.Unmarshal(s, &e) } return nil } // GetTemplateByID returns a single template by id. func (sm *SQLStateManager) GetTemplateByID(ctx context.Context, templateID string) (Template, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_template_by_id", "") defer span.Finish() var err error var tpl Template err = sm.db.GetContext(ctx, &tpl, GetTemplateByIDSQL, templateID) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { return tpl, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Template with ID %s not found", templateID)} } return tpl, errors.Wrapf(err, "issue getting tpl with id [%s]", templateID) } return tpl, nil } func (sm *SQLStateManager) GetTemplateByVersion(ctx context.Context, templateName string, templateVersion int64) (bool, Template, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_template_by_version", "") defer span.Finish() span.SetTag("template.version", templateVersion) var err error var tpl Template err = sm.db.GetContext(ctx, &tpl, GetTemplateByVersionSQL, templateName, templateVersion) if err != nil { if err == sql.ErrNoRows { return false, tpl, nil } span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return false, tpl, errors.Wrapf(err, "issue getting tpl with id [%s]", templateName) } return true, tpl, nil } // GetLatestTemplateByTemplateName returns the latest version of a template // of a specific template name. func (sm *SQLStateManager) GetLatestTemplateByTemplateName(ctx context.Context, templateName string) (bool, Template, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_latest_template_by_name", "") defer span.Finish() var err error var tpl Template err = sm.db.GetContext(ctx, &tpl, GetTemplateLatestOnlySQL, templateName) if err != nil { if err == sql.ErrNoRows { return false, tpl, nil } span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return false, tpl, errors.Wrapf(err, "issue getting tpl with id [%s]", templateName) } return true, tpl, nil } // ListTemplates returns list of templates from the database. 
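// A minimal usage sketch (hypothetical caller, for illustration only): fetch the
// first ten templates, newest version first:
//
//	page, err := sm.ListTemplates(ctx, 10, 0, "version", "desc")
//
// page.Templates holds the rows; page.Total carries the unpaginated count.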
func (sm *SQLStateManager) ListTemplates(ctx context.Context, limit int, offset int, sortBy string, order string) (TemplateList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_templates", "") defer span.Finish() var err error var result TemplateList var orderQuery string orderQuery, err = sm.orderBy(&Template{}, sortBy, order) if err != nil { return result, errors.WithStack(err) } sql := fmt.Sprintf(ListTemplatesSQL, orderQuery) countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", sql) err = sm.db.SelectContext(ctx, &result.Templates, sql, limit, offset) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return result, errors.Wrap(err, "issue running list templates sql") } err = sm.db.GetContext(ctx, &result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list templates count sql") } return result, nil } // ListTemplatesLatestOnly returns list of templates from the database. func (sm *SQLStateManager) ListTemplatesLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (TemplateList, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_templates_latest_only", "") defer span.Finish() var err error var result TemplateList countSQL := fmt.Sprintf("select COUNT(*) from (%s) as sq", ListTemplatesLatestOnlySQL) err = sm.db.SelectContext(ctx, &result.Templates, ListTemplatesLatestOnlySQL, limit, offset) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return result, errors.Wrap(err, "issue running list templates sql") } err = sm.db.GetContext(ctx, &result.Total, countSQL, nil, 0) if err != nil { return result, errors.Wrap(err, "issue running list templates count sql") } return result, nil } // CreateTemplate creates a new template. func (sm *SQLStateManager) CreateTemplate(ctx context.Context, t Template) error { ctx, span := tracing.TraceJob(ctx, "flotilla.state.create_template", "") defer span.Finish() var err error insert := ` INSERT INTO template( template_id, template_name, version, schema, command_template, adaptive_resource_allocation, image, memory, env, cpu, gpu, defaults, avatar_uri ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13); ` tx, err := sm.db.BeginTx(ctx, nil) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return errors.WithStack(err) } if _, err = tx.ExecContext(ctx, insert, t.TemplateID, t.TemplateName, t.Version, t.Schema, t.CommandTemplate, t.AdaptiveResourceAllocation, t.Image, t.Memory, t.Env, t.Cpu, t.Gpu, t.Defaults, t.AvatarURI); err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return errors.Wrapf( err, "issue creating new template with template_name [%s] and version [%d]", t.TemplateName, t.Version) } err = tx.Commit() if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return errors.WithStack(err) } return nil } // GetExecutableByTypeAndID returns a single executable by type and id.
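// A minimal usage sketch (hypothetical template ID, for illustration only):
//
//	exec, err := sm.GetExecutableByTypeAndID(ctx, ExecutableTypeTemplate, "tpl-123")
//
// The result is a Definition or a Template depending on the type tag; an
// unrecognized type returns exceptions.MalformedInput.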
func (sm *SQLStateManager) GetExecutableByTypeAndID(ctx context.Context, t ExecutableType, id string) (Executable, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_executable_by_type_and_id", "") defer span.Finish() span.SetTag("executable.type", string(t)) switch t { case ExecutableTypeDefinition: return sm.GetDefinition(ctx, id) case ExecutableTypeTemplate: return sm.GetTemplateByID(ctx, id) default: span.SetTag("error", true) span.SetTag("error.msg", fmt.Sprintf("executable type of [%s] not valid", t)) return nil, exceptions.MalformedInput{ ErrorString: fmt.Sprintf("executable type of [%s] not valid.", t), } } } func (sm *SQLStateManager) logStatusUpdate(update Run) { var err error var startedAt, finishedAt time.Time var duration float64 var env EnvList var command string if update.StartedAt != nil { startedAt = *update.StartedAt duration = time.Now().Sub(startedAt).Seconds() } if update.FinishedAt != nil { finishedAt = *update.FinishedAt duration = finishedAt.Sub(startedAt).Seconds() } if update.Env != nil { env = *update.Env } if update.Command != nil { command = *update.Command } if update.ExitCode != nil { err = sm.log.Event("eventClassName", "FlotillaTaskStatus", "run_id", update.RunID, "definition_id", update.DefinitionID, "alias", update.Alias, "image", update.Image, "cluster_name", update.ClusterName, "command", command, "exit_code", *update.ExitCode, "status", update.Status, "started_at", startedAt, "finished_at", finishedAt, "duration", duration, "instance_id", update.InstanceID, "instance_dns_name", update.InstanceDNSName, "group_name", update.GroupName, "user", update.User, "task_type", update.TaskType, "env", env, "executable_id", update.ExecutableID, "executable_type", update.ExecutableType, "Tier", update.Tier) } else { err = sm.log.Event("eventClassName", "FlotillaTaskStatus", "run_id", update.RunID, "definition_id", update.DefinitionID, "alias", update.Alias, "image", update.Image, "cluster_name", update.ClusterName, "command", command, "status", update.Status, "started_at", startedAt, "finished_at", finishedAt, "duration", duration, "instance_id", update.InstanceID, "instance_dns_name", update.InstanceDNSName, "group_name", update.GroupName, "user", update.User, "task_type", update.TaskType, "env", env, "executable_id", update.ExecutableID, "executable_type", update.ExecutableType, "Tier", update.Tier) } if err != nil { sm.log.Log("level", "error", "message", "Failed to emit status event", "run_id", update.RunID, "error", err.Error()) } } func (sm *SQLStateManager) ListClusterStates(ctx context.Context) ([]ClusterMetadata, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.list_cluster_states", "") defer span.Finish() var clusters []ClusterMetadata err := sm.db.SelectContext(ctx, &clusters, ListClusterStatesSQL) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) } return clusters, err } func (sm *SQLStateManager) UpdateClusterMetadata(ctx context.Context, cluster ClusterMetadata) error { operationName := "flotilla.state.create_cluster_metadata" identifier := cluster.Name if cluster.ID != "" { operationName = "flotilla.state.update_cluster_metadata" identifier = cluster.ID } ctx, span := tracing.TraceJob(ctx, operationName, "") defer span.Finish() span.SetTag("cluster.id", identifier) // Add relevant tags span.SetTag("cluster.name", cluster.Name) span.SetTag("cluster.status", cluster.Status) if cluster.ClusterVersion != "" { span.SetTag("cluster.version", cluster.ClusterVersion) } if cluster.ID == "" { sql := ` 
        INSERT INTO cluster_state (name, cluster_version, status, status_reason, allowed_tiers,
            capabilities, namespace, region, emr_virtual_cluster, spark_server_uri)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
        RETURNING id;
        `
		var id string
		err := sm.db.QueryRowContext(ctx, sql,
			cluster.Name, cluster.ClusterVersion, cluster.Status, cluster.StatusReason,
			pq.Array(cluster.AllowedTiers), pq.Array(cluster.Capabilities),
			cluster.Namespace, cluster.Region, cluster.EMRVirtualCluster, cluster.SparkServerURI).Scan(&id)
		if err != nil {
			span.SetTag("error", true)
			span.SetTag("error.msg", err.Error())
			return err
		}
		return nil
	} else {
		sql := `
        UPDATE cluster_state
        SET name = $2, cluster_version = $3, status = $4, status_reason = $5, allowed_tiers = $6,
            capabilities = $7, namespace = $8, region = $9, emr_virtual_cluster = $10,
            spark_server_uri = $11, updated_at = NOW()
        WHERE id = $1;
        `
		result, err := sm.db.ExecContext(ctx, sql,
			cluster.ID, cluster.Name, cluster.ClusterVersion, cluster.Status, cluster.StatusReason,
			pq.Array(cluster.AllowedTiers), pq.Array(cluster.Capabilities),
			cluster.Namespace, cluster.Region, cluster.EMRVirtualCluster, cluster.SparkServerURI)
		if err != nil {
			span.SetTag("error", true)
			span.SetTag("error.msg", err.Error())
			return err
		}
		rows, err := result.RowsAffected()
		if err != nil {
			span.SetTag("error", true)
			span.SetTag("error.msg", err.Error())
			return err
		}
		if rows == 0 {
			span.SetTag("error", true)
			span.SetTag("error.msg", "Cluster not found")
			return exceptions.MissingResource{
				ErrorString: fmt.Sprintf("Cluster with ID %s not found", cluster.ID),
			}
		}
		return nil
	}
}

func (sm *SQLStateManager) DeleteClusterMetadata(ctx context.Context, clusterID string) error {
	ctx, span := tracing.TraceJob(ctx, "flotilla.state.delete_cluster_metadata", "")
	defer span.Finish()
	span.SetTag("cluster.id", clusterID)
	sql := `DELETE FROM cluster_state WHERE id = $1`
	result, err := sm.db.ExecContext(ctx, sql, clusterID)
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
		return err
	}
	count, err := result.RowsAffected()
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
		return err
	}
	if count == 0 {
		span.SetTag("error", true)
		span.SetTag("error.msg", "Cluster not found")
		return exceptions.MissingResource{
			ErrorString: fmt.Sprintf("Cluster with ID %s not found", clusterID),
		}
	}
	return nil
}

func (sm *SQLStateManager) GetClusterByID(ctx context.Context, clusterID string) (ClusterMetadata, error) {
	ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_cluster_by_id", "")
	defer span.Finish()
	span.SetTag("cluster.id", clusterID)
	var cluster ClusterMetadata
	// cluster_version is selected so the cluster.version span tag below can be populated.
	query := `
    SELECT id, name, cluster_version, status, status_reason, status_since, allowed_tiers,
           capabilities, region, updated_at, namespace, emr_virtual_cluster, spark_server_uri
    FROM cluster_state
    WHERE id = $1
    `
	err := sm.db.GetContext(ctx, &cluster, query, clusterID)
	if err != nil {
		span.SetTag("error", true)
		span.SetTag("error.msg", err.Error())
		if err == sql.ErrNoRows {
			return cluster, exceptions.MissingResource{
				ErrorString: fmt.Sprintf("Cluster with ID %s not found", clusterID),
			}
		}
		return cluster, err
	}
	// Add tags for the cluster data
	span.SetTag("cluster.name", cluster.Name)
	span.SetTag("cluster.status", cluster.Status)
	if cluster.ClusterVersion != "" {
		span.SetTag("cluster.version", cluster.ClusterVersion)
	}
	return cluster, nil
}
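// Illustrative (hypothetical) upsert flow for the cluster-metadata methods
// above -- an empty ID inserts a new row, a non-empty ID updates in place;
// the name value here is made up:
//
//	meta := ClusterMetadata{Name: "k8s-cluster-a", Status: StatusActive}
//	if err := sm.UpdateClusterMetadata(ctx, meta); err != nil {
//		// handle insert/update failure
//	}

func ScanStringArray(arr *[]string, value interface{}) error {
	if value == nil {
		*arr = []string{}
		return nil
	}
	switch v := value.(type) {
	case []byte:
		// Try JSON first; fall back to parsing the Postgres array literal.
		var result []string
		if err :=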
json.Unmarshal(v, &result); err == nil { *arr = result return nil } str := string(v) if len(str) < 2 { *arr = []string{} return nil } elements := strings.Split(str[1:len(str)-1], ",") result = make([]string, 0, len(elements)) for _, e := range elements { if e != "" { // Remove quotes if they exist e = strings.Trim(e, "\"") result = append(result, e) } } *arr = result return nil default: return fmt.Errorf("unexpected type for string array: %T", value) } } func (arr *Tiers) Scan(value interface{}) error { if value == nil { *arr = Tiers{} return nil } switch v := value.(type) { case []byte: var result []string if err := json.Unmarshal(v, &result); err == nil { *arr = Tiers(result) return nil } str := string(v) if len(str) < 2 || str[0] != '{' || str[len(str)-1] != '}' { *arr = Tiers{} return nil } str = str[1 : len(str)-1] if len(str) == 0 { *arr = Tiers{} return nil } elements := strings.Split(str, ",") result = make([]string, 0, len(elements)) for _, e := range elements { if e == "" { continue } e = strings.Trim(e, "\"") result = append(result, e) } *arr = Tiers(result) return nil default: return fmt.Errorf("unsupported Scan, storing driver.Value type %T into type *Tiers", value) } } func (arr Tiers) Value() (driver.Value, error) { if len(arr) == 0 { return "{}", nil } quoted := make([]string, len(arr)) for i, v := range arr { quoted[i] = fmt.Sprintf("\"%s\"", v) } return fmt.Sprintf("{%s}", strings.Join(quoted, ",")), nil } // Scan from db for Capabilities func (arr *Capabilities) Scan(value interface{}) error { if value == nil { *arr = Capabilities{} return nil } switch v := value.(type) { case []byte: var result []string if err := json.Unmarshal(v, &result); err == nil { *arr = Capabilities(result) return nil } str := string(v) if len(str) < 2 { *arr = Capabilities{} return nil } elements := strings.Split(str[1:len(str)-1], ",") result = make([]string, 0, len(elements)) for _, e := range elements { if e != "" { result = append(result, e) } } *arr = Capabilities(result) return nil default: return fmt.Errorf("unexpected type for string array: %T", value) } } // Value to db for Capabilities func (arr Capabilities) Value() (driver.Value, error) { if len(arr) == 0 { return "{}", nil } return fmt.Sprintf("{%s}", strings.Join(arr, ",")), nil } func (sm *SQLStateManager) GetRunStatus(ctx context.Context, runID string) (RunStatus, error) { ctx, span := tracing.TraceJob(ctx, "flotilla.state.get_run_status", "") defer span.Finish() span.SetTag("job.run.id", runID) var status RunStatus tx, err := sm.db.BeginTx(ctx, nil) if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return status, errors.Wrap(err, "failed to begin transaction") } _, err = tx.ExecContext(ctx, "SET LOCAL lock_timeout = '500ms'") if err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return status, errors.Wrap(err, "failed to set lock timeout") } err = tx.QueryRowContext(ctx, GetRunStatusSQL, runID).Scan( &status.RunID, &status.DefinitionID, &status.Alias, &status.ClusterName, &status.Status, &status.QueuedAt, &status.StartedAt, &status.FinishedAt, &status.ExitCode, &status.ExitReason, &status.Engine, ) if err != nil { tx.Rollback() span.SetTag("error", true) span.SetTag("error.msg", err.Error()) if err == sql.ErrNoRows { return status, exceptions.MissingResource{ ErrorString: fmt.Sprintf("Run with id %s not found", runID)} } if pqErr, ok := err.(*pq.Error); ok && pqErr.Code == "55P03" { return status, exceptions.ConflictingResource{ ErrorString: fmt.Sprintf("Run 
with id %s is currently locked, please retry", runID)} } return status, errors.Wrapf(err, "issue getting run status with id [%s]", runID) } err = tx.Commit() if err != nil { span.SetTag("error", true) span.SetTag("error.msg", err.Error()) return status, errors.Wrap(err, "failed to commit transaction") } //if status.Status != "" { // span.SetTag("job.status", status.Status) //} return status, nil } ================================================ FILE: state/pg_state_manager_test.go ================================================ package state import ( "context" "fmt" "log" "os" "testing" "time" gklog "github.com/go-kit/kit/log" flotillaLog "github.com/stitchfix/flotilla-os/log" "database/sql/driver" "reflect" "github.com/jmoiron/sqlx" _ "github.com/lib/pq" "github.com/stitchfix/flotilla-os/config" ) func getDB(conf config.Config) *sqlx.DB { dbURL := conf.GetString("database_url") if dbURL == "" { dbURL = "postgresql://postgres:docker@localhost/postgres?sslmode=disable" } db, err := sqlx.Connect("postgres", dbURL) if err != nil { log.Fatal(err) } return db } func setUp() Manager { conf, _ := config.NewConfig(nil) db := getDB(conf) err := os.Setenv("STATE_MANAGER", "postgres") if err != nil { log.Fatal("error setting env, STATE_MANAGER") } err = os.Setenv("CREATE_DATABASE_SCHEMA", "true") if err != nil { log.Fatal("error setting env, CREATE_DATABASE_SCHEMA") } l := gklog.NewLogfmtLogger(gklog.NewSyncWriter(os.Stderr)) l = gklog.With(l, "ts", gklog.DefaultTimestampUTC) eventSinks := []flotillaLog.EventSink{flotillaLog.NewLocalEventSink()} logger := flotillaLog.NewLogger(l, eventSinks) sm, err := NewStateManager(conf, logger) fmt.Println(err) insertDefinitions(db) return sm } func insertDefinitions(db *sqlx.DB) { defsql := ` INSERT INTO task_def (definition_id, image, group_name, alias, memory, command, env) VALUES ($1, $2, $3, $4, $5, $6, $7) ` portsql := ` INSERT INTO task_def_ports(task_def_id, port) VALUES ($1, $2) ` taskDefTagsSQL := ` INSERT INTO task_def_tags(task_def_id, tag_id) VALUES($1, $2) ` tagSQL := ` INSERT INTO tags(text) VALUES($1) ` taskSQL := ` INSERT INTO task ( run_id, definition_id, cluster_name, alias, image, exit_code, status, started_at, finished_at, instance_id, instance_dns_name, group_name, env, engine, "user", service_account, tier ) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, 'eks', 'foo', 'flotilla', $14 ) ` db.MustExec(defsql, "A", "imageA", "groupZ", "aliasA", 1024, "echo 'hi'", `[{"name":"E_A1","value":"V_A1"}]`) db.MustExec(defsql, "B", "imageB", "groupY", "aliasB", 1024, "echo 'hi'", `[{"name":"E_B1","value":"V_B1"},{"name":"E_B2","value":"V_B2"},{"name":"E_B3","value":"V_B3"}]`) db.MustExec(defsql, "C", "imageC", "groupX", "aliasC", 1024, "echo 'hi'", nil) db.MustExec(defsql, "D", "imageD", "groupW", "aliasD", 1024, "echo 'hi'", nil) db.MustExec(defsql, "E", "imageE", "groupV", "aliasE", 1024, "echo 'hi'", nil) db.MustExec(portsql, "A", 10000) db.MustExec(portsql, "C", 10001) db.MustExec(portsql, "D", 10002) db.MustExec(portsql, "E", 10003) db.MustExec(portsql, "E", 10004) db.MustExec(tagSQL, "tagA") db.MustExec(tagSQL, "tagB") db.MustExec(tagSQL, "tagC") db.MustExec(taskDefTagsSQL, "A", "tagA") db.MustExec(taskDefTagsSQL, "A", "tagC") db.MustExec(taskDefTagsSQL, "B", "tagB") t1, _ := time.Parse(time.RFC3339, "2017-07-04T00:01:00+00:00") t2, _ := time.Parse(time.RFC3339, "2017-07-04T00:02:00+00:00") t3, _ := time.Parse(time.RFC3339, "2017-07-04T00:03:00+00:00") t4, _ := time.Parse(time.RFC3339, "2017-07-04T00:04:00+00:00") 
db.MustExec(taskSQL, "run0", "A", "clusta", "aliasA", "imgA", nil, StatusRunning, t1, nil, "id1", "dns1", "groupZ", `[{"name":"E0","value":"V0"}]`, 4) db.MustExec( taskSQL, "run1", "B", "clusta", "aliasB", "imgB", nil, StatusRunning, t2, nil, "id1", "dns1", "groupY", `[{"name":"E1","value":"V1"}]`, 4) db.MustExec( taskSQL, "run2", "B", "clusta", "aliasB", "imgB", 1, StatusStopped, t2, t3, "id1", "dns1", "groupY", `[{"name":"E2","value":"V2"}]`, 4) db.MustExec(taskSQL, "run3", "C", "clusta", "aliasC", "imgC", nil, StatusQueued, nil, nil, "", "", "groupX", `[{"name":"E3_1","value":"V3_1"},{"name":"E3_2","value":"v3_2"},{"name":"E3_3","value":"V3_3"}]`, 4) db.MustExec(taskSQL, "run4", "C", "clusta", "aliasC", "imgC", 0, StatusStopped, t3, t4, "id1", "dns1", "groupX", nil, 4) db.MustExec(taskSQL, "run5", "D", "clustb", "aliasD", "imgD", nil, StatusPending, nil, nil, "", "", "groupW", nil, 4) } func tearDown() { conf, _ := config.NewConfig(nil) db := getDB(conf) db.MustExec(` DELETE FROM task_def_ports; DELETE FROM task_def_tags; DELETE FROM task_status; DELETE FROM task; DELETE FROM task_def; DELETE FROM tags; `) } func TestSQLStateManager_ListDefinitions(t *testing.T) { defer tearDown() sm := setUp() var err error var dl DefinitionList // Test limiting expectedTotal := 5 dl, err = sm.ListDefinitions(ctx, 1, 0, "alias", "asc", nil, nil) if err != nil { t.Error(err.Error()) } if dl.Total != expectedTotal { t.Errorf("Expected %v total definitions, got %v", expectedTotal, dl.Total) } if len(dl.Definitions) != 1 { t.Errorf("Expected 1 definition returned, got %v", len(dl.Definitions)) } dA := dl.Definitions[0] if dA.DefinitionID != "A" { t.Errorf("Listing returned incorrect definition, expected A but got %s", dA.DefinitionID) } if len(*dA.Env) != 1 { t.Errorf("Expected returned definitions to have correctly attached env vars, was %v", dA.Env) } // Test ordering and offset dl, _ = sm.ListDefinitions(ctx, 1, 1, "group_name", "asc", nil, nil) if dl.Definitions[0].GroupName != "groupW" { t.Errorf("Error ordering with offset - expected groupW but got %s", dl.Definitions[0].GroupName) } // Test order validation dl, err = sm.ListDefinitions(ctx, 1, 0, "nonexistent_field", "asc", nil, nil) if err == nil { t.Errorf("Sorting by [nonexistent_field] did not produce an error") } dl, err = sm.ListDefinitions(ctx, 1, 0, "alias", "nooop", nil, nil) if err == nil { t.Errorf("Sort order [nooop] is not valid but did not produce an error") } // Test filtering on fields dl, _ = sm.ListDefinitions(ctx, 1, 0, "alias", "asc", map[string][]string{"image": {"imageC"}}, nil) if dl.Definitions[0].Image != "imageC" { t.Errorf("Error filtering by field - expected imageC but got %s", dl.Definitions[0].Image) } // Test filtering on environment variables dl, _ = sm.ListDefinitions(ctx, 1, 0, "alias", "desc", nil, map[string]string{"E_B1": "V_B1", "E_B2": "V_B2"}) if dl.Definitions[0].DefinitionID != "B" { t.Errorf( `Expected environment variable filters (E_B1:V_B1 AND E_B2:V_B2) to yield definition B, but was %s`, dl.Definitions[0].DefinitionID) } } func TestSQLStateManager_GetDefinition(t *testing.T) { defer tearDown() sm := setUp() dE, _ := sm.GetDefinition(ctx, "E") if dE.DefinitionID != "E" { t.Errorf("Expected definition E to be fetched, got %s", dE.DefinitionID) } if dE.Env != nil { t.Errorf("Expected empty environment but got %s", *dE.Env) } _, err := sm.GetDefinition(ctx, "Z") if err == nil { t.Errorf("Expected get for non-existent definition Z to return error, was nil") } } func TestSQLStateManager_GetDefinitionByAlias(t 
*testing.T) { defer tearDown() sm := setUp() dE, _ := sm.GetDefinitionByAlias(ctx, "aliasE") if dE.DefinitionID != "E" { t.Errorf("Expected definition E to be fetched, got %s", dE.DefinitionID) } if dE.Env != nil { t.Errorf("Expected empty environment but got %s", *dE.Env) } _, err := sm.GetDefinitionByAlias(ctx, "aliasZ") if err == nil { t.Errorf("Expected get for non-existent definition Z to return error, was nil") } } func TestSQLStateManager_CreateDefinition(t *testing.T) { defer tearDown() sm := setUp() var err error memory := int64(512) d := Definition{ DefinitionID: "id:cupcake", GroupName: "group:cupcake", Alias: "cupcake", Command: "echo 'hi'", ExecutableResources: ExecutableResources{ Memory: &memory, Image: "image:cupcake", Env: &EnvList{ {Name: "E1", Value: "V1"}, }, Ports: &PortsList{12345, 6789}, Tags: &Tags{"apple", "orange", "tiger"}, }, } err = sm.CreateDefinition(ctx, d) if err != nil { t.Error(err.Error()) } f, err := sm.GetDefinition(ctx, "id:cupcake") if err != nil { t.Errorf("Expected create definition to create definition with id [id:cupcake]") t.Error(err) } if f.Alias != d.Alias || len(*f.Env) != len(*d.Env) || *f.Memory != *d.Memory { t.Errorf("Expected created definition to match the one passed in for creation") } } func TestSQLStateManager_UpdateDefinition(t *testing.T) { defer tearDown() sm := setUp() env := EnvList{ {Name: "NEW1", Value: "NEWVAL1"}, {Name: "NEW2", Value: "NEWVAL2"}, } tags := Tags{ "cupcake", } updates := Definition{ ExecutableResources: ExecutableResources{ Tags: &tags, Image: "updated", Env: &env, Ports: &PortsList{}, // <---- empty, set ports to empty list }, } _, err := sm.UpdateDefinition(ctx, "A", updates) if err != nil { t.Error(err.Error()) } d, _ := sm.GetDefinition(ctx, "A") if d.Image != "updated" { t.Errorf("Expected image to be updated to [updated] but is %s", d.Image) } if len(*d.Env) != 2 { t.Errorf("Expected new env to have length 2, was %v", len(*d.Env)) } updatedEnv := *d.Env matches := 0 for i := range updatedEnv { updatedVar := updatedEnv[i] for j := range env { expectedVar := env[j] if updatedVar.Name == expectedVar.Name && updatedVar.Value == expectedVar.Value { matches++ } } } if matches != len(env) { t.Errorf("Not all updated env vars match") } } func TestSQLStateManager_DeleteDefinition(t *testing.T) { defer tearDown() sm := setUp() var err error err = sm.DeleteDefinition(ctx, "A") if err != nil { t.Error(err.Error()) } _, err = sm.GetDefinition(ctx, "A") if err == nil { t.Errorf("Expected querying definition after delete would return error") } } func TestSQLStateManager_ListRuns(t *testing.T) { defer tearDown() sm := setUp() var err error expectedTotal := 6 rl, err := sm.ListRuns(ctx, 1, 0, "started_at", "asc", nil, nil, nil) if err != nil { t.Error(err.Error()) } if rl.Total != expectedTotal { t.Errorf("Expected total to be %v but was %v", expectedTotal, rl.Total) } if len(rl.Runs) != 1 { t.Errorf("Expected limit query to limit to 1 but was %v", len(rl.Runs)) } r0 := rl.Runs[0] if r0.RunID != "run0" { t.Errorf("Listing with order returned incorrect run, expected run0 but got %s", r0.RunID) } if r0.Env == nil { t.Errorf("Expected non-nil env for run") } if len(*r0.Env) != 1 { t.Errorf("Expected returned runs to have correctly attached env vars, was %v", r0.Env) } // Test ordering and offset // - there's only two, so offset 1 should return second one rl, err = sm.ListRuns(ctx, 1, 1, "cluster_name", "desc", nil, nil, nil) if rl.Runs[0].ClusterName != "clusta" { t.Errorf("Error ordering with offset - expected clusta but 
got %s", rl.Runs[0].ClusterName) } // Test order validation rl, err = sm.ListRuns(ctx, 1, 0, "nonexistent_field", "asc", nil, nil, nil) if err == nil { t.Errorf("Sorting by [nonexistent_field] did not produce an error") } rl, err = sm.ListRuns(ctx, 1, 0, "started_at", "nooop", nil, nil, nil) if err == nil { t.Errorf("Sort order [nooop] is not valid but did not produce an error") } // Test filtering on fields rl, err = sm.ListRuns(ctx, 1, 0, "started_at", "asc", map[string][]string{"cluster_name": {"clustb"}}, nil, nil) if rl.Runs[0].ClusterName != "clustb" { t.Errorf("Error filtering by field - expected clustb but got %s", rl.Runs[0].ClusterName) } // Test filtering on environment variables rl, err = sm.ListRuns(ctx, 1, 0, "started_at", "desc", nil, map[string]string{"E2": "V2"}, nil) if err != nil { t.Error(err.Error()) } if rl.Runs[0].RunID != "run2" { t.Errorf( `Expected environment variable filters (E2:V2) to yield run run2, but was %s`, rl.Runs[0].RunID) } } func TestSQLStateManager_ListRuns2(t *testing.T) { defer tearDown() sm := setUp() var err error expectedTotal := 1 expectedRun := "run4" rl, err := sm.ListRuns(ctx, 100, 0, "started_at", "asc", map[string][]string{ "started_at_since": { "2017-07-04T00:02:59+00:00", }, "started_at_until": { "2017-07-04T00:03:01+00:00", }, }, nil, nil) if err != nil { t.Error(err.Error()) } if rl.Total != expectedTotal { t.Errorf("Expected total to be %v but was %v", expectedTotal, rl.Total) } r := rl.Runs[0] if r.RunID != expectedRun { t.Errorf("Got unexpected run: %s", r.RunID) } } func TestSQLStateManager_ListRuns3(t *testing.T) { defer tearDown() sm := setUp() var err error expectedTotal := 2 expectedRuns := map[string]bool{"run3": true, "run5": true} rl, err := sm.ListRuns(ctx, 100, 0, "started_at", "asc", map[string][]string{ "status": { StatusPending, StatusQueued, }, }, nil, nil) if err != nil { t.Error(err.Error()) } if rl.Total != expectedTotal { t.Errorf("Expected total to be %v but was %v", expectedTotal, rl.Total) } for _, r := range rl.Runs { if _, ok := expectedRuns[r.RunID]; !ok { t.Errorf("Got unexpected run: %s", r.RunID) } } } func TestSQLStateManager_GetRun(t *testing.T) { defer tearDown() sm := setUp() r2, _ := sm.GetRun(ctx, "run2") if r2.RunID != "run2" { t.Errorf("Expected run 2 to be fetched, got %s", r2.RunID) } if len(*r2.Env) != 1 { t.Errorf("Expected environment to have exactly one entry, but was %v", len(*r2.Env)) } _, err := sm.GetRun(ctx, "run100") if err == nil { t.Errorf("Expected get for non-existent run100 to return error, was nil") } } func TestSQLStateManager_CreateRun(t *testing.T) { defer tearDown() sm := setUp() r1 := Run{ RunID: "run:17", GroupName: "group:cupcake", Alias: "cute", Image: "someImage", DefinitionID: "A", ClusterName: "clusta", Status: StatusQueued, Env: &EnvList{ {Name: "RUN_PARAM", Value: "VAL"}, }, Engine: &DefaultEngine, Tier: Tier("4"), } ec := int64(137) reason := "instance is ded." 
cmd := "_test cmd__" mem := int64(10) t1, _ := time.Parse(time.RFC3339, "2017-07-04T00:01:00+00:00") t2, _ := time.Parse(time.RFC3339, "2017-07-04T00:02:00+00:00") t1 = t1.UTC() t2 = t2.UTC() r2 := Run{ RunID: "run:18", GroupName: "group:cupcake", DefinitionID: "A", Alias: "AliasA", Image: "ImageA", ExitCode: &ec, ExitReason: &reason, StartedAt: &t1, FinishedAt: &t2, ClusterName: "clusta", Status: StatusStopped, Env: &EnvList{ {Name: "RUN_PARAM", Value: "VAL"}, }, Command: &cmd, Memory: &mem, Engine: &DefaultEngine, Tier: Tier("4"), } sm.CreateRun(ctx, r1) sm.CreateRun(ctx, r2) f1, _ := sm.GetRun(ctx, "run:17") f2, _ := sm.GetRun(ctx, "run:18") if f1.RunID != "run:17" { t.Errorf("Expected to fetch inserted run:17, but got %s", f1.RunID) } // Check null handling if f1.ExitCode != nil || f1.StartedAt != nil || f1.FinishedAt != nil { t.Errorf("Expected run:17 to have null exit code, started_at, and finished_at") } if f2.ExitCode == nil || f2.StartedAt == nil || f2.FinishedAt == nil { t.Errorf("Expected run:18 to have non null exit code, started_at, and finished_at") } if *f2.ExitCode != *r2.ExitCode { t.Errorf("Expected exit code %v but was %v", *r2.ExitCode, *f2.ExitCode) } if *f2.ExitReason != *r2.ExitReason { t.Errorf("Expected exit reason %s but was %s", *r2.ExitReason, *f2.ExitReason) } if (*f2.StartedAt).UTC().String() != (*r2.StartedAt).String() { t.Errorf("Expected started_at %s but was %s", *r2.StartedAt, *f2.StartedAt) } if (*f2.FinishedAt).UTC().String() != (*r2.FinishedAt).String() { t.Errorf("Expected finished_at %s but was %s", *r2.FinishedAt, *f2.FinishedAt) } if f2.Alias != r2.Alias { t.Errorf("Expected alias: [%s] but was [%s]", r2.Alias, f2.Alias) } if f2.Image != r2.Image { t.Errorf("Expected image: [%s] but was [%s]", r2.Image, f2.Image) } if f1.Command != nil { t.Errorf("Expected null command, but was [%s]", *f1.Command) } if f1.Memory != nil { t.Errorf("Expected null mem, but was [%d]", *f1.Memory) } if f2.Command == nil { t.Errorf("Expected non-null command, but was null") } if f2.Memory == nil { t.Errorf("Expected non-null memory, but was null") } if f2.Command != nil && *f2.Command != cmd { t.Errorf("Expected command [%s], but got [%s]", cmd, *f2.Command) } if f2.Memory != nil && *f2.Memory != mem { t.Errorf("Expected mem [%d], but got [%d]", mem, *f2.Memory) } } func TestSQLStateManager_UpdateRun(t *testing.T) { defer tearDown() sm := setUp() ec := int64(1) env := EnvList{ {Name: "NEW1", Value: "NEWVAL1"}, {Name: "NEW2", Value: "NEWVAL2"}, } t1, _ := time.Parse(time.RFC3339, "2017-07-04T00:01:00+00:00") t2, _ := time.Parse(time.RFC3339, "2017-07-04T00:02:00+00:00") t1 = t1.UTC() t2 = t2.UTC() u := Run{ Alias: "alien", Image: "imagine", ExitCode: &ec, Status: StatusStopped, StartedAt: &t1, FinishedAt: &t2, Env: &env, Tier: Tier("4"), } u2 := Run{ Status: StatusNeedsRetry, } _, e := sm.UpdateRun(ctx, "run3", u) if e != nil { t.Errorf("Error while updating %v", e) } r, e := sm.GetRun(ctx, "run3") if e != nil { t.Errorf("Error in GetRun %v", e) } if *r.ExitCode != ec { t.Errorf("Expected update to set exit code to %v but was %v", ec, *r.ExitCode) } if (*r.StartedAt).UTC().String() != t1.String() { t.Errorf("Expected update to started_at to %s but was %s", t1, *r.StartedAt) } if (*r.FinishedAt).UTC().String() != t2.String() { t.Errorf("Expected update to set finished_at to %s but was %s", t1, *r.FinishedAt) } if r.Status != u.Status { t.Errorf("Expected update to set status to %s but was %s", u.Status, r.Status) } if r.Alias != u.Alias { t.Errorf("Expected update to set 
alias: [%s] but was [%s]", u.Alias, r.Alias) } if r.Image != u.Image { t.Errorf("Expected update to set image: [%s] but was [%s]", u.Image, r.Image) } updatedEnv := *r.Env matches := 0 for i := range updatedEnv { updatedVar := updatedEnv[i] for j := range env { expectedVar := env[j] if updatedVar.Name == expectedVar.Name && updatedVar.Value == expectedVar.Value { matches++ } } } if matches != len(env) { t.Errorf("Not all updated env vars match") } sm.UpdateRun(ctx, "run3", u2) r, _ = sm.GetRun(ctx, "run3") if r.Status != u2.Status { t.Errorf("Expected to update status to %s but was %s", u2.Status, r.Status) } } func TestSQLStateManager_UpdateWorker(t *testing.T) { defer tearDown() sm := setUp() // First, list workers to find an existing worker type created during init workers, err := sm.ListWorkers(ctx, DefaultEngine) if err != nil { t.Fatalf("Error listing workers: %v", err) } if len(workers.Workers) == 0 { t.Fatal("Expected at least one worker to exist after setUp") } originalWorker := workers.Workers[0] // Update the worker's count to call row.Scan in UpdateWorker, // which previously only scanned 2 of 3 columns (missing Engine), causing: // "sql: expected 2 destination arguments in Scan, not 3" newCount := originalWorker.CountPerInstance + 5 updates := Worker{ CountPerInstance: newCount, } updated, err := sm.UpdateWorker(ctx, originalWorker.WorkerType, updates) if err != nil { t.Fatalf("UpdateWorker failed: %v", err) } if updated.CountPerInstance != newCount { t.Errorf("Expected CountPerInstance to be %d, got %d", newCount, updated.CountPerInstance) } if updated.Engine != DefaultEngine { t.Errorf("Expected Engine to be %s, got %s", DefaultEngine, updated.Engine) } // Verify via GetWorker that the update persisted fetched, err := sm.GetWorker(ctx, originalWorker.WorkerType, DefaultEngine) if err != nil { t.Fatalf("GetWorker failed: %v", err) } if fetched.CountPerInstance != newCount { t.Errorf("Expected persisted CountPerInstance to be %d, got %d", newCount, fetched.CountPerInstance) } } func TestSQLStateManager_ListClusterStates(t *testing.T) { defer tearDown() sm := setUp() // Simple test to ensure the method exists and returns without error _, err := sm.ListClusterStates(ctx) if err != nil { t.Errorf("Error listing cluster states: %v", err) } } func TestStringArray_Scan(t *testing.T) { tests := []struct { name string input interface{} expected Tiers wantErr bool }{ { name: "nil input", input: nil, expected: Tiers{}, wantErr: false, }, { name: "empty array", input: []byte("{}"), expected: Tiers{}, wantErr: false, }, { name: "single value", input: []byte("{\"tier1\"}"), expected: Tiers{"tier1"}, wantErr: false, }, { name: "multiple values", input: []byte("{\"tier1\",\"tier2\",\"tier3\"}"), expected: Tiers{"tier1", "tier2", "tier3"}, wantErr: false, }, { name: "values with empty elements", input: []byte("{\"tier1\",,\"tier3\"}"), expected: Tiers{"tier1", "tier3"}, wantErr: false, }, { name: "unquoted values", input: []byte("{tier1,tier2,tier3}"), expected: Tiers{"tier1", "tier2", "tier3"}, wantErr: false, }, { name: "unsupported type", input: 123, expected: nil, wantErr: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { var result Tiers err := result.Scan(tt.input) if (err != nil) != tt.wantErr { t.Errorf("StringArray.Scan() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(result, tt.expected) { t.Errorf("StringArray.Scan() = %v, want %v", result, tt.expected) } }) } } func TestStringArray_Value(t *testing.T) { tests := []struct { name 
string array Tiers expected driver.Value wantErr bool }{ { name: "empty slice", array: Tiers{}, expected: "{}", wantErr: false, }, { name: "single value", array: Tiers{"tier1"}, expected: "{\"tier1\"}", wantErr: false, }, { name: "multiple values", array: Tiers{"tier1", "tier2", "tier3"}, expected: "{\"tier1\",\"tier2\",\"tier3\"}", wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, err := tt.array.Value() if (err != nil) != tt.wantErr { t.Errorf("StringArray.Value() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(got, tt.expected) { t.Errorf("StringArray.Value() = %v, want %v", got, tt.expected) } }) } } // This test verifies that a value that's converted to a database format // can be correctly scanned back into the original structure func TestStringArray_RoundTrip(t *testing.T) { tests := []struct { name string array Tiers }{ { name: "empty array", array: Tiers{}, }, { name: "single value", array: Tiers{"tier1"}, }, { name: "multiple values", array: Tiers{"tier1", "tier2", "tier3"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { dbValue, err := tt.array.Value() if err != nil { t.Fatalf("Failed to convert to DB value: %v", err) } stringValue, ok := dbValue.(string) if !ok { t.Fatalf("Expected dbValue to be a string, got %T", dbValue) } byteValue := []byte(stringValue) var result Tiers err = result.Scan(byteValue) if err != nil { t.Fatalf("Failed to scan from DB value: %v", err) } if !reflect.DeepEqual(result, tt.array) { t.Errorf("Round trip failed: got %v, want %v", result, tt.array) } }) } } func TestCapabilities_Scan(t *testing.T) { tests := []struct { name string input interface{} expected Capabilities wantErr bool }{ { name: "nil input", input: nil, expected: Capabilities{}, wantErr: false, }, { name: "empty array", input: []byte("{}"), expected: Capabilities{}, wantErr: false, }, { name: "single value", input: []byte("{spark}"), expected: Capabilities{"spark"}, wantErr: false, }, { name: "multiple values", input: []byte("{spark,ray,gpu}"), expected: Capabilities{"spark", "ray", "gpu"}, wantErr: false, }, { name: "values with empty elements", input: []byte("{spark,gpu}"), expected: Capabilities{"spark", "gpu"}, wantErr: false, }, { name: "unsupported type", input: 123, expected: nil, wantErr: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { var result Capabilities err := result.Scan(tt.input) if (err != nil) != tt.wantErr { t.Errorf("Capabilities.Scan() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(result, tt.expected) { t.Errorf("Capabilities.Scan() = %v, want %v", result, tt.expected) } }) } } func TestCapabilities_Value(t *testing.T) { tests := []struct { name string capabilities Capabilities expected driver.Value wantErr bool }{ { name: "empty slice", capabilities: Capabilities{}, expected: "{}", wantErr: false, }, { name: "single value", capabilities: Capabilities{"gpu"}, expected: "{gpu}", wantErr: false, }, { name: "multiple values", capabilities: Capabilities{"gpu", "cpu", "memory"}, expected: "{gpu,cpu,memory}", wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, err := tt.capabilities.Value() if (err != nil) != tt.wantErr { t.Errorf("Capabilities.Value() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(got, tt.expected) { t.Errorf("Capabilities.Value() = %v, want %v", got, tt.expected) } }) } } func TestCapabilities_RoundTrip(t *testing.T) { tests := []struct { name string 
capabilities Capabilities }{ { name: "empty capabilities", capabilities: Capabilities{}, }, { name: "single capability", capabilities: Capabilities{"gpu"}, }, { name: "multiple capabilities", capabilities: Capabilities{"gpu", "spark", "ray"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Convert to database value dbValue, err := tt.capabilities.Value() if err != nil { t.Fatalf("Failed to convert to DB value: %v", err) } // Convert the string to []byte since that's what // would happen in a real database call stringValue, ok := dbValue.(string) if !ok { t.Fatalf("Expected dbValue to be a string, got %T", dbValue) } byteValue := []byte(stringValue) // Convert database value back to Capabilities var result Capabilities err = result.Scan(byteValue) if err != nil { t.Fatalf("Failed to scan from DB value: %v", err) } // Check that we got back what we started with if !reflect.DeepEqual(result, tt.capabilities) { t.Errorf("Round trip failed: got %v, want %v", result, tt.capabilities) } }) } } func tearDownClusters() { conf, _ := config.NewConfig(nil) db := getDB(conf) db.MustExec(`DELETE FROM cluster_state;`) } var ctx = context.Background() func TestSQLStateManager_UpdateClusterMetadata(t *testing.T) { defer tearDownClusters() sm := setUp() initialCluster := ClusterMetadata{ Name: "test-cluster", Status: StatusActive, StatusReason: "Initial setup", AllowedTiers: Tiers{"1", "2"}, Capabilities: Capabilities{"gpu", "spark"}, Namespace: "flotilla", Region: "us-east-1", EMRVirtualCluster: "11111111", SparkServerURI: "spark://spark-server:7077", } err := sm.UpdateClusterMetadata(ctx, initialCluster) if err != nil { t.Fatalf("Error creating initial cluster: %v", err) } clusters, err := sm.ListClusterStates(ctx) if err != nil { t.Fatalf("Error listing clusters: %v", err) } var clusterID string for _, c := range clusters { if c.Name == "test-cluster" { clusterID = c.ID break } } if clusterID == "" { t.Fatalf("Test cluster not found after insertion") } updatedCluster := ClusterMetadata{ ID: clusterID, Name: "test-cluster", Status: StatusMaintenance, StatusReason: "Under maintenance", AllowedTiers: Tiers{"1", "2"}, Capabilities: Capabilities{"gpu", "spark", "ray"}, Namespace: "flotilla-test", Region: "us-east-1", EMRVirtualCluster: "test-emr-cluster", SparkServerURI: "spark://spark-server:7077", } err = sm.UpdateClusterMetadata(ctx, updatedCluster) if err != nil { t.Fatalf("Error updating cluster: %v", err) } updatedFromDB, err := sm.GetClusterByID(ctx, clusterID) if err != nil { t.Fatalf("Error getting updated cluster: %v", err) } if updatedFromDB.Status != StatusMaintenance { t.Errorf("Expected status %s, got %s", StatusMaintenance, updatedFromDB.Status) } if updatedFromDB.StatusReason != "Under maintenance" { t.Errorf("Expected reason 'Under maintenance', got '%s'", updatedFromDB.StatusReason) } } func TestSQLStateManager_DeleteClusterMetadata(t *testing.T) { tearDown() sm := setUp() initialCluster := ClusterMetadata{ Name: "test-delete-cluster", Status: StatusActive, StatusReason: "For deletion test", AllowedTiers: Tiers{"1", "2"}, Capabilities: Capabilities{"gpu", "spark"}, Namespace: "flotilla", Region: "us-east-1", EMRVirtualCluster: "11111111", SparkServerURI: "spark://spark-server:7077", } err := sm.UpdateClusterMetadata(ctx, initialCluster) if err != nil { t.Fatalf("Error creating initial cluster: %v", err) } clusters, err := sm.ListClusterStates(ctx) if err != nil { t.Fatalf("Error listing clusters: %v", err) } var clusterID string for _, c := range clusters { if c.Name 
== "test-delete-cluster" { clusterID = c.ID break } } if clusterID == "" { t.Fatalf("Test cluster not found after insertion") } err = sm.DeleteClusterMetadata(ctx, clusterID) if err != nil { t.Fatalf("Error deleting cluster: %v", err) } _, err = sm.GetClusterByID(ctx, clusterID) if err == nil { t.Errorf("Expected error when getting deleted cluster") } tearDown() } ================================================ FILE: testutils/mocks.go ================================================ package testutils import ( "context" "fmt" "math" "net/http" "testing" "github.com/aws/aws-sdk-go/aws" "github.com/stitchfix/flotilla-os/config" "github.com/stitchfix/flotilla-os/execution/engine" "github.com/stitchfix/flotilla-os/queue" "github.com/stitchfix/flotilla-os/state" ) // ImplementsAllTheThings defines a struct which implements many of the interfaces // to facilitate easier testing type ImplementsAllTheThings struct { T *testing.T Calls []string // Collects calls Definitions map[string]state.Definition // Definitions stored in "state" Runs map[string]state.Run // Runs stored in "state" Workers []state.Worker // Workers stored in "state" Qurls map[string]string // Urls returned by Queue Manager Defined []string // List of defined definitions (Execution Engine) Queued []string // List of queued runs (Queue Manager) StatusUpdates []string // List of queued status updates (Queue Manager) StatusUpdatesAsRuns []state.Run // List of queued status updates (Execution Engine) ExecuteError error // Execution Engine - error to return ExecuteErrorIsRetryable bool // Execution Engine - is the run retryable? Groups []string Tags []string Templates map[string]state.Template ClusterStates []state.ClusterMetadata GetRandomClusterName func(clusters []string) string } func (iatt *ImplementsAllTheThings) GetResources(ctx context.Context, runID string) (state.Run, error) { iatt.Calls = append(iatt.Calls, "GetResources") run, exists := iatt.Runs[runID] if !exists { return state.Run{}, fmt.Errorf("Run with id %s not found", runID) } return run, nil } func (iatt *ImplementsAllTheThings) ListClusters() ([]state.ClusterMetadata, error) { iatt.Calls = append(iatt.Calls, "ListClusters") return iatt.ClusterStates, nil } func (i *ImplementsAllTheThings) ListClusterStates(ctx context.Context) ([]state.ClusterMetadata, error) { i.Calls = append(i.Calls, "ListClusterStates") fmt.Printf("ListClusterStates called, returning %d clusters\n", len(i.ClusterStates)) return i.ClusterStates, nil } func (i *ImplementsAllTheThings) GetClusterByID(ctx context.Context, clusterID string) (state.ClusterMetadata, error) { i.Calls = append(i.Calls, "GetClusterByID") return i.ClusterStates[0], nil } func (i *ImplementsAllTheThings) DeleteClusterMetadata(ctx context.Context, clusterName string) error { i.Calls = append(i.Calls, "DeleteClusterMetadata") return nil } func (i *ImplementsAllTheThings) UpdateClusterMetadata(ctx context.Context, cluster state.ClusterMetadata) error { i.Calls = append(i.Calls, "UpdateClusterMetadata") return nil } func (iatt *ImplementsAllTheThings) LogsText(executable state.Executable, run state.Run, w http.ResponseWriter) error { iatt.Calls = append(iatt.Calls, "LogsText") return nil } func (iatt *ImplementsAllTheThings) Log(keyvals ...interface{}) error { iatt.Calls = append(iatt.Calls, "Name") return nil } func (iatt *ImplementsAllTheThings) Event(keyvals ...interface{}) error { iatt.Calls = append(iatt.Calls, "Name") return nil } // Name - general func (iatt *ImplementsAllTheThings) Name() string { iatt.Calls = 
append(iatt.Calls, "Name") return "implementer" } // Initialize - general func (iatt *ImplementsAllTheThings) Initialize(conf config.Config) error { iatt.Calls = append(iatt.Calls, "Initialize") return nil } // Cleanup - general func (iatt *ImplementsAllTheThings) Cleanup() error { iatt.Calls = append(iatt.Calls, "Cleanup") return nil } func (iatt *ImplementsAllTheThings) ListFailingNodes(ctx context.Context) (state.NodeList, error) { var nodeList state.NodeList iatt.Calls = append(iatt.Calls, "ListFailingNodes") return nodeList, nil } func (iatt *ImplementsAllTheThings) GetPodReAttemptRate(ctx context.Context) (float32, error) { iatt.Calls = append(iatt.Calls, "GetPodReAttemptRate") return 1.0, nil } func (iatt *ImplementsAllTheThings) GetNodeLifecycle(ctx context.Context, executableID string, commandHash string) (string, error) { iatt.Calls = append(iatt.Calls, "GetNodeLifecycle") return "spot", nil } func (iatt *ImplementsAllTheThings) GetTaskHistoricalRuntime(ctx context.Context, executableID string, runId string) (float32, error) { iatt.Calls = append(iatt.Calls, "GetTaskHistoricalRuntime") return 1.0, nil } // ListDefinitions - StateManager func (iatt *ImplementsAllTheThings) ListDefinitions( ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string) (state.DefinitionList, error) { iatt.Calls = append(iatt.Calls, "ListDefinitions") dl := state.DefinitionList{Total: len(iatt.Definitions)} for _, d := range iatt.Definitions { dl.Definitions = append(dl.Definitions, d) } return dl, nil } // GetDefinition - StateManager func (iatt *ImplementsAllTheThings) GetDefinition(ctx context.Context, definitionID string) (state.Definition, error) { iatt.Calls = append(iatt.Calls, "GetDefinition") var err error d, ok := iatt.Definitions[definitionID] if !ok { err = fmt.Errorf("No definition %s", definitionID) } return d, err } // GetDefinitionByAlias - StateManager func (iatt *ImplementsAllTheThings) GetDefinitionByAlias(ctx context.Context, alias string) (state.Definition, error) { iatt.Calls = append(iatt.Calls, "GetDefinitionByAlias") for _, d := range iatt.Definitions { if d.Alias == alias { return d, nil } } return state.Definition{}, fmt.Errorf("No definition with alias %s", alias) } // UpdateDefinition - StateManager func (iatt *ImplementsAllTheThings) UpdateDefinition(ctx context.Context, definitionID string, updates state.Definition) (state.Definition, error) { iatt.Calls = append(iatt.Calls, "UpdateDefinition") defn := iatt.Definitions[definitionID] defn.UpdateWith(updates) iatt.Definitions[definitionID] = defn return defn, nil } // CreateDefinition - StateManager func (iatt *ImplementsAllTheThings) CreateDefinition(ctx context.Context, d state.Definition) error { iatt.Calls = append(iatt.Calls, "CreateDefinition") iatt.Definitions[d.DefinitionID] = d return nil } // DeleteDefinition - StateManager func (iatt *ImplementsAllTheThings) DeleteDefinition(ctx context.Context, definitionID string) error { iatt.Calls = append(iatt.Calls, "DeleteDefinition") delete(iatt.Definitions, definitionID) return nil } // ListRuns - StateManager func (iatt *ImplementsAllTheThings) ListRuns(ctx context.Context, limit int, offset int, sortBy string, order string, filters map[string][]string, envFilters map[string]string, engines []string) (state.RunList, error) { iatt.Calls = append(iatt.Calls, "ListRuns") rl := state.RunList{Total: len(iatt.Runs)} for _, r := range iatt.Runs { rl.Runs = append(rl.Runs, r) } return rl, nil } // GetRun - 
StateManager func (iatt *ImplementsAllTheThings) GetRun(ctx context.Context, runID string) (state.Run, error) { iatt.Calls = append(iatt.Calls, "GetRun") var err error r, ok := iatt.Runs[runID] if !ok { err = fmt.Errorf("No run %s", runID) } return r, err } func (iatt *ImplementsAllTheThings) GetRunByEMRJobId(ctx context.Context, emrJobId string) (state.Run, error) { iatt.Calls = append(iatt.Calls, "GetRunByEMRJobId") var err error r, ok := iatt.Runs[emrJobId] if !ok { err = fmt.Errorf("No run %s", emrJobId) } return r, err } // CreateRun - StateManager func (iatt *ImplementsAllTheThings) CreateRun(ctx context.Context, r state.Run) error { iatt.Calls = append(iatt.Calls, "CreateRun") iatt.Runs[r.RunID] = r return nil } func (iatt *ImplementsAllTheThings) EstimateRunResources(ctx context.Context, executableID string, command string) (state.TaskResources, error) { iatt.Calls = append(iatt.Calls, "EstimateRunResources") return state.TaskResources{}, nil } func (iatt *ImplementsAllTheThings) EstimateExecutorCount(ctx context.Context, executableID string, commandHash string) (int64, error) { iatt.Calls = append(iatt.Calls, "EstimateExecutorCount") return 0, nil } func (iatt *ImplementsAllTheThings) ExecutorOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { iatt.Calls = append(iatt.Calls, "ExecutorOOM") return false, nil } func (iatt *ImplementsAllTheThings) DriverOOM(ctx context.Context, executableID string, commandHash string) (bool, error) { iatt.Calls = append(iatt.Calls, "DriverOOM") return false, nil } // UpdateRun - StateManager func (iatt *ImplementsAllTheThings) UpdateRun(ctx context.Context, runID string, updates state.Run) (state.Run, error) { iatt.Calls = append(iatt.Calls, "UpdateRun") run := iatt.Runs[runID] run.UpdateWith(updates) iatt.Runs[runID] = run return run, nil } // ListGroups - StateManager func (iatt *ImplementsAllTheThings) ListGroups(ctx context.Context, limit int, offset int, name *string) (state.GroupsList, error) { iatt.Calls = append(iatt.Calls, "ListGroups") return state.GroupsList{Total: len(iatt.Groups), Groups: iatt.Groups}, nil } // ListTags - StateManager func (iatt *ImplementsAllTheThings) ListTags(ctx context.Context, limit int, offset int, name *string) (state.TagsList, error) { iatt.Calls = append(iatt.Calls, "ListTags") return state.TagsList{Total: len(iatt.Tags), Tags: iatt.Tags}, nil } // initWorkerTable - StateManager func (iatt *ImplementsAllTheThings) initWorkerTable(c config.Config) error { iatt.Calls = append(iatt.Calls, "initWorkerTable") return nil } // ListWorkers - StateManager func (iatt *ImplementsAllTheThings) ListWorkers(ctx context.Context, engine string) (state.WorkersList, error) { iatt.Calls = append(iatt.Calls, "ListWorkers") return state.WorkersList{Total: len(iatt.Workers), Workers: iatt.Workers}, nil } func (iatt *ImplementsAllTheThings) CheckIdempotenceKey(ctx context.Context, idempotenceKey string) (string, error) { iatt.Calls = append(iatt.Calls, "CheckIdempotenceKey") return "42", nil } // GetWorker - StateManager func (iatt *ImplementsAllTheThings) GetWorker(ctx context.Context, workerType string, engine string) (state.Worker, error) { iatt.Calls = append(iatt.Calls, "GetWorker") return state.Worker{WorkerType: workerType, CountPerInstance: 2}, nil } // UpdateWorker - StateManager func (iatt *ImplementsAllTheThings) UpdateWorker(ctx context.Context, workerType string, updates state.Worker) (state.Worker, error) { iatt.Calls = append(iatt.Calls, "UpdateWorker") return state.Worker{WorkerType: 
workerType, CountPerInstance: updates.CountPerInstance}, nil } // BatchUpdateWorkers- StateManager func (iatt *ImplementsAllTheThings) BatchUpdateWorkers(ctx context.Context, updates []state.Worker) (state.WorkersList, error) { iatt.Calls = append(iatt.Calls, "BatchUpdateWorkers") return state.WorkersList{Total: len(iatt.Workers), Workers: iatt.Workers}, nil } // QurlFor - QueueManager func (iatt *ImplementsAllTheThings) QurlFor(name string, prefixed bool) (string, error) { iatt.Calls = append(iatt.Calls, "QurlFor") qurl, _ := iatt.Qurls[name] return qurl, nil } func (iatt *ImplementsAllTheThings) Enqueue(ctx context.Context, run state.Run) error { iatt.Calls = append(iatt.Calls, "Enqueue") iatt.Queued = append(iatt.Queued, run.RunID) return nil } // ReceiveRun - QueueManager func (iatt *ImplementsAllTheThings) ReceiveRun(qURL string) (queue.RunReceipt, error) { iatt.Calls = append(iatt.Calls, "ReceiveRun") if len(iatt.Queued) == 0 { return queue.RunReceipt{}, nil } popped := iatt.Queued[0] iatt.Queued = iatt.Queued[1:] receipt := queue.RunReceipt{ Run: &state.Run{RunID: popped}, } receipt.Done = func() error { iatt.Calls = append(iatt.Calls, "RunReceipt.Done") return nil } return receipt, nil } // ReceiveStatus - QueueManager func (iatt *ImplementsAllTheThings) ReceiveStatus(qURL string) (queue.StatusReceipt, error) { iatt.Calls = append(iatt.Calls, "ReceiveStatus") if len(iatt.StatusUpdates) == 0 { return queue.StatusReceipt{}, nil } popped := iatt.StatusUpdates[0] iatt.StatusUpdates = iatt.StatusUpdates[1:] receipt := queue.StatusReceipt{ StatusUpdate: &popped, } receipt.Done = func() error { iatt.Calls = append(iatt.Calls, "RunReceipt.Done") return nil } return receipt, nil } // List - QueueManager func (iatt *ImplementsAllTheThings) List() ([]string, error) { iatt.Calls = append(iatt.Calls, "List") res := make([]string, len(iatt.Qurls)) i := 0 for _, qurl := range iatt.Qurls { res[i] = qurl i++ } return res, nil } func (iatt *ImplementsAllTheThings) GetEvents(ctx context.Context, run state.Run) (state.PodEventList, error) { iatt.Calls = append(iatt.Calls, "GetEvents") return state.PodEventList{ Total: 0, PodEvents: nil, }, nil } func (iatt *ImplementsAllTheThings) FetchUpdateStatus(ctx context.Context, run state.Run) (state.Run, error) { iatt.Calls = append(iatt.Calls, "FetchUpdateStatus") return run, nil } func (iatt *ImplementsAllTheThings) FetchPodMetrics(ctx context.Context, run state.Run) (state.Run, error) { iatt.Calls = append(iatt.Calls, "FetchPodMetrics") return run, nil } // CanBeRun - Cluster Client func (iatt *ImplementsAllTheThings) CanBeRun(clusterName string, executableResources state.ExecutableResources) (bool, error) { iatt.Calls = append(iatt.Calls, "CanBeRun") if clusterName == "invalidcluster" { return false, nil } return true, nil } // IsImageValid - Registry Client func (iatt *ImplementsAllTheThings) IsImageValid(imageRef string) (bool, error) { iatt.Calls = append(iatt.Calls, "IsImageValid") if imageRef == "invalidimage" { return false, nil } return true, nil } func (iatt *ImplementsAllTheThings) PollRunStatus(ctx context.Context) (state.Run, error) { iatt.Calls = append(iatt.Calls, "PollRunStatus") return state.Run{}, nil } // PollRuns - Execution Engine func (iatt *ImplementsAllTheThings) PollRuns(ctx context.Context) ([]engine.RunReceipt, error) { iatt.Calls = append(iatt.Calls, "PollRuns") var r []engine.RunReceipt if len(iatt.Queued) == 0 { return r, nil } popped := iatt.Queued[0] iatt.Queued = iatt.Queued[1:] receipt := queue.RunReceipt{ Run: 
&state.Run{RunID: popped}, } receipt.Done = func() error { iatt.Calls = append(iatt.Calls, "RunReceipt.Done") return nil } r = append(r, engine.RunReceipt{receipt, 1111, 1111111, 1}) return r, nil } // PollStatus - Execution Engine func (iatt *ImplementsAllTheThings) PollStatus(ctx context.Context) (engine.RunReceipt, error) { iatt.Calls = append(iatt.Calls, "PollStatus") if len(iatt.StatusUpdatesAsRuns) == 0 { return engine.RunReceipt{}, nil } popped := iatt.StatusUpdatesAsRuns[0] iatt.StatusUpdatesAsRuns = iatt.StatusUpdatesAsRuns[1:] receipt := queue.RunReceipt{ Run: &popped, } receipt.Done = func() error { iatt.Calls = append(iatt.Calls, "StatusReceipt.Done") return nil } return engine.RunReceipt{receipt, 1111, 1111111, 1}, nil } // Execute - Execution Engine func (iatt *ImplementsAllTheThings) Execute(ctx context.Context, executable state.Executable, run state.Run, manager state.Manager) (state.Run, bool, error) { iatt.Calls = append(iatt.Calls, "Execute") return state.Run{}, iatt.ExecuteErrorIsRetryable, iatt.ExecuteError } // Terminate - Execution Engine func (iatt *ImplementsAllTheThings) Terminate(ctx context.Context, run state.Run) error { iatt.Calls = append(iatt.Calls, "Terminate") return nil } // Define - Execution Engine func (iatt *ImplementsAllTheThings) Define(ctx context.Context, definition state.Definition) (state.Definition, error) { iatt.Calls = append(iatt.Calls, "Define") iatt.Defined = append(iatt.Defined, definition.DefinitionID) return definition, nil } // Deregister - Execution Engine func (iatt *ImplementsAllTheThings) Deregister(ctx context.Context, definition state.Definition) error { iatt.Calls = append(iatt.Calls, "Deregister") return nil } // Logs - Logs Client func (iatt *ImplementsAllTheThings) Logs(executable state.Executable, run state.Run, lastSeen *string, role *string, facility *string) (string, *string, error) { iatt.Calls = append(iatt.Calls, "Logs") return "", aws.String(""), nil } // GetExecutableByTypeAndID - StateManager func (iatt *ImplementsAllTheThings) GetExecutableByTypeAndID(ctx context.Context, t state.ExecutableType, id string) (state.Executable, error) { iatt.Calls = append(iatt.Calls, "GetExecutableByTypeAndID") switch t { case state.ExecutableTypeDefinition: return iatt.GetDefinition(ctx, id) case state.ExecutableTypeTemplate: return iatt.GetTemplateByID(ctx, id) default: return nil, fmt.Errorf("Invalid executable type %s", t) } } // ListTemplates - StateManager func (iatt *ImplementsAllTheThings) ListTemplates(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) { iatt.Calls = append(iatt.Calls, "ListTemplates") tl := state.TemplateList{Total: len(iatt.Templates)} for _, t := range iatt.Templates { tl.Templates = append(tl.Templates, t) } return tl, nil } // ListTemplatesLatestOnly - StateManager func (iatt *ImplementsAllTheThings) ListTemplatesLatestOnly(ctx context.Context, limit int, offset int, sortBy string, order string) (state.TemplateList, error) { // TODO: this is not actually implemented correctly - but also we're never // using it. 
	iatt.Calls = append(iatt.Calls, "ListTemplatesLatestOnly")
	tl := state.TemplateList{Total: len(iatt.Templates)}
	for _, t := range iatt.Templates {
		tl.Templates = append(tl.Templates, t)
	}
	return tl, nil
}

func (iatt *ImplementsAllTheThings) GetTemplateByVersion(ctx context.Context, templateName string, templateVersion int64) (bool, state.Template, error) {
	iatt.Calls = append(iatt.Calls, "GetTemplateByVersion")
	// Iterate over templates to find the matching version; the match is copied
	// by value (not a pointer to the loop variable) and a zero Template is
	// returned when nothing matches.
	var tpl state.Template
	var found bool
	for _, t := range iatt.Templates {
		if t.TemplateName == templateName && t.Version == templateVersion {
			tpl = t
			found = true
			break
		}
	}
	if !found {
		return false, state.Template{}, fmt.Errorf("No template with name: %s", templateName)
	}
	return true, tpl, nil
}

// GetTemplateByID - StateManager
func (iatt *ImplementsAllTheThings) GetTemplateByID(ctx context.Context, id string) (state.Template, error) {
	iatt.Calls = append(iatt.Calls, "GetTemplateByID")
	var err error
	t, ok := iatt.Templates[id]
	if !ok {
		err = fmt.Errorf("No template %s", id)
	}
	return t, err
}

// GetLatestTemplateByTemplateName - StateManager
func (iatt *ImplementsAllTheThings) GetLatestTemplateByTemplateName(ctx context.Context, templateName string) (bool, state.Template, error) {
	iatt.Calls = append(iatt.Calls, "GetLatestTemplateByTemplateName")
	var tpl state.Template
	var found bool
	var maxVersion int64 = math.MinInt64
	// Iterate over templates to find the max version.
	for _, t := range iatt.Templates {
		if t.TemplateName == templateName && t.Version > maxVersion {
			tpl = t
			maxVersion = t.Version
			found = true
		}
	}
	if !found {
		return false, state.Template{}, fmt.Errorf("No template with name: %s", templateName)
	}
	return true, tpl, nil
}

// CreateTemplate - StateManager
func (iatt *ImplementsAllTheThings) CreateTemplate(ctx context.Context, t state.Template) error {
	iatt.Calls = append(iatt.Calls, "CreateTemplate")
	iatt.Templates[t.TemplateID] = t
	return nil
}

func (iatt *ImplementsAllTheThings) GetRunStatus(ctx context.Context, runID string) (state.RunStatus, error) {
	iatt.Calls = append(iatt.Calls, "GetRunStatus")
	var err error
	r, ok := iatt.Runs[runID]
	if !ok {
		err = fmt.Errorf("No run with ID: %s", runID)
		return state.RunStatus{}, err
	}
	status := state.RunStatus{
		RunID:        r.RunID,
		Status:       r.Status,
		DefinitionID: r.DefinitionID,
		ClusterName:  r.ClusterName,
		QueuedAt:     r.QueuedAt,
		StartedAt:    r.StartedAt,
		FinishedAt:   r.FinishedAt,
		ExitCode:     r.ExitCode,
		ExitReason:   r.ExitReason,
		Engine:       r.Engine,
		Alias:        r.Alias,
	}
	return status, err
}


================================================
FILE: tracing/tracing.go
================================================
package tracing

import (
	"context"
	"time"

	"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer"
)

// TraceJob starts or continues a trace for a job operation
func TraceJob(ctx context.Context, operationName string, runID string) (context.Context, tracer.Span) {
	span, ctx := tracer.StartSpanFromContext(
		ctx,
		operationName,
		tracer.ResourceName(runID),
		tracer.Tag("job.run_id", runID),
	)
	return ctx, span
}

// TagRunInfo adds standardized job metadata to a span.
// Only a subset of the accepted fields is currently tagged.
func TagRunInfo(span tracer.Span, runID, definitionID, alias, status, clusterName string,
	queuedAt, startedAt, finishedAt *time.Time, podName, namespace, exitReason *string,
	exitCode *int64, tier string) {
	if span == nil {
		return
	}
	span.SetTag("job.run_id", runID)
	if exitReason != nil {
		span.SetTag("job.exit_reason", *exitReason)
	}
}

type TextMapCarrier map[string]string
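// Illustrative usage of TraceJob, mirroring the call pattern used throughout
// the state and worker packages (a sketch, not an exported contract; the
// operation name and error variable are hypothetical):
//
//	ctx, span := tracing.TraceJob(ctx, "flotilla.state.some_operation", runID)
//	defer span.Finish()
//	if err != nil {
//		span.SetTag("error", true)
//		span.SetTag("error.msg", err.Error())
//	}

// ForeachKey implements the TextMapReader interface for Extract
func (c TextMapCarrier) ForeachKey(handler func(key,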
val string) error) error { for k, v := range c { if err := handler(k, v); err != nil { return err } } return nil } // Set implements the TextMapWriter interface for Inject func (c TextMapCarrier) Set(key, val string) { c[key] = val } ================================================ FILE: ui/.gitignore ================================================ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # dependencies /node_modules /.pnp .pnp.js # testing /coverage # production /build # misc .DS_Store .env.local .env.development.local .env.test.local .env.production.local npm-debug.log* yarn-debug.log* yarn-error.log* package-lock.json ================================================ FILE: ui/.prettierrc ================================================ { "trailingComma": "es5", "semi": false } ================================================ FILE: ui/Dockerfile ================================================ FROM node:carbon WORKDIR /usr/src/app ADD . /usr/src/app RUN npm install -g serve RUN npm install ARG FLOTILLA_API ARG DEFAULT_CLUSTER RUN npm run build ENTRYPOINT serve -s build ================================================ FILE: ui/README.md ================================================ # Flotilla UI The Flotilla UI is a React application bundled along with the rest of Flotilla. If you are running the entire Flotilla stack locally, it is recommended to use docker-compose as documented in the main [README](https://github.com/stitchfix/flotilla-os#starting-the-service-locally). If you are interested in developing the UI itself, you can follow these steps: ## Development ### Running Locally ``` git clone git@github.com:stitchfix/flotilla-os.git cd flotilla-os/ui npm install REACT_APP_BASE_URL=http://my-flotilla.com REACT_APP_BASE_URL_DEV=http://flotilla.staging.vertigo.stitchfix.com/api npm start ``` ### Testing UI testing is done with Jest and Enzyme. 
You can run the tests via: ``` npm run test ``` ================================================ FILE: ui/package.json ================================================ { "name": "flotilla", "version": "5.1.1", "dependencies": { "@blueprintjs/core": "3.15.1", "@blueprintjs/datetime": "3.15.1", "@reduxjs/toolkit": "^1.1.0", "ansi-to-react": "5.1.0", "axios": "1.15.2", "cookie": "0.7.0", "formik": "1.5.7", "localforage": "^1.7.3", "lodash": "4.18.1", "moment": "2.29.4", "pretty-ms": "5.0.0", "qs": "6.14.1", "react": "^16.8.6", "react-copy-to-clipboard": "5.0.2", "react-debounce-input": "3.2.0", "react-dom": "16.8.6", "react-helmet": "^5.2.1", "react-json-editor-ajrm": "^2.5.9", "react-json-view": "^1.19.1", "react-jsonschema-form": "^1.8.1", "react-redux": "^7.1.3", "react-resize-detector": "^4.2.1", "react-router-dom": "^5.1.2", "react-scripts": "^5.0.1", "react-select": "2.4.4", "react-window": "^1.8.5", "redux-logger": "^3.0.6", "url-join": "^4.0.1", "yup": "0.27.0" }, "scripts": { "start": "react-scripts start", "build": "react-scripts build", "test": "react-scripts test", "eject": "react-scripts eject" }, "eslintConfig": { "extends": "react-app" }, "browserslist": { "production": [ ">0.2%", "not dead", "not op_mini all" ], "development": [ "last 1 chrome version", "last 1 firefox version", "last 1 safari version" ] }, "devDependencies": { "@babel/plugin-proposal-private-property-in-object": "^7.21.11", "@types/cookie": "0.3.3", "@types/enzyme": "3.9.3", "@types/history": "4.7.2", "@types/jest": "24.0.13", "@types/lodash": "4.17.16", "@types/node": "12.0.2", "@types/qs": "6.5.3", "@types/react": "16.8.18", "@types/react-copy-to-clipboard": "4.3.0", "@types/react-dom": "16.8.4", "@types/react-helmet": "^5.0.14", "@types/react-jsonschema-form": "^1.7.0", "@types/react-redux": "^7.1.5", "@types/react-resize-detector": "^4.2.0", "@types/react-router-dom": "^5.1.3", "@types/react-select": "2.0.9", "@types/react-window": "^1.8.1", "@types/redux-logger": "^3.0.7", "@types/url-join": "^4.0.0", "@types/yup": "0.26.14", "axios-mock-adapter": "1.16.0", "babel-core": "6.26.3", "babel-jest": "24.8.0", "enzyme": "3.9.0", "enzyme-adapter-react-16": "1.13.2", "enzyme-to-json": "3.3.5", "flush-promises": "1.0.2", "regenerator-runtime": "0.13.2", "typescript": "3.4.5" } } ================================================ FILE: ui/public/index.html ================================================ Flotilla | Stitch Fix
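The UI README above notes that testing is done with Jest and Enzyme. For orientation, here is a minimal sketch of what a spec in the style of `ui/src/components/__tests__/` might look like. The component under test is invented for illustration and is not part of the repository; the Enzyme adapter setup lives in `ui/src/setupTests.js`.

```
import * as React from "react"
import { mount } from "enzyme"

// Hypothetical component, for illustration only.
const Hello: React.FC<{ name: string }> = ({ name }) => <span>Hello, {name}</span>

describe("Hello", () => {
  it("renders its name prop", () => {
    const wrapper = mount(<Hello name="Flotilla" />)
    expect(wrapper.text()).toEqual("Hello, Flotilla")
  })
})
```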
================================================ FILE: ui/src/api.ts ================================================ import FlotillaClient from "./helpers/FlotillaClient" const err = "Base URL undefined. If you are running this in development, please set the `REACT_APP_BASE_URL_DEV` environment variable. If you are running this in production, please set the `REACT_APP_BASE_URL` environment variable." let baseURL: string | undefined = undefined switch (process.env.NODE_ENV) { case "production": baseURL = process.env.REACT_APP_BASE_URL break case "development": case "test": default: baseURL = process.env.REACT_APP_BASE_URL_DEV break } if (baseURL === undefined) { throw new Error(err) } const client = new FlotillaClient({ baseURL }) export default client
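`api.ts` builds a single shared `FlotillaClient` from build-time environment variables. For tests, the repository ships a manual mock at `ui/src/helpers/__mocks__/FlotillaClient.ts`; a spec can opt into it with Jest's standard manual-mock mechanism. A sketch, assuming the default create-react-app Jest configuration:

```
// jest.mock calls are hoisted above imports, so modules that import
// ../helpers/FlotillaClient (including ./api) receive the mock class.
jest.mock("../helpers/FlotillaClient")
```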
================================================ FILE: ui/src/components/ARASwitch.tsx ================================================ import * as React from "react" import { get } from "lodash" import { Tag, Colors, Checkbox, Intent } from "@blueprintjs/core" import { Task, UpdateTaskPayload } from "../types" import api from "../api" import Toaster from "./Toaster" import Request, { ChildProps } from "./Request" type Props = { task: Task } & ChildProps class ARASwitch extends React.Component { constructor(props: Props) { super(props) this.handleChange = this.handleChange.bind(this) } handleChange() { const { task, request } = this.props // Toggle the current setting. const enabled = !this.isEnabled() request({ definitionID: task.definition_id, data: { env: task.env, image: task.image, group_name: task.group_name, memory: task.memory, cpu: task.cpu, command: task.command, tags: task.tags, adaptive_resource_allocation: enabled, }, }) } isEnabled() { return get(this.props.task, "adaptive_resource_allocation", false) === true } render() { const enabled = this.isEnabled() return ( {enabled ? "Enabled" : "Disabled"}
) } } type ConnectedProps = { task: Task request: (opts: { definitionID: string }) => void } const Connected: React.FC = ({ task, request }) => ( requestFn={api.updateTask} shouldRequestOnMount={false} onSuccess={(data: Task) => { Toaster.show({ message: `${data.alias} updated successfully!`, intent: Intent.SUCCESS, }) // Re-request data. request({ definitionID: data.definition_id }) }} onFailure={() => { Toaster.show({ message: "An error occurred.", intent: Intent.DANGER, }) }} > {requestProps => } ) export default Connected ================================================ FILE: ui/src/components/App.tsx ================================================ import * as React from "react" import { BrowserRouter, Route, Switch, Redirect } from "react-router-dom" import Tasks from "./Tasks" import Task from "./Task" import CreateTaskForm from "./CreateTaskForm" import Run from "./Run" import Runs from "./Runs" import Templates from "./Templates" import Template from "./Template" import Navigation from "./Navigation" import ls from "../localstorage" import { LOCAL_STORAGE_IS_ONBOARDED_KEY } from "../constants" import Toaster from "./Toaster" import { Intent } from "@blueprintjs/core" import { connect, ConnectedProps } from "react-redux" import { toggleDialogVisibilityChange } from "../state/settings" const connector = connect() class App extends React.Component> { componentDidMount() { this.checkOnboardingStatus() } checkOnboardingStatus() { ls.getItem(LOCAL_STORAGE_IS_ONBOARDED_KEY).then(res => { if (res !== true) { Toaster.show({ icon: "clean", message: "You can now configure global settings via the Settings menu.", timeout: 0, intent: Intent.PRIMARY, action: { onClick: () => { ls.setItem(LOCAL_STORAGE_IS_ONBOARDED_KEY, true).then( () => { this.props.dispatch(toggleDialogVisibilityChange(true)) } ) }, text: "Open settings menu", }, onDismiss: () => { ls.setItem(LOCAL_STORAGE_IS_ONBOARDED_KEY, true) }, }) } }) } render() { return (
) } } export default connector(App) ================================================ FILE: ui/src/components/Attribute.tsx ================================================ import * as React from "react" import { Tag, Tooltip, Icon, Intent } from "@blueprintjs/core" import CopyToClipboard from "react-copy-to-clipboard" type Props = { rawValue: string } type State = { isCopied: boolean } class CopyableAttributeValue extends React.Component { constructor(props: Props) { super(props) this.handleCopy = this.handleCopy.bind(this) } state = { isCopied: false, } handleCopy() { this.setState({ isCopied: true }) } render() { return ( Click to copy to clipboard {this.state.isCopied && ( )} } >
{this.props.children}
) } } const Attribute: React.FunctionComponent<{ name: React.ReactNode value: React.ReactNode containerStyle?: object isCopyable?: boolean rawValue?: string description?: React.ReactElement isNew?: boolean }> = ({ name, value, containerStyle, isCopyable, rawValue, description, isNew, }) => (
{name}
{description && ( )} {isNew && New!}
{isCopyable && rawValue ? (
{value}
) : (
{value}
)}
) export default Attribute ================================================ FILE: ui/src/components/AutoscrollSwitch.tsx ================================================ import * as React from "react" import { useDispatch, useSelector } from "react-redux" import { Switch } from "@blueprintjs/core" import { RootState } from "../state/store" import { toggleAutoscroll } from "../state/runView" const AutoscrollSwitch: React.FC = () => { const dispatch = useDispatch() const shouldAutoscroll = useSelector( (state: RootState) => state.runView.shouldAutoscroll ) return ( { dispatch(toggleAutoscroll()) }} /> ) } export default AutoscrollSwitch ================================================ FILE: ui/src/components/BaseTaskForm.tsx ================================================ import * as React from "react" import { FormGroup, Classes } from "@blueprintjs/core" import { FastField, FormikProps } from "formik" import * as Yup from "yup" import GroupNameSelect from "./GroupNameSelect" import TagsSelect from "./TagsSelect" import EnvFieldArray from "./EnvFieldArray" import FieldError from "./FieldError" import { groupNameFieldSpec, imageFieldSpec, commandFieldSpec, memoryFieldSpec, tagsFieldSpec, cpuFieldSpec, } from "../helpers/taskFormHelpers" export const validationSchema = { env: Yup.array().of( Yup.object().shape({ name: Yup.string().required(), value: Yup.string().required(), }) ), image: Yup.string() .min(1) .required("Required"), group_name: Yup.string() .min(1) .required("Required"), memory: Yup.number() .required("Required") .min(0), cpu: Yup.number() .required("Required") .min(512), command: Yup.string() .min(1) .required("Required"), tags: Yup.array().of(Yup.string()), } export type Props = Pick< FormikProps, "values" | "setFieldValue" | "errors" > const BaseTaskForm: React.FunctionComponent = ({ values, setFieldValue, errors, }) => ( <> { setFieldValue(groupNameFieldSpec.name, value) }} /> {errors.group_name && {errors.group_name}} {errors.image && {errors.image}} {errors.command && {errors.command}} {errors.cpu && {errors.cpu}} {errors.memory && {errors.memory}} { setFieldValue(tagsFieldSpec.name, value) }} /> {errors.tags && {errors.tags}} ) export default BaseTaskForm ================================================ FILE: ui/src/components/CloudtrailRecords.tsx ================================================ import * as React from "react" import { CloudtrailRecord } from "../types" import { HTMLTable } from "@blueprintjs/core" type Props = { data: CloudtrailRecord[] } const CloudtrailRecords: React.FC = ({ data }) => ( Event Name Event Source {data.map((r, i) => ( {r.eventName} {r.eventSource} ))} ) export default CloudtrailRecords ================================================ FILE: ui/src/components/ClusterSelect.tsx ================================================ import * as React from "react" import { get, isArray } from "lodash" import Creatable from "react-select/lib/Creatable" import Request from "./Request" import { ListClustersResponse, SelectOption, SelectProps } from "../types" import api from "../api" import * as helpers from "../helpers/selectHelpers" /** * ClusterSelect allows users to select an ECS cluster on which to run a * particular task. This component hits the `/clusters` endpoint and renders * the results into a React Select component. 
*/ export const ClusterSelect: React.FunctionComponent = props => { return ( value={helpers.stringToSelectOpt(props.value)} options={props.options} isClearable onChange={option => { props.onChange(helpers.preprocessSelectOption(option)) }} styles={helpers.selectStyles} theme={helpers.selectTheme} isDisabled={props.isDisabled} /> ) } const Connected: React.FunctionComponent = props => ( requestFn={api.listClusters}> {res => { let options = get(res, ["data", "clusters"], []) // If there's an error fetching available clusters, set the options to // an empty array. if (!isArray(options)) options = [] return ( ) }} ) export default Connected ================================================ FILE: ui/src/components/CreateTaskForm.tsx ================================================ import * as React from "react" import { RouteComponentProps } from "react-router-dom" import { Button, Intent, FormGroup, Classes } from "@blueprintjs/core" import { Formik, Form, FastField, FormikProps } from "formik" import * as Yup from "yup" import api from "../api" import { CreateTaskPayload, Task } from "../types" import Request, { RequestStatus, ChildProps as RequestChildProps, } from "./Request" import BaseTaskForm, { validationSchema as baseTaskFormValidationSchema, } from "./BaseTaskForm" import Toaster from "./Toaster" import ErrorCallout from "./ErrorCallout" import FieldError from "./FieldError" export const validationSchema = Yup.object().shape({ ...baseTaskFormValidationSchema, alias: Yup.string() .min(1) .required("Required"), }) export type Props = Pick< FormikProps, "values" | "setFieldValue" | "isValid" | "errors" > & Pick< RequestChildProps, "requestStatus" | "error" | "isLoading" > export const CreateTaskForm: React.FunctionComponent = ({ values, isValid, setFieldValue, requestStatus, error, isLoading, errors, }) => { return ( <> {requestStatus === RequestStatus.ERROR && error && ( )}
{errors.alias && {errors.alias}} ) } export type ConnectedProps = RouteComponentProps & { initialValues: CreateTaskPayload onSuccess?: (data: Task) => void } const Connected: React.FunctionComponent = props => ( requestFn={api.createTask} shouldRequestOnMount={false} onSuccess={(data: Task) => { Toaster.show({ message: `Task ${data.alias} created successfully!`, intent: Intent.SUCCESS, }) props.history.push(`/tasks/${data.definition_id}`) if (props.onSuccess) { props.onSuccess(data) } }} onFailure={() => { Toaster.show({ message: "An error occurred.", intent: Intent.DANGER, }) }} > {requestProps => ( { requestProps.request({ data }) }} > {({ values, setFieldValue, isValid, errors }) => ( )} )} ) Connected.defaultProps = { initialValues: { env: [], image: "", group_name: "", alias: "", memory: 1024, cpu: 512, command: "", tags: [], }, } export default Connected ================================================ FILE: ui/src/components/DeleteTaskButton.tsx ================================================ import * as React from "react" import { Button, Dialog, Intent, Classes } from "@blueprintjs/core" import { withRouter, RouteComponentProps } from "react-router-dom" import Request, { ChildProps } from "./Request" import api from "../api" import Toaster from "./Toaster" import ErrorCallout from "./ErrorCallout" type Args = { definitionID: string } export type Props = ChildProps & ConnectedProps type State = { isOpen: boolean } export class DeleteTaskButton extends React.Component { constructor(props: Props) { super(props) this.handleSubmitClick = this.handleSubmitClick.bind(this) this.openDialog = this.openDialog.bind(this) this.closeDialog = this.closeDialog.bind(this) } state = { isOpen: false, } handleSubmitClick() { this.props.request({ definitionID: this.props.definitionID }) } openDialog() { this.setState({ isOpen: true }) } closeDialog() { this.setState({ isOpen: false }) } render() { const { isLoading, error } = this.props return ( <>
{error && } Are you sure you want to delete this task?
) } } type ConnectedProps = { definitionID: string } const Connected: React.FunctionComponent< RouteComponentProps & ConnectedProps > = ({ definitionID, history }) => ( requestFn={api.deleteTask} initialRequestArgs={{ definitionID }} shouldRequestOnMount={false} onSuccess={() => { Toaster.show({ message: "Task deleted!", intent: Intent.SUCCESS, }) history.push(`/tasks`) }} onFailure={() => { Toaster.show({ message: "An error occurred.", intent: Intent.DANGER, }) }} > {requestProps => ( )} ) export default withRouter(Connected)
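`ARASwitch`, `DeleteTaskButton`, and several later components all follow the same shape: a presentational component receives `request`, `isLoading`, and `error` from a generic `Request` render-prop wrapper. `Request.tsx` itself is not reproduced in this extract; the following is an illustrative reimplementation of the pattern, not the repository's code:

```
import * as React from "react"

type ChildProps<D, A> = {
  data: D | null
  isLoading: boolean
  error: string | null
  request: (args: A) => void
}

// Illustrative only: owns request state and hands it to its children
// via a render prop, as the components above expect.
function RequestSketch<D, A>(props: {
  requestFn: (args: A) => Promise<D>
  children: (child: ChildProps<D, A>) => React.ReactNode
}) {
  const [data, setData] = React.useState<D | null>(null)
  const [isLoading, setLoading] = React.useState(false)
  const [error, setError] = React.useState<string | null>(null)
  const request = (args: A) => {
    setLoading(true)
    props.requestFn(args)
      .then(setData)
      .catch(e => setError(String(e)))
      .finally(() => setLoading(false))
  }
  return <>{props.children({ data, isLoading, error, request })}</>
}
```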
================================================ FILE: ui/src/components/Duration.tsx ================================================ import * as React from "react" import prettyMS from "pretty-ms" import calculateDuration from "../helpers/calculateDuration" type Props = { start: string end: string | undefined | null isActive: boolean } type State = { duration: number } class Duration extends React.Component { private intervalID: number | undefined constructor(props: Props) { super(props) this.process = this.process.bind(this) } state = { duration: 0, } componentDidMount() { // Immediately process duration on mount. this.process() // If the end date is missing (undefined or null), begin interval to process duration. if (this.props.end == null && this.props.isActive === true) { this.intervalID = window.setInterval(this.process, 1000) } } componentWillUnmount() { window.clearInterval(this.intervalID) } process() { const { start, end } = this.props this.setState({ duration: calculateDuration(start, end) }) } render() { return ( {prettyMS(this.state.duration, { secondsDecimalDigits: 0 })}
) } } export default Duration ================================================ FILE: ui/src/components/EngineTag.tsx ================================================ import * as React from "react" import { Tag } from "@blueprintjs/core" import { ExecutionEngine } from "../types" const EngineTag: React.FC<{ engine: ExecutionEngine }> = ({ engine }) => ( {engine} ) export default EngineTag ================================================ FILE: ui/src/components/EnvFieldArray.tsx ================================================ import * as React from "react" import { FieldArray, FastField, FormikErrors } from "formik" import { get } from "lodash" import { Button, FormGroup, Classes, Intent } from "@blueprintjs/core" import { Env } from "../types" import { IconNames } from "@blueprintjs/icons" import { envFieldSpec } from "../helpers/taskFormHelpers" import FieldError from "./FieldError" export type Props = { values: Env[] push: (env: Env) => void remove: (index: number) => void errors: string | FormikErrors | undefined } export const EnvFieldArray: React.FunctionComponent = ({ values, push, remove, errors, }) => (
{envFieldSpec.label}
{values.map((env: Env, i: number) => (
{get(errors, [i, "name"], null)} {get(errors, [i, "value"], null)}
))}
) const ConnectedEnvFieldArray: React.FunctionComponent<{}> = () => ( {({ form, push, remove }) => ( )} ) export default ConnectedEnvFieldArray
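The original JSX of `ConnectedEnvFieldArray` did not survive extraction intact. Based on its render-prop signature `{({ form, push, remove }) => ...}`, the wiring is most likely Formik's `FieldArray`, roughly as follows (a reconstruction under that assumption, not the verbatim source):

```
import { FieldArray } from "formik"

// Likely shape: FieldArray exposes array helpers (push, remove) plus the
// enclosing form, which EnvFieldArray consumes as plain props.
<FieldArray name={envFieldSpec.name}>
  {({ form, push, remove }) => (
    <EnvFieldArray
      values={form.values.env}
      push={push}
      remove={remove}
      errors={form.errors.env}
    />
  )}
</FieldArray>
```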
================================================ FILE: ui/src/components/EnvList.tsx ================================================ import * as React from "react" import { isEmpty, isArray } from "lodash" import { Env } from "../types" import Attribute from "./Attribute" const EnvList: React.FunctionComponent<{ env: Env[] }> = ({ env }) => ( {isArray(env) && !isEmpty(env) && env.map(e => ( ))}
) export default EnvList ================================================ FILE: ui/src/components/EnvQueryFilter.tsx ================================================ import * as React from "react" import { Button, FormGroup, Classes, Intent } from "@blueprintjs/core" import { Env } from "../types" import { IconNames } from "@blueprintjs/icons" import { DebounceInput } from "react-debounce-input" import { envFieldSpec } from "../helpers/taskFormHelpers" type Props = { value: string[] onChange: (value: string[]) => void } type State = { newEnvName: string newEnvValue: string } class EnvQueryFilter extends React.Component { private delimiter: string = "|" constructor(props: Props) { super(props) this.handleNameChange = this.handleNameChange.bind(this) this.handleValueChange = this.handleValueChange.bind(this) this.handleRemove = this.handleRemove.bind(this) this.handleNewNameChange = this.handleNewNameChange.bind(this) this.handleNewValueChange = this.handleNewValueChange.bind(this) this.handleAddNewEnv = this.handleAddNewEnv.bind(this) } state = { newEnvName: "", newEnvValue: "", } serialize(env: Env): string { return `${env.name}${this.delimiter}${env.value}` } deserialize(str: string): Env { const split = str.split(this.delimiter) return { name: split[0], value: split[1], } } handleNameChange(i: number, evt: React.ChangeEvent) { const { value, onChange } = this.props const prevEnvValue = this.deserialize(value[i]).value // Copy the array instead of mutating the `value` prop in place. const nextArr = [...value] nextArr[i] = this.serialize({ name: evt.target.value, value: prevEnvValue }) onChange(nextArr) } handleValueChange(i: number, evt: React.ChangeEvent) { const { value, onChange } = this.props const prevEnvName = this.deserialize(value[i]).name const nextArr = [...value] nextArr[i] = this.serialize({ name: prevEnvName, value: evt.target.value }) onChange(nextArr) } handleRemove(i: number) { const { value, onChange } = this.props const nextArr = [...value] nextArr.splice(i, 1) onChange(nextArr) } handleNewNameChange(evt: React.ChangeEvent) { this.setState({ newEnvName: evt.target.value }) } handleNewValueChange(evt: React.ChangeEvent) { this.setState({ newEnvValue: evt.target.value }) } handleAddNewEnv() { const { value, onChange } = this.props const { newEnvName, newEnvValue } = this.state const prev = value const e = this.serialize({ name: newEnvName, value: newEnvValue }) const next = prev.concat(e) this.setState({ newEnvName: "", newEnvValue: "" }, () => { onChange(next) }) } shouldDisableAddNewEnvButton(): boolean { const { newEnvName, newEnvValue } = this.state return newEnvName.length === 0 || newEnvValue.length === 0 } render() { const { value } = this.props const { newEnvName, newEnvValue } = this.state return (
{envFieldSpec.label}
{value.map((s: string, i: number) => { const e: Env = this.deserialize(s) return (
) })}
) } } export default EnvQueryFilter
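`EnvQueryFilter` encodes each name/value pair into a single pipe-delimited string so the filter can live in a flat string array (for example, in a query string). A standalone restatement of the encoding with a worked example; note that a value containing the `|` delimiter would be truncated on the way back out, a limitation inherent to this scheme:

```
const delimiter = "|"
const serialize = (env: { name: string; value: string }): string =>
  `${env.name}${delimiter}${env.value}`
const deserialize = (str: string) => {
  const [name, value] = str.split(delimiter)
  return { name, value }
}

serialize({ name: "LOG_LEVEL", value: "debug" }) // "LOG_LEVEL|debug"
deserialize("LOG_LEVEL|debug") // { name: "LOG_LEVEL", value: "debug" }
```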
================================================ FILE: ui/src/components/ErrorCallout.tsx ================================================ import * as React from "react" import { Callout, Intent } from "@blueprintjs/core" import { get } from "lodash" import { AxiosError } from "axios" import Attribute from "./Attribute" const ErrorCallout: React.FunctionComponent<{ error: AxiosError | null }> = ({ error, }) => { return ( ) } export default ErrorCallout ================================================ FILE: ui/src/components/FieldError.tsx ================================================ import * as React from "react" import { Colors } from "@blueprintjs/core" const FieldError: React.FunctionComponent = ({ children }) => (
{children}
) export default FieldError ================================================ FILE: ui/src/components/GenericMultiSelect.tsx ================================================ import * as React from "react" import { isArray } from "lodash" import Creatable from "react-select/lib/Creatable" import { SelectOption, MultiSelectProps } from "../types" import * as helpers from "../helpers/selectHelpers" const GenericMultiSelect: React.FunctionComponent = props => { let value = props.value if (!isArray(props.value)) { value = [props.value] } return ( value={value.map(helpers.stringToSelectOpt)} options={[]} onChange={option => { props.onChange(helpers.preprocessMultiSelectOption(option)) }} isMulti isClearable styles={helpers.selectStyles} theme={helpers.selectTheme} isDisabled={props.isDisabled} /> ) } export default GenericMultiSelect ================================================ FILE: ui/src/components/GroupNameSelect.tsx ================================================ import * as React from "react" import { get } from "lodash" import Creatable from "react-select/lib/Creatable" import Request, { RequestStatus } from "./Request" import { ListGroupsResponse, SelectOption, SelectProps } from "../types" import api from "../api" import * as helpers from "../helpers/selectHelpers" import { Classes, Spinner } from "@blueprintjs/core" /** * GroupNameSelect lets users choose a group name for their task definition. It * hits the `/groups` endpoint and renders the results into a React Select * component. If there are no existing groups, it will render an `` * element as a fallback. */ export const GroupNameSelect: React.FunctionComponent = props => { return ( value={helpers.stringToSelectOpt(props.value)} options={props.options} onChange={option => { props.onChange(helpers.preprocessSelectOption(option)) }} isClearable id="groupNameSelect" styles={helpers.selectStyles} theme={helpers.selectTheme} isDisabled={props.isDisabled} /> ) } const ConnectedGroupNameSelect: React.FunctionComponent = props => ( requestFn={api.listGroups}> {({ data, requestStatus }) => { switch (requestStatus) { case RequestStatus.ERROR: return ( { props.onChange(evt.target.value) }} /> ) case RequestStatus.READY: let options = get(data, "groups", []) === null ? [] : get(data, "groups", []) if (options === null) options = [] return ( ) case RequestStatus.NOT_READY: default: return } }} ) export default ConnectedGroupNameSelect ================================================ FILE: ui/src/components/ISO8601AttributeValue.tsx ================================================ import * as React from "react" import moment from "moment" import { Classes } from "@blueprintjs/core" const ISO8601AttributeValue: React.FunctionComponent<{ time: string | null | undefined inline?: boolean verbose?: boolean }> = ({ time, inline, verbose }) => { return (
{time !== null && time !== undefined ? moment(time).fromNow() : "-"}
{verbose && time !== null && time !== undefined && (
{time.substr(0, 19)}
)}
) } ISO8601AttributeValue.defaultProps = { verbose: true, } export default ISO8601AttributeValue ================================================ FILE: ui/src/components/ListFiltersDropdown.tsx ================================================ import * as React from "react" import { Button, Tooltip, Popover, Position, Card } from "@blueprintjs/core" const ListFiltersDropdown: React.FunctionComponent<{}> = ({ children }) => ( {children}} > } breadcrumbs={[ { text: this.getExecutableLinkName(), href: this.getExecutableLinkURL(), }, { text: data.run_id, href: `/runs/${data.run_id}`, }, ]} buttons={btn} />
{metadataVisibility.isVisible && }
} /> ) } /> } />
{ this.setActiveTabId(id as RunTabId) }} > ) : ( ) } /> EKS Pod Events ) : ( "EKS Pod Events" ) } panel={ } disabled={data.engine !== ExecutionEngine.EKS} /> Cloudtrail Records ) : ( `EKS Cloudtrail Records (${ hasCloudtrailRecords ? get( data, ["cloudtrail_notifications", "Records"], [] ).length : 0 })` ) } panel={ } disabled={ data.engine !== ExecutionEngine.EKS || hasCloudtrailRecords === false } />
)} ) } return case RequestStatus.NOT_READY: default: return } } } const ReduxConnectedRun = connected(Run) const Connected: React.FunctionComponent> = ({ match }) => ( {({ query, setQuery }) => ( requestFn={api.getRun} initialRequestArgs={{ runID: match.params.runID }} > {props => ( <> {`${ props.data ? EnhancedRunStatusEmojiMap.get( getEnhancedRunStatus(props.data) as EnhancedRunStatus ) : "" } ${match.params.runID}`} )} )} ) export default Connected ================================================ FILE: ui/src/components/RunAttributes.tsx ================================================ import * as React from "react" import { Card, Pre, Tag } from "@blueprintjs/core" import { Run, ExecutionEngine } from "../types" import Attribute from "./Attribute" import ISO8601AttributeValue from "./ISO8601AttributeValue" const RunAttributes: React.FC<{ data: Run }> = ({ data }) => (
{data.engine}} /> {data.engine !== ExecutionEngine.EKS && ( )} {data.node_lifecycle || "-"}} />
{data.max_cpu_used && }
{data.max_memory_used && }
{data.gpu && (
)}
} /> } /> } />
{data.command.replace(/\n(\s)+/g, "\n")} ) : ( "Existing task definition command was used." ) } />
) export default RunAttributes ================================================ FILE: ui/src/components/RunDebugAttributes.tsx ================================================ import * as React from "react" import { Card, Icon } from "@blueprintjs/core" import urljoin from "url-join" import { Run, ExecutionEngine } from "../types" import Attribute from "./Attribute" const createS3LogsUrl = (runID: string): string => { const prefix = process.env.REACT_APP_S3_BUCKET_PREFIX || "" return urljoin(prefix, "logs", runID, "/") } const createEC2Url = (dns: string): string => { const prefix = process.env.REACT_APP_EC2_INSTANCE_URL_PREFIX || "" return urljoin(prefix, dns) } const createS3ManifestUrl = (runID: string): string => { const prefix = process.env.REACT_APP_S3_OBJECT_PREFIX || "" return urljoin(prefix, "manifests", runID, `${runID}.yaml`) } const RunDebugAttributes: React.FC<{ data: Run }> = ({ data }) => (
EKS Debug
{data.cluster && } {data.pod_name && } {data.attempt_count && } {data.engine === ExecutionEngine.EKS && ( Link } /> )} {data.instance.dns_name && ( {data.instance.dns_name} } /> )} {data.engine === ExecutionEngine.EKS && ( Link } /> )}
) export default RunDebugAttributes
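The three URL helpers in `RunDebugAttributes` combine a build-time prefix with run-scoped path segments via `url-join`. A worked example with a hypothetical prefix (the real values come from `REACT_APP_S3_BUCKET_PREFIX`, `REACT_APP_EC2_INSTANCE_URL_PREFIX`, and `REACT_APP_S3_OBJECT_PREFIX` at build time):

```
import urljoin from "url-join"

const prefix = "https://console.example.com/my-bucket" // hypothetical
urljoin(prefix, "logs", "run-123", "/")
// => "https://console.example.com/my-bucket/logs/run-123/"
urljoin(prefix, "manifests", "run-123", "run-123.yaml")
// => "https://console.example.com/my-bucket/manifests/run-123/run-123.yaml"
```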
================================================ FILE: ui/src/components/RunEvents.tsx ================================================ import * as React from "react" import { RunStatus, RunTabId, ListRunEventsResponse } from "../types" import Request, { RequestStatus } from "./Request" import api from "../api" import ErrorCallout from "./ErrorCallout" import { Spinner, Callout, Card, Tag, Button, Intent } from "@blueprintjs/core" import QueryParams from "./QueryParams" import { RUN_TAB_ID_QUERY_KEY } from "../constants" type Props = { runID: string status: RunStatus hasLogs: boolean } const RunEvents: React.FC = ({ runID, status, hasLogs }) => ( {({ setQuery }) => ( requestFn={api.listRunEvents} initialRequestArgs={runID} > {({ data, requestStatus, isLoading, error }) => { switch (requestStatus) { case RequestStatus.ERROR: return case RequestStatus.READY: let viewLogsCallout = ( ) if (data && data.pod_events !== null) { return ( <> {data.pod_events.map((evt, i) => (
{evt.timestamp} {evt.reason}
{evt.message}
))}
{hasLogs && viewLogsCallout} ) } return ( <> No events found. {hasLogs && viewLogsCallout} ) case RequestStatus.NOT_READY: default: return } }} )}
) export default RunEvents ================================================ FILE: ui/src/components/RunSidebar.tsx ================================================ import * as React from "react" import { get } from "lodash" import { Card } from "@blueprintjs/core" import JsonView from "react-json-view" import { ExecutionEngine, Run, ExecutableType } from "../types" import EnvList from "./EnvList" import RunAttributes from "./RunAttributes" import RunDebugAttributes from "./RunDebugAttributes" import { JSON_VIEW_PROPS } from "../constants" const RunSidebar: React.FC<{ data: Run }> = ({ data }) => { const templatePayload = get( data, ["execution_request_custom", "template_payload"], {} ) return (
{data && data.executable_type === ExecutableType.ExecutableTypeTemplate && (
Template Payload
)}
Environment Variables
{data && data.engine === ExecutionEngine.EKS && ( )}
) } export default RunSidebar ================================================ FILE: ui/src/components/RunStatusSelect.tsx ================================================ import * as React from "react" import { isArray } from "lodash" import Select from "react-select" import { SelectOption, MultiSelectProps, RunStatus } from "../types" import * as helpers from "../helpers/selectHelpers" const RunStatusSelect: React.FunctionComponent = props => { let v: SelectOption[] if (!isArray(props.value)) { v = [helpers.stringToSelectOpt(props.value)] } else { v = props.value.map(helpers.stringToSelectOpt) } return ( value={v} options={[ { label: RunStatus.PENDING, value: RunStatus.PENDING }, { label: RunStatus.QUEUED, value: RunStatus.QUEUED }, { label: RunStatus.RUNNING, value: RunStatus.RUNNING }, ]} onChange={option => { props.onChange(helpers.preprocessMultiSelectOption(option)) }} isMulti styles={helpers.selectStyles} theme={helpers.selectTheme} isDisabled={props.isDisabled} /> ) } export default RunStatusSelect ================================================ FILE: ui/src/components/RunTag.tsx ================================================ import * as React from "react" import { Run } from "../types" import { Tag, Colors } from "@blueprintjs/core" import { RUN_STATUS_COLOR_MAP } from "../constants" import getEnhancedRunStatus from "../helpers/getEnhancedRunStatus" const RunTag: React.FunctionComponent = run => { const enhancedStatus = getEnhancedRunStatus(run) return ( {enhancedStatus} ) } export default RunTag ================================================ FILE: ui/src/components/Runs.tsx ================================================ import * as React from "react" import { Link } from "react-router-dom" import { get, omit, isArray, isString } from "lodash" import { DebounceInput } from "react-debounce-input" import ListRequest, { ChildProps as ListRequestChildProps } from "./ListRequest" import api from "../api" import { ListRunParams, ListRunResponse, SortOrder, Run, RunStatus, } from "../types" import pageToOffsetLimit from "../helpers/pageToOffsetLimit" import Table from "./Table" import ViewHeader from "./ViewHeader" import ListFiltersDropdown from "./ListFiltersDropdown" import Pagination from "./Pagination" import GenericMultiSelect from "./GenericMultiSelect" import RunStatusSelect from "./RunStatusSelect" import { FormGroup, Classes, Spinner, Tag } from "@blueprintjs/core" import { PAGE_SIZE } from "../constants" import { RequestStatus } from "./Request" import ErrorCallout from "./ErrorCallout" import ISO8601AttributeValue from "./ISO8601AttributeValue" import RunTag from "./RunTag" import EnvQueryFilter from "./EnvQueryFilter" export const initialQuery = { page: 1, sort_by: "started_at", order: SortOrder.DESC, status: [RunStatus.PENDING, RunStatus.QUEUED, RunStatus.RUNNING], } export type Props = ListRequestChildProps< ListRunResponse, { params: ListRunParams } > export const Runs: React.FunctionComponent = ({ data, updateSort, currentSortKey, currentSortOrder, updatePage, currentPage, query, updateFilter, isLoading, requestStatus, error, }) => { let content: React.ReactNode switch (requestStatus) { case RequestStatus.ERROR: content = break case RequestStatus.READY: content = ( items={get(data, "history", [])} getItemKey={(r: Run) => r.run_id} updateSort={updateSort} currentSortKey={currentSortKey} currentSortOrder={currentSortOrder} columns={{ status: { displayName: "Status", render: (r: Run) => , isSortable: true, }, started_at: { displayName: "Started At", render: (r: Run) 
=> , isSortable: true, }, run_id: { displayName: "Run ID", render: (r: Run) => ( {r.run_id} ), isSortable: true, }, alias: { displayName: "Alias", render: (r: Run) => ( {r.alias} ), isSortable: false, }, engine: { displayName: "Engine", render: (r: Run) => {r.engine}, isSortable: false, }, }} /> ) break case RequestStatus.NOT_READY: default: content = break } // Preprocess `env` query to ensure that it's an array. let env: string | string[] = get(query, "env", []) if (!isArray(env) && isString(env)) env = [env] return ( <>
{ updateFilter("alias", value) }} isDisabled={false} /> { updateFilter("status", value) }} isDisabled={false} /> { updateFilter("env", value) }} /> { updateFilter("cluster_name", value) }} isDisabled={false} /> ) => { updateFilter("started_at_since", evt.target.value) }} /> ) => { updateFilter("started_at_until", evt.target.value) }} /> ) => { updateFilter("finished_at_since", evt.target.value) }} /> ) => { updateFilter("finished_at_until", evt.target.value) }} />
{content} ) } const ConnectedRuns: React.FunctionComponent<{}> = () => ( requestFn={api.listRun} initialQuery={initialQuery} getRequestArgs={params => ({ params: { ...omit(params, "page"), ...pageToOffsetLimit({ page: get(params, "page", 1), limit: PAGE_SIZE, }), }, })} > {props => } ) export default ConnectedRuns ================================================ FILE: ui/src/components/SettingsButton.tsx ================================================ import * as React from "react" import { useSelector, useDispatch } from "react-redux" import { Formik, Form, FastField, Field } from "formik" import { Classes, Button, Dialog, Switch, FormGroup, Intent, } from "@blueprintjs/core" import { RootState } from "../state/store" import { Settings, update, toggleDialogVisibilityChange, } from "../state/settings" const SettingsButton: React.FC = () => { const dispatch = useDispatch() const { settings, isSettingsDialogOpen, isLoading } = useSelector( (s: RootState) => s.settings ) return ( <> { dispatch(toggleDialogVisibilityChange(false)) }} className="bp3-dark" title={`Settings (v${process.env.REACT_APP_VERSION})`} > initialValues={settings} onSubmit={values => { dispatch(update(values)) }} > {({ values, setFieldValue }) => { return (
{ setFieldValue( "USE_OPTIMIZED_LOG_RENDERER", !values.USE_OPTIMIZED_LOG_RENDERER ) }} label="Use optimized log renderer." /> { setFieldValue( "SHOULD_OVERRIDE_CMD_F_IN_RUN_VIEW", !values.SHOULD_OVERRIDE_CMD_F_IN_RUN_VIEW ) }} label="Override ⌘-F in run view." disabled={values.USE_OPTIMIZED_LOG_RENDERER === false} />
) }}
) } export default SettingsButton ================================================ FILE: ui/src/components/SortableTh.tsx ================================================ import * as React from "react" import { SortOrder } from "../types" export type Props = { isSortable: boolean isActive: boolean order: SortOrder onClick: () => void } const Th: React.FunctionComponent = ({ isSortable, isActive, order, children, onClick, }) => { let className = "" if (isSortable) { className += "flotilla-th-sortable" if (isActive) { className += " active" if (order === SortOrder.ASC) { className += " active-asc" } else { className += " active-desc" } } } return ( {children} ) } export default Th ================================================ FILE: ui/src/components/StopRunButton.tsx ================================================ import * as React from "react" import { Button, Dialog, Intent, Classes } from "@blueprintjs/core" import Request, { ChildProps } from "./Request" import api from "../api" import Toaster from "./Toaster" import { withRouter, RouteComponentProps } from "react-router-dom" import ErrorCallout from "./ErrorCallout" type Args = { definitionID: string; runID: string } export type Props = ChildProps & ConnectedProps type State = { isOpen: boolean } export class StopRunButton extends React.Component { constructor(props: Props) { super(props) this.handleSubmitClick = this.handleSubmitClick.bind(this) this.openDialog = this.openDialog.bind(this) this.closeDialog = this.closeDialog.bind(this) } state = { isOpen: false, } openDialog() { this.setState({ isOpen: true }) } closeDialog() { this.setState({ isOpen: false }) } handleSubmitClick() { this.props.request({ definitionID: this.props.definitionID, runID: this.props.runID, }) this.closeDialog() } render() { const { error, isLoading } = this.props return ( <>
{error && } Are you sure you want to stop this run?
) } } type ConnectedProps = { definitionID: string runID: string } const Connected: React.FunctionComponent< RouteComponentProps & ConnectedProps > = ({ runID, definitionID, history }) => ( requestFn={api.stopRun} initialRequestArgs={{ runID, definitionID }} shouldRequestOnMount={false} onSuccess={() => { Toaster.show({ message: "Run stopped!", intent: Intent.SUCCESS, }) }} onFailure={() => { Toaster.show({ message: "An error occurred.", intent: Intent.DANGER, }) }} > {requestProps => ( )} ) export default withRouter(Connected) ================================================ FILE: ui/src/components/Table.tsx ================================================ import * as React from "react" import { HTMLTable, Callout } from "@blueprintjs/core" import { isArray } from "lodash" import SortableTh from "./SortableTh" import { SortOrder } from "../types" type Column = { displayName: string render: (item: ItemType) => React.ReactNode isSortable: boolean } type Props = { items: ItemType[] columns: { [key: string]: Column } getItemKey: (item: ItemType, index: number) => any updateSort: (sortKey: string) => void currentSortKey: string currentSortOrder: SortOrder } class Table extends React.Component> { render() { const { columns, items, getItemKey, updateSort, currentSortKey, currentSortOrder, } = this.props if (isArray(items) && items.length > 0) { return ( {Object.entries(columns).map(([k, v]) => ( { if (v.isSortable === true) { updateSort(k) } }} key={k} > {v.displayName} ))} {items.map((item, i) => ( {Object.entries(columns).map(([k, v]) => ( {v.render(item)} ))} ))} ) } return No items were found. } } export default Table ================================================ FILE: ui/src/components/TagsSelect.tsx ================================================ import * as React from "react" import { get, isArray } from "lodash" import Creatable from "react-select/lib/Creatable" import Request from "./Request" import { ListTagsResponse, SelectOption, MultiSelectProps } from "../types" import api from "../api" import * as helpers from "../helpers/selectHelpers" export const TagsSelect: React.FunctionComponent = props => ( isMulti value={props.value.map(helpers.stringToSelectOpt)} options={props.options} onChange={options => { props.onChange(helpers.preprocessMultiSelectOption(options)) }} styles={helpers.selectStyles} theme={helpers.selectTheme} closeMenuOnSelect={false} isDisabled={props.isDisabled} /> ) const ConnectedTagsSelect: React.FunctionComponent = props => ( requestFn={api.listTags}> {res => { let options = get(res, ["data", "tags"], []) if (!isArray(options)) options = [] return ( ) }} ) export default ConnectedTagsSelect ================================================ FILE: ui/src/components/Task.tsx ================================================ import * as React from "react" import { Switch, Route, RouteComponentProps } from "react-router-dom" import { get } from "lodash" import Request, { ChildProps, RequestStatus } from "./Request" import api from "../api" import { Task as TaskShape, Task as TaskTypeDef } from "../types" import TaskDetails from "./TaskDetails" import UpdateTaskForm from "./UpdateTaskForm" import TaskExecutionForm from "./TaskExecutionForm" import CreateTaskForm from "./CreateTaskForm" import ErrorCallout from "./ErrorCallout" import { Spinner } from "@blueprintjs/core" export type TaskCtx = ChildProps & { basePath: string definitionID: string } export const TaskContext = React.createContext({ data: null, requestStatus: RequestStatus.NOT_READY, isLoading: false, 
error: null, request: () => {}, basePath: "", // TODO: maybe this is not required. definitionID: "", receivedAt: null, }) export const Task: React.FunctionComponent = props => { return ( ( {ctx => { switch (ctx.requestStatus) { case RequestStatus.ERROR: return case RequestStatus.READY: return ( { ctx.request({ definitionID: data.definition_id }) }} initialValues={{ env: get(props, ["data", "env"], []), image: get(props, ["data", "image"], ""), group_name: get(props, ["data", "group_name"], ""), cpu: get(props, ["data", "cpu"], ""), memory: get(props, ["data", "memory"], ""), command: get(props, ["data", "command"], ""), tags: get(props, ["data", "tags"], []), alias: "", }} /> ) case RequestStatus.NOT_READY: return default: return null } }} )} /> ) } type ConnectedProps = RouteComponentProps<{ definitionID: string }> const Connected: React.FunctionComponent = ({ match }) => ( requestFn={api.getTask} initialRequestArgs={{ definitionID: match.params.definitionID }} > {props => ( )} ) export default Connected ================================================ FILE: ui/src/components/TaskDetails.tsx ================================================ import * as React from "react" import { Link } from "react-router-dom" import { Collapse, Card, ButtonGroup, Pre, Classes, Button, Spinner, Icon, } from "@blueprintjs/core" import { TaskContext } from "./Task" import Attribute from "./Attribute" import TaskRuns from "./TaskRuns" import ViewHeader from "./ViewHeader" import EnvList from "./EnvList" import DeleteTaskButton from "./DeleteTaskButton" import Toggler from "./Toggler" import { RequestStatus } from "./Request" import ErrorCallout from "./ErrorCallout" import ARASwitch from "./ARASwitch" const TaskDetails: React.FC<{}> = () => ( {({ requestStatus, data, error, definitionID, request }) => { switch (requestStatus) { case RequestStatus.ERROR: return case RequestStatus.READY: if (data) { return ( <>
Copy
Update
Run } />
{({ isVisible, toggleVisibility }) => (
Attributes
} description={ Adaptive CPU and memory resource allocation based on prior run history. } /> {data.command} } />
)}
{data.env && ( {({ isVisible, toggleVisibility }) => (
Environment Variables
)}
)}
) } return null case RequestStatus.NOT_READY: default: return } }}
) export default TaskDetails ================================================ FILE: ui/src/components/TaskExecutionForm.tsx ================================================ import * as React from "react" import { Formik, Form, FastField, Field } from "formik" import * as Yup from "yup" import { RouteComponentProps } from "react-router-dom" import { FormGroup, Button, Intent, Spinner, Classes, RadioGroup, Radio, } from "@blueprintjs/core" import api from "../api" import { LaunchRequestV2, Run, ExecutionEngine } from "../types" import { getInitialValuesForTaskExecutionForm } from "../helpers/getInitialValuesForExecutionForm" import Request, { ChildProps as RequestChildProps, RequestStatus, } from "./Request" import EnvFieldArray from "./EnvFieldArray" import ClusterSelect from "./ClusterSelect" import { TaskContext, TaskCtx } from "./Task" import Toaster from "./Toaster" import ErrorCallout from "./ErrorCallout" import FieldError from "./FieldError" import NodeLifecycleSelect from "./NodeLifecycleSelect" import * as helpers from "../helpers/runFormHelpers" import { commandFieldSpec } from "../helpers/taskFormHelpers" const validationSchema = Yup.object().shape({ owner_id: Yup.string(), cluster: Yup.string().required("Required"), memory: Yup.number() .required("Required") .min(0), cpu: Yup.number() .required("Required") .min(512), env: Yup.array().of( Yup.object().shape({ name: Yup.string().required(), value: Yup.string().required(), }) ), engine: Yup.string() .matches(/(eks|ecs)/) .required("A valid engine type of ecs or eks must be set."), node_lifecycle: Yup.string().matches(/(spot|ondemand)/), command: Yup.string() .min(1) .nullable(), }) type Props = RequestChildProps< Run, { definitionID: string; data: LaunchRequestV2 } > & { definitionID: string initialValues: LaunchRequestV2 } const TaskExecutionForm: React.FC = ({ initialValues, request, requestStatus, isLoading, error, definitionID, }) => ( validationSchema.isValidSync(values.initialValues) } initialValues={initialValues} validationSchema={validationSchema} onSubmit={data => { request({ definitionID, data }) }} > {({ errors, values, setFieldValue, isValid, ...rest }) => { const getEngine = (): ExecutionEngine => values.engine return (
{requestStatus === RequestStatus.ERROR && error && ( )} {/* Owner ID Field */} {errors.owner_id && {errors.owner_id}} {/* Engine Type Field */} ) => { setFieldValue("engine", evt.currentTarget.value) if (evt.currentTarget.value === ExecutionEngine.EKS) { setFieldValue( "cluster", process.env.REACT_APP_EKS_CLUSTER_NAME || "" ) } else if (getEngine() === ExecutionEngine.EKS) { setFieldValue("cluster", "") } }} selectedValue={values.engine} > {/* Cluster Field. Note: this is a "Field" rather than a "FastField" as it needs to re-render when value.engine is updated. */} {getEngine() !== ExecutionEngine.EKS && ( { setFieldValue("cluster", value) }} /> {errors.cluster && {errors.cluster}} )} {/* CPU Field */} {errors.cpu && {errors.cpu}} {/* Memory Field */} {errors.memory && {errors.memory}} { setFieldValue(helpers.nodeLifecycleFieldSpec.name, value) }} isDisabled={getEngine() !== ExecutionEngine.EKS} /> {errors.node_lifecycle && ( {errors.node_lifecycle} )} {errors.command && {errors.command}} ) }}
) const Connected: React.FunctionComponent> = ({ location, history }) => ( requestFn={api.runTask} shouldRequestOnMount={false} onSuccess={(data: Run) => { Toaster.show({ message: `Run ${data.run_id} submitted successfully!`, intent: Intent.SUCCESS, }) history.push(`/runs/${data.run_id}`) }} onFailure={() => { Toaster.show({ message: "An error occurred.", intent: Intent.DANGER, }) }} > {requestProps => ( {(ctx: TaskCtx) => { switch (ctx.requestStatus) { case RequestStatus.ERROR: return case RequestStatus.READY: if (ctx.data) { const initialValues: LaunchRequestV2 = getInitialValuesForTaskExecutionForm( ctx.data, location.state ) return ( ) } break case RequestStatus.NOT_READY: default: return } }} )} ) export default Connected ================================================ FILE: ui/src/components/TaskRuns.tsx ================================================ import * as React from "react" import { Link } from "react-router-dom" import { get, omit, isArray, isString } from "lodash" import ListRequest, { ChildProps as ListRequestChildProps } from "./ListRequest" import api from "../api" import { ListTaskRunsParams, ListTaskRunsResponse, SortOrder, Run, RunStatus, ExecutionEngine, } from "../types" import pageToOffsetLimit from "../helpers/pageToOffsetLimit" import Table from "./Table" import { FormGroup, Classes, Spinner, Tag } from "@blueprintjs/core" import GenericMultiSelect from "./GenericMultiSelect" import RunStatusSelect from "./RunStatusSelect" import ListFiltersDropdown from "./ListFiltersDropdown" import { DebounceInput } from "react-debounce-input" import Pagination from "./Pagination" import { PAGE_SIZE } from "../constants" import { RequestStatus } from "./Request" import ErrorCallout from "./ErrorCallout" import RunTag from "./RunTag" import ISO8601AttributeValue from "./ISO8601AttributeValue" import EnvQueryFilter from "./EnvQueryFilter" import Duration from "./Duration" export const initialQuery = { page: 1, sort_by: "started_at", order: SortOrder.DESC, } export type Props = ListRequestChildProps< ListTaskRunsResponse, { params: ListTaskRunsParams } > export const TaskRuns: React.FunctionComponent = ({ data, updateSort, currentSortKey, currentSortOrder, query, updateFilter, updatePage, currentPage, isLoading, requestStatus, error, }) => { let content: React.ReactNode // Preprocess `env` query to ensure that it's an array. let env: string | string[] = get(query, "env", []) if (!isArray(env) && isString(env)) env = [env] switch (requestStatus) { case RequestStatus.ERROR: content = break case RequestStatus.READY: content = ( items={get(data, "history", [])} getItemKey={(r: Run) => r.run_id} updateSort={updateSort} currentSortKey={currentSortKey} currentSortOrder={currentSortOrder} columns={{ run_id: { displayName: "Run ID", render: (r: Run) => ( {r.run_id} ), isSortable: true, }, status: { displayName: "Status", render: (r: Run) => , isSortable: true, }, engine: { displayName: "Engine", render: (r: Run) => {r.engine}, isSortable: false, }, duration: { displayName: "Duration", render: (r: Run) => r.started_at ? ( ) : ( "-" ), isSortable: false, }, started_at: { displayName: "Started At", render: (r: Run) => ( ), isSortable: true, }, finished_at: { displayName: "Finished At", render: (r: Run) => ( ), isSortable: true, }, cluster: { displayName: "Cluster", render: (r: Run) => r.engine === ExecutionEngine.EKS ? "-" : r.cluster, isSortable: false, }, }} /> ) break case RequestStatus.NOT_READY: default: content = break } return ( <>
{ updateFilter("status", value) }} isDisabled={false} /> { updateFilter("env", value) }} /> { updateFilter("cluster_name", value) }} isDisabled={false} /> ) => { updateFilter("started_at_since", evt.target.value) }} /> ) => { updateFilter("started_at_until", evt.target.value) }} /> ) => { updateFilter("finished_at_since", evt.target.value) }} /> ) => { updateFilter("finished_at_until", evt.target.value) }} />
{content} ) } const ConnectedTaskRuns: React.FunctionComponent<{ definitionID: string }> = ({ definitionID, }) => ( requestFn={api.listTaskRuns} initialQuery={initialQuery} // @TODO: this function should be extracted and tested. getRequestArgs={params => ({ definitionID, params: { ...omit(params, "page"), ...pageToOffsetLimit({ page: get(params, "page", 1), limit: PAGE_SIZE, }), }, })} > {props => } ) export default ConnectedTaskRuns ================================================ FILE: ui/src/components/Tasks.tsx ================================================ import * as React from "react" import { Link } from "react-router-dom" import { get, omit } from "lodash" import { DebounceInput } from "react-debounce-input" import { FormGroup, Classes, Spinner } from "@blueprintjs/core" import ListRequest, { ChildProps as ListRequestChildProps } from "./ListRequest" import api from "../api" import { ListTaskParams, ListTaskResponse, SortOrder, Task } from "../types" import pageToOffsetLimit from "../helpers/pageToOffsetLimit" import Table from "./Table" import Pagination from "./Pagination" import GroupNameSelect from "./GroupNameSelect" import ViewHeader from "./ViewHeader" import ListFiltersDropdown from "./ListFiltersDropdown" import { PAGE_SIZE } from "../constants" import { RequestStatus } from "./Request" import ErrorCallout from "./ErrorCallout" export const initialQuery = { page: 1, sort_by: "alias", order: SortOrder.ASC, } export type Props = ListRequestChildProps< ListTaskResponse, { params: ListTaskParams } > export const Tasks: React.FunctionComponent = props => { const { query, data, updateFilter, updatePage, updateSort, currentPage, currentSortKey, currentSortOrder, isLoading, requestStatus, error, } = props let content: React.ReactNode switch (requestStatus) { case RequestStatus.ERROR: content = break case RequestStatus.READY: content = ( items={get(data, "definitions", [])} getItemKey={(task: Task) => task.definition_id} updateSort={updateSort} currentSortKey={currentSortKey} currentSortOrder={currentSortOrder} columns={{ alias: { displayName: "Alias", render: (item: Task) => ( {item.alias} ), isSortable: true, }, group_name: { displayName: "Group Name", render: (item: Task) => item.group_name, isSortable: true, }, image: { displayName: "Image", render: (item: Task) => item.image, isSortable: true, }, memory: { displayName: "Memory (MB)", render: (item: Task) => item.memory, isSortable: true, }, }} /> ) break case RequestStatus.NOT_READY: default: content = break } return ( <> Create Task } />
) => { updateFilter("alias", evt.target.value) }} placeholder="Search by task alias..." /> { updateFilter("group_name", value) }} isDisabled={false} /> ) => { updateFilter("image", evt.target.value) }} />
{content} ) } const ConnectedTasks: React.FunctionComponent = () => ( requestFn={api.listTasks} initialQuery={initialQuery} getRequestArgs={params => ({ params: { ...omit(params, "page"), ...pageToOffsetLimit({ page: get(params, "page", 1), limit: PAGE_SIZE, }), }, })} > {props => } ) export default ConnectedTasks ================================================ FILE: ui/src/components/Template.tsx ================================================ import * as React from "react" import { Switch, Route, RouteComponentProps } from "react-router-dom" import Request, { ChildProps, RequestStatus } from "./Request" import api from "../api" import { Template as TemplateShape } from "../types" import TemplateDetails from "./TemplateDetails" import TemplateExecutionForm from "./TemplateExecutionForm" export type TemplateCtx = ChildProps & { basePath: string templateID: string } export const TemplateContext = React.createContext({ data: null, requestStatus: RequestStatus.NOT_READY, isLoading: false, error: null, request: () => {}, basePath: "", // TODO: maybe this is not required. templateID: "", receivedAt: null, }) export const Template: React.FunctionComponent = props => { return ( ) } type ConnectedProps = RouteComponentProps<{ templateID: string }> const Connected: React.FunctionComponent = ({ match }) => ( requestFn={api.getTemplate} initialRequestArgs={{ templateID: match.params.templateID }} > {props => (