Repository: keephq/keep Branch: main Commit: 2403b2b71042 Files: 2212 Total size: 8.9 MB Directory structure: gitextract_gaok0hlb/ ├── .cursor/ │ └── rules/ │ ├── keep-ui-react-typescript.mdc │ └── keep-ui-tests.mdc ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ ├── config.yml │ │ ├── documentation.md │ │ ├── feature_request.md │ │ ├── new_provider_request.md │ │ └── use_case.md │ └── workflows/ │ ├── auto-release.yml │ ├── auto-resolve-keep.yml │ ├── but-to-project.yml │ ├── developer-onboarding-notification.yml │ ├── lint-pr.yml │ ├── release-workflow-schema.yml │ ├── release.yml │ ├── run-e2e-tests.yml │ ├── sync-keep-workflows.yml │ ├── test-docs.yml │ ├── test-pr-e2e.yml │ ├── test-pr-integrations.yml │ ├── test-pr-ut-ui.yml │ ├── test-pr-ut.yml │ └── test-workflow-examples.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docker/ │ ├── Dockerfile.api │ ├── Dockerfile.cli │ ├── Dockerfile.dev.api │ ├── Dockerfile.dev.ui │ └── Dockerfile.ui ├── docker-compose-with-arq.yml ├── docker-compose-with-auth.yml ├── docker-compose-with-otel.yaml ├── docker-compose.common.yml ├── docker-compose.dev.yml ├── docker-compose.yml ├── docs/ │ ├── README.md │ ├── alertevaluation/ │ │ ├── examples/ │ │ │ ├── victoriametricsmulti.mdx │ │ │ └── victoriametricssingle.mdx │ │ └── overview.mdx │ ├── alerts/ │ │ ├── actionmenu.mdx │ │ ├── overview.mdx │ │ ├── presets.mdx │ │ ├── sidebar.mdx │ │ ├── sound.mdx │ │ └── table.mdx │ ├── applications/ │ │ └── github.mdx │ ├── authentication/ │ │ └── okta.md │ ├── cli/ │ │ ├── commands/ │ │ │ ├── alert-enrich.mdx │ │ │ ├── alert-get.mdx │ │ │ ├── alert-list.mdx │ │ │ ├── cli-alert.mdx │ │ │ ├── cli-api.mdx │ │ │ ├── cli-config-new.mdx │ │ │ ├── cli-config-show.mdx │ │ │ ├── cli-config.mdx │ │ │ ├── cli-provider.mdx │ │ │ ├── cli-run.mdx │ │ │ ├── cli-version.mdx │ │ │ ├── cli-whoami.mdx │ │ │ ├── cli-workflow.mdx │ │ │ ├── 
cli.mdx │ │ │ ├── extraction-create.mdx │ │ │ ├── extraction-delete.mdx │ │ │ ├── extractions-list.mdx │ │ │ ├── mappings-create.mdx │ │ │ ├── mappings-delete.mdx │ │ │ ├── mappings-list.mdx │ │ │ ├── provider-connect.mdx │ │ │ ├── provider-delete.mdx │ │ │ ├── provider-list.mdx │ │ │ ├── runs-list.mdx │ │ │ ├── runs-logs.mdx │ │ │ ├── workflow-apply.mdx │ │ │ ├── workflow-list.mdx │ │ │ ├── workflow-run.mdx │ │ │ └── workflow-runs.mdx │ │ ├── github-actions.mdx │ │ ├── installation.mdx │ │ └── overview.mdx │ ├── deployment/ │ │ ├── authentication/ │ │ │ ├── auth0-auth.mdx │ │ │ ├── azuread-auth.mdx │ │ │ ├── db-auth.mdx │ │ │ ├── keycloak-auth.mdx │ │ │ ├── no-auth.mdx │ │ │ ├── oauth2-proxy-gitlab.mdx │ │ │ ├── oauth2proxy-auth.mdx │ │ │ ├── okta-auth.mdx │ │ │ ├── onelogin-auth.mdx │ │ │ └── overview.mdx │ │ ├── configuration.mdx │ │ ├── docker.mdx │ │ ├── ecs.mdx │ │ ├── kubernetes/ │ │ │ ├── architecture.mdx │ │ │ ├── installation.mdx │ │ │ ├── openshift.mdx │ │ │ └── overview.mdx │ │ ├── local-llm/ │ │ │ └── keep-with-litellm.mdx │ │ ├── monitoring.mdx │ │ ├── provision/ │ │ │ ├── dashboard.mdx │ │ │ ├── overview.mdx │ │ │ ├── provider.mdx │ │ │ └── workflow.mdx │ │ ├── secret-store.mdx │ │ └── stress-testing.mdx │ ├── development/ │ │ ├── external-url.mdx │ │ └── getting-started.mdx │ ├── images/ │ │ └── datadog_raw_alerts.txt │ ├── incidents/ │ │ ├── facets.mdx │ │ └── overview.mdx │ ├── mint.json │ ├── openapi.json │ ├── overview/ │ │ ├── ai-correlation.mdx │ │ ├── ai-in-workflows.mdx │ │ ├── ai-incident-assistant.mdx │ │ ├── ai-semi-automatic-correlation.mdx │ │ ├── ai-workflow-assistant.mdx │ │ ├── alertseverityandstatus.mdx │ │ ├── cel.mdx │ │ ├── comparisons.mdx │ │ ├── correlation-rules.mdx │ │ ├── correlation-topology.mdx │ │ ├── deduplication.mdx │ │ ├── enrichment/ │ │ │ ├── extraction.mdx │ │ │ └── mapping.mdx │ │ ├── faq.mdx │ │ ├── fingerprints.mdx │ │ ├── glossary.mdx │ │ ├── howdoeskeepgetmyalerts.mdx │ │ ├── introduction.mdx │ │ ├── 
maintenance-windows.mdx │ │ ├── playground.mdx │ │ ├── servicetopology.mdx │ │ ├── support.mdx │ │ ├── usecases.mdx │ │ └── workflow-automation.mdx │ ├── providers/ │ │ ├── adding-a-new-provider.mdx │ │ ├── documentation/ │ │ │ ├── airflow-provider.mdx │ │ │ ├── aks-provider.mdx │ │ │ ├── amazonsqs-provider.mdx │ │ │ ├── anthropic-provider.mdx │ │ │ ├── appdynamics-provider.mdx │ │ │ ├── argocd-provider.mdx │ │ │ ├── asana-provider.mdx │ │ │ ├── auth0-provider.mdx │ │ │ ├── axiom-provider.mdx │ │ │ ├── azuremonitoring-provider.mdx │ │ │ ├── bash-provider.mdx │ │ │ ├── bigquery-provider.mdx │ │ │ ├── centreon-provider.mdx │ │ │ ├── checkly-provider.mdx │ │ │ ├── checkmk-provider.mdx │ │ │ ├── cilium-provider.mdx │ │ │ ├── clickhouse-provider.mdx │ │ │ ├── cloudwatch-provider.mdx │ │ │ ├── console-provider.mdx │ │ │ ├── coralogix-provider.mdx │ │ │ ├── dash0-provider.mdx │ │ │ ├── databend-provider.mdx │ │ │ ├── datadog-provider.mdx │ │ │ ├── deepseek-provider.mdx │ │ │ ├── discord-provider.mdx │ │ │ ├── dynatrace-provider.mdx │ │ │ ├── eks-provider.mdx │ │ │ ├── elastic-provider.mdx │ │ │ ├── flashduty-provider.mdx │ │ │ ├── fluxcd-provider.mdx │ │ │ ├── gcpmonitoring-provider.mdx │ │ │ ├── gemini-provider.mdx │ │ │ ├── github-provider.mdx │ │ │ ├── github_workflows_provider.mdx │ │ │ ├── gitlab-provider.mdx │ │ │ ├── gitlabpipelines-provider.mdx │ │ │ ├── gke-provider.mdx │ │ │ ├── google_chat-provider.mdx │ │ │ ├── grafana-provider.mdx │ │ │ ├── grafana_incident-provider.mdx │ │ │ ├── grafana_loki-provider.mdx │ │ │ ├── grafana_oncall-provider.mdx │ │ │ ├── graylog-provider.mdx │ │ │ ├── grok-provider.mdx │ │ │ ├── http-provider.mdx │ │ │ ├── icinga2-provider.mdx │ │ │ ├── ilert-provider.mdx │ │ │ ├── incidentio-provider.mdx │ │ │ ├── incidentmanager-provider.mdx │ │ │ ├── jira-on-prem-provider.mdx │ │ │ ├── jira-provider.mdx │ │ │ ├── kafka-provider.mdx │ │ │ ├── keep-provider.mdx │ │ │ ├── kibana-provider.mdx │ │ │ ├── kubernetes-provider.mdx │ │ │ ├── 
libre_nms-provider.mdx │ │ │ ├── linear_provider.mdx │ │ │ ├── linearb-provider.mdx │ │ │ ├── litellm-provider.mdx │ │ │ ├── llamacpp-provider.mdx │ │ │ ├── mailgun-provider.mdx │ │ │ ├── mattermost-provider.mdx │ │ │ ├── mock-provider.mdx │ │ │ ├── monday-provider.mdx │ │ │ ├── mongodb-provider.mdx │ │ │ ├── mysql-provider.mdx │ │ │ ├── netbox-provider.mdx │ │ │ ├── netdata-provider.mdx │ │ │ ├── new-relic-provider.mdx │ │ │ ├── ntfy-provider.mdx │ │ │ ├── ollama-provider.mdx │ │ │ ├── openai-provider.mdx │ │ │ ├── openobserve-provider.mdx │ │ │ ├── opensearchserverless-provider.mdx │ │ │ ├── openshift-provider.mdx │ │ │ ├── opsgenie-provider.mdx │ │ │ ├── pagerduty-provider.mdx │ │ │ ├── pagertree-provider.mdx │ │ │ ├── parseable-provider.mdx │ │ │ ├── pingdom-provider.mdx │ │ │ ├── planner-provider.mdx │ │ │ ├── postgresql-provider.mdx │ │ │ ├── posthog-provider.mdx │ │ │ ├── prometheus-provider.mdx │ │ │ ├── pushover-provider.mdx │ │ │ ├── python-provider.mdx │ │ │ ├── quickchart-provider.mdx │ │ │ ├── redmine-provider.mdx │ │ │ ├── resend-provider.mdx │ │ │ ├── rollbar-provider.mdx │ │ │ ├── s3-provider.mdx │ │ │ ├── sendgrid-provider.mdx │ │ │ ├── sentry-provider.mdx │ │ │ ├── service-now-provider.mdx │ │ │ ├── signalfx-provider.mdx │ │ │ ├── signl4-provider.mdx │ │ │ ├── site24x7-provider.mdx │ │ │ ├── slack-provider.mdx │ │ │ ├── smtp-provider.mdx │ │ │ ├── snowflake-provider.mdx │ │ │ ├── splunk-provider.mdx │ │ │ ├── squadcast-provider.mdx │ │ │ ├── ssh-provider.mdx │ │ │ ├── statuscake-provider.mdx │ │ │ ├── sumologic-provider.mdx │ │ │ ├── teams-provider.mdx │ │ │ ├── telegram-provider.mdx │ │ │ ├── template.mdx │ │ │ ├── thousandeyes-provider.mdx │ │ │ ├── trello-provider.mdx │ │ │ ├── twilio-provider.mdx │ │ │ ├── uptimekuma-provider.mdx │ │ │ ├── victorialogs-provider.mdx │ │ │ ├── victoriametrics-provider.mdx │ │ │ ├── vllm-provider.mdx │ │ │ ├── wazuh-provider.mdx │ │ │ ├── webhook-provider.mdx │ │ │ ├── websocket-provider.mdx │ │ │ ├── 
youtrack-provider.mdx │ │ │ ├── zabbix-provider.mdx │ │ │ ├── zenduty-provider.mdx │ │ │ ├── zoom-provider.mdx │ │ │ └── zoom_chat-provider.mdx │ │ ├── linked-providers.mdx │ │ ├── overview.md │ │ ├── overview.mdx │ │ └── provider-methods.mdx │ ├── snippets/ │ │ └── providers/ │ │ ├── airflow-snippet-autogenerated.mdx │ │ ├── aks-snippet-autogenerated.mdx │ │ ├── amazonsqs-snippet-autogenerated.mdx │ │ ├── anthropic-snippet-autogenerated.mdx │ │ ├── appdynamics-snippet-autogenerated.mdx │ │ ├── argocd-snippet-autogenerated.mdx │ │ ├── asana-snippet-autogenerated.mdx │ │ ├── auth0-snippet-autogenerated.mdx │ │ ├── axiom-snippet-autogenerated.mdx │ │ ├── azuremonitoring-snippet-autogenerated.mdx │ │ ├── base-snippet-autogenerated.mdx │ │ ├── bash-snippet-autogenerated.mdx │ │ ├── bigquery-snippet-autogenerated.mdx │ │ ├── centreon-snippet-autogenerated.mdx │ │ ├── checkly-snippet-autogenerated.mdx │ │ ├── checkmk-snippet-autogenerated.mdx │ │ ├── cilium-snippet-autogenerated.mdx │ │ ├── clickhouse-snippet-autogenerated.mdx │ │ ├── cloudwatch-snippet-autogenerated.mdx │ │ ├── console-snippet-autogenerated.mdx │ │ ├── coralogix-snippet-autogenerated.mdx │ │ ├── dash0-snippet-autogenerated.mdx │ │ ├── databend-snippet-autogenerated.mdx │ │ ├── datadog-snippet-autogenerated.mdx │ │ ├── deepseek-snippet-autogenerated.mdx │ │ ├── discord-snippet-autogenerated.mdx │ │ ├── dynatrace-snippet-autogenerated.mdx │ │ ├── eks-snippet-autogenerated.mdx │ │ ├── elastic-snippet-autogenerated.mdx │ │ ├── flashduty-snippet-autogenerated.mdx │ │ ├── fluxcd-snippet-autogenerated.mdx │ │ ├── gcpmonitoring-snippet-autogenerated.mdx │ │ ├── gemini-snippet-autogenerated.mdx │ │ ├── github-snippet-autogenerated.mdx │ │ ├── github_workflows-snippet-autogenerated.mdx │ │ ├── gitlab-snippet-autogenerated.mdx │ │ ├── gitlabpipelines-snippet-autogenerated.mdx │ │ ├── gke-snippet-autogenerated.mdx │ │ ├── google_chat-snippet-autogenerated.mdx │ │ ├── grafana-snippet-autogenerated.mdx │ │ ├── 
grafana_incident-snippet-autogenerated.mdx │ │ ├── grafana_loki-snippet-autogenerated.mdx │ │ ├── grafana_oncall-snippet-autogenerated.mdx │ │ ├── graylog-snippet-autogenerated.mdx │ │ ├── grok-snippet-autogenerated.mdx │ │ ├── http-snippet-autogenerated.mdx │ │ ├── icinga2-snippet-autogenerated.mdx │ │ ├── ilert-snippet-autogenerated.mdx │ │ ├── incidentio-snippet-autogenerated.mdx │ │ ├── incidentmanager-snippet-autogenerated.mdx │ │ ├── jira-snippet-autogenerated.mdx │ │ ├── jiraonprem-snippet-autogenerated.mdx │ │ ├── kafka-snippet-autogenerated.mdx │ │ ├── keep-snippet-autogenerated.mdx │ │ ├── kibana-snippet-autogenerated.mdx │ │ ├── kubernetes-snippet-autogenerated.mdx │ │ ├── libre_nms-snippet-autogenerated.mdx │ │ ├── linear-snippet-autogenerated.mdx │ │ ├── linearb-snippet-autogenerated.mdx │ │ ├── litellm-snippet-autogenerated.mdx │ │ ├── llamacpp-snippet-autogenerated.mdx │ │ ├── mailgun-snippet-autogenerated.mdx │ │ ├── mattermost-snippet-autogenerated.mdx │ │ ├── mock-snippet-autogenerated.mdx │ │ ├── monday-snippet-autogenerated.mdx │ │ ├── mongodb-snippet-autogenerated.mdx │ │ ├── mysql-snippet-autogenerated.mdx │ │ ├── netbox-snippet-autogenerated.mdx │ │ ├── netdata-snippet-autogenerated.mdx │ │ ├── netxms-snippet-autogenerated.mdx │ │ ├── newrelic-snippet-autogenerated.mdx │ │ ├── ntfy-snippet-autogenerated.mdx │ │ ├── ollama-snippet-autogenerated.mdx │ │ ├── openai-snippet-autogenerated.mdx │ │ ├── openobserve-snippet-autogenerated.mdx │ │ ├── opensearchserverless-snippet-autogenerated.mdx │ │ ├── openshift-snippet-autogenerated.mdx │ │ ├── opsgenie-snippet-autogenerated.mdx │ │ ├── pagerduty-snippet-autogenerated.mdx │ │ ├── pagertree-snippet-autogenerated.mdx │ │ ├── parseable-snippet-autogenerated.mdx │ │ ├── pingdom-snippet-autogenerated.mdx │ │ ├── planner-snippet-autogenerated.mdx │ │ ├── postgres-snippet-autogenerated.mdx │ │ ├── posthog-snippet-autogenerated.mdx │ │ ├── prometheus-snippet-autogenerated.mdx │ │ ├── 
pushover-snippet-autogenerated.mdx │ │ ├── python-snippet-autogenerated.mdx │ │ ├── quickchart-snippet-autogenerated.mdx │ │ ├── redmine-snippet-autogenerated.mdx │ │ ├── resend-snippet-autogenerated.mdx │ │ ├── rollbar-snippet-autogenerated.mdx │ │ ├── s3-snippet-autogenerated.mdx │ │ ├── salesforce-snippet-autogenerated.mdx │ │ ├── sendgrid-snippet-autogenerated.mdx │ │ ├── sentry-snippet-autogenerated.mdx │ │ ├── servicenow-snippet-autogenerated.mdx │ │ ├── signalfx-snippet-autogenerated.mdx │ │ ├── signl4-snippet-autogenerated.mdx │ │ ├── site24x7-snippet-autogenerated.mdx │ │ ├── slack-snippet-autogenerated.mdx │ │ ├── smtp-snippet-autogenerated.mdx │ │ ├── snowflake-snippet-autogenerated.mdx │ │ ├── splunk-snippet-autogenerated.mdx │ │ ├── squadcast-snippet-autogenerated.mdx │ │ ├── ssh-snippet-autogenerated.mdx │ │ ├── statuscake-snippet-autogenerated.mdx │ │ ├── sumologic-snippet-autogenerated.mdx │ │ ├── teams-snippet-autogenerated.mdx │ │ ├── telegram-snippet-autogenerated.mdx │ │ ├── test_fluxcd-snippet-autogenerated.mdx │ │ ├── thousandeyes-snippet-autogenerated.mdx │ │ ├── trello-snippet-autogenerated.mdx │ │ ├── twilio-snippet-autogenerated.mdx │ │ ├── uptimekuma-snippet-autogenerated.mdx │ │ ├── vectordev-snippet-autogenerated.mdx │ │ ├── victorialogs-snippet-autogenerated.mdx │ │ ├── victoriametrics-snippet-autogenerated.mdx │ │ ├── vllm-snippet-autogenerated.mdx │ │ ├── wazuh-snippet-autogenerated.mdx │ │ ├── webhook-snippet-autogenerated.mdx │ │ ├── websocket-snippet-autogenerated.mdx │ │ ├── youtrack-snippet-autogenerated.mdx │ │ ├── zabbix-snippet-autogenerated.mdx │ │ ├── zendesk-snippet-autogenerated.mdx │ │ ├── zenduty-snippet-autogenerated.mdx │ │ ├── zoom-snippet-autogenerated.mdx │ │ └── zoom_chat-snippet-autogenerated.mdx │ └── workflows/ │ ├── examples/ │ │ ├── autosupress.mdx │ │ ├── buisnesshours.mdx │ │ ├── create-servicenow-tickets.mdx │ │ ├── highsev.mdx │ │ └── update-servicenow-tickets.mdx │ ├── overview.mdx │ └── syntax/ │ ├── 
conditions.mdx │ ├── context.mdx │ ├── enrichment.mdx │ ├── foreach.mdx │ ├── functions.mdx │ ├── permissions.mdx │ ├── providers.mdx │ ├── steps-and-actions.mdx │ └── triggers.mdx ├── ee/ │ ├── LICENSE │ └── identitymanager/ │ ├── __init__.py │ └── identity_managers/ │ ├── __init__.py │ ├── auth0/ │ │ ├── __init__.py │ │ ├── auth0_authverifier.py │ │ ├── auth0_identitymanager.py │ │ └── auth0_utils.py │ ├── azuread/ │ │ ├── __init__.py │ │ ├── azuread_authverifier.py │ │ └── azuread_identitymanager.py │ └── keycloak/ │ ├── __init__.py │ ├── keycloak_authverifier.py │ └── keycloak_identitymanager.py ├── elk/ │ ├── README.md │ ├── docker-compose-elk.yml │ ├── filebeat.yml │ └── logstash.conf ├── examples/ │ ├── providers/ │ │ ├── airflow-prod.yaml │ │ └── telegram-bot.yaml │ └── workflows/ │ ├── aks_basic.yml │ ├── autosupress.yml │ ├── bash_example.yml │ ├── bigquery.yml │ ├── blogpost.yml │ ├── businesshours.yml │ ├── change.yml │ ├── clickhouse_multiquery.yml │ ├── complex-conditions-cel.yml │ ├── conditionally_run_if_ai_says_so.yaml │ ├── console_example.yml │ ├── consts_and_dict.yml │ ├── consts_and_vars.yml │ ├── create-issue-youtrack.yaml │ ├── create-new-incident-grafana-incident.yaml │ ├── create-task-in-asana.yaml │ ├── create_alert_from_vm_metric.yml │ ├── create_alert_in_keep.yml │ ├── create_alerts_from_elastic.yml │ ├── create_alerts_from_mysql.yml │ ├── create_jira_ticket_upon_alerts.yml │ ├── create_multi_alert_from_vm_metric.yml │ ├── create_service_now_ticket_upon_alerts.yml │ ├── datadog-log-monitor.yml │ ├── db_disk_space_monitor.yml │ ├── discord_basic.yml │ ├── disk_grown_defects_rule.yml │ ├── eks_advanced.yml │ ├── eks_basic.yml │ ├── elastic_basic.yml │ ├── elastic_enrich_example.yml │ ├── enrich_using_structured_output_from_deepseek.yaml │ ├── enrich_using_structured_output_from_openai.yaml │ ├── enrich_using_structured_output_from_vllm_qwen.yaml │ ├── failed-to-login-workflow.yml │ ├── flashduty_example.yml │ ├── fluxcd_example.yml │ ├── 
gcp_logging_open_ai.yaml │ ├── gke.yml │ ├── http_enrich.yml │ ├── ifelse.yml │ ├── ilert-incident-upon-alert.yaml │ ├── incident-enrich.yaml │ ├── incident-tier-escalation.yml │ ├── incident_example.yml │ ├── inputs_example.yml │ ├── jira-create-ticket-on-alert.yml │ ├── jira-transition-on-resolved.yml │ ├── jira_on_prem.yml │ ├── monday_create_pulse.yml │ ├── multi-condition-cel.yml │ ├── mustache-paths-example.yml │ ├── new-auth0-users-monitor.yml │ ├── new_github_stars.yml │ ├── notify-new-trello-card.yml │ ├── ntfy_basic.yml │ ├── opensearchserverless_basic.yml │ ├── openshift_basic.yml │ ├── openshift_monitoring_and_remediation.yml │ ├── openshift_pod_restart.yml │ ├── opsgenie-close-alert.yml │ ├── opsgenie-create-alert-cel.yml │ ├── opsgenie-create-alert.yml │ ├── opsgenie_open_alerts.yml │ ├── pagerduty.yml │ ├── pattern-matching-cel.yml │ ├── permissions_example.yml │ ├── planner_basic.yml │ ├── posthog_example.yml │ ├── query-databend.yml │ ├── query_clickhouse.yml │ ├── query_grafana_loki.yaml │ ├── query_mongodb.yaml │ ├── query_victorialogs.yaml │ ├── query_victoriametrics.yml │ ├── raw_sql_query_datetime.yml │ ├── resolve_old_alerts.yml │ ├── retrieve_cloudwatch_logs.yaml │ ├── run-github-workflow.yaml │ ├── send-message-telegram-with-htmlmd.yaml │ ├── send_slack_message_on_failure.yaml │ ├── send_smtp_email.yml │ ├── send_smtp_html_email.yml │ ├── sendgrid_basic.yml │ ├── service-error-rate-monitor-datadog.yml │ ├── severity_changed.yml │ ├── signl4-alerting-workflow.yaml │ ├── simple_http_request_ntfy.yml │ ├── slack-message-reaction.yml │ ├── slack-workflow-trigger.yml │ ├── slack_basic.yml │ ├── slack_basic_cel.yml │ ├── slack_basic_interval.yml │ ├── slack_message_update.yml │ ├── squadcast_example.yml │ ├── teams-adaptive-card-notifier.yaml │ ├── teams-adaptive-cards-with-mentions.yaml │ ├── telegram_advanced.yml │ ├── telegram_basic.yml │ ├── test_jira_create_with_custom_fields.yml │ ├── test_jira_custom_fields_fix.yml │ ├── 
update-incident-grafana-incident.yaml │ ├── update-task-in-asana.yaml │ ├── update_jira_ticket.yml │ ├── update_service_now_tickets_status.yml │ ├── update_workflows_from_http.yml │ ├── update_workflows_from_s3.yml │ ├── webhook_example.yml │ ├── webhook_example_foreach.yml │ ├── workflow_only_first_time_example.yml │ ├── workflow_start_example.yml │ ├── zoom_chat_example.yml │ └── zoom_example.yml ├── keep/ │ ├── actions/ │ │ ├── __init__.py │ │ ├── actions_exception.py │ │ └── actions_factory.py │ ├── alembic.ini │ ├── api/ │ │ ├── __init__.py │ │ ├── alert_deduplicator/ │ │ │ ├── __init__.py │ │ │ ├── alert_deduplicator.py │ │ │ └── deduplication_rules_provisioning.py │ │ ├── api.py │ │ ├── arq_pool.py │ │ ├── arq_worker.py │ │ ├── arq_worker_debug_patch.py │ │ ├── arq_worker_gunicorn.py │ │ ├── bl/ │ │ │ ├── ai_suggestion_bl.py │ │ │ ├── dismissal_expiry_bl.py │ │ │ ├── enrichments_bl.py │ │ │ ├── incident_reports.py │ │ │ ├── incidents_bl.py │ │ │ └── maintenance_windows_bl.py │ │ ├── config.py │ │ ├── consts.py │ │ ├── core/ │ │ │ ├── alerts.py │ │ │ ├── cel_to_sql/ │ │ │ │ ├── ast_nodes.py │ │ │ │ ├── cel_ast_converter.py │ │ │ │ ├── properties_mapper.py │ │ │ │ ├── properties_metadata.py │ │ │ │ └── sql_providers/ │ │ │ │ ├── base.py │ │ │ │ ├── get_cel_to_sql_provider_for_dialect.py │ │ │ │ ├── mysql.py │ │ │ │ ├── postgresql.py │ │ │ │ └── sqlite.py │ │ │ ├── config.py │ │ │ ├── db.py │ │ │ ├── db_on_start.py │ │ │ ├── db_utils.py │ │ │ ├── demo_mode.py │ │ │ ├── dependencies.py │ │ │ ├── elastic.py │ │ │ ├── facets.py │ │ │ ├── facets_query_builder/ │ │ │ │ ├── base_facets_query_builder.py │ │ │ │ ├── get_facets_query_builder.py │ │ │ │ ├── mysql.py │ │ │ │ ├── postgresql.py │ │ │ │ ├── sqlite.py │ │ │ │ └── utils.py │ │ │ ├── incidents.py │ │ │ ├── limiter.py │ │ │ ├── metrics.py │ │ │ ├── report_uptime.py │ │ │ ├── tenant_configuration.py │ │ │ ├── tracer.py │ │ │ └── workflows.py │ │ ├── custom_worker.py │ │ ├── logging.py │ │ ├── middlewares.py │ │ 
├── models/ │ │ │ ├── __init__.py │ │ │ ├── action.py │ │ │ ├── action_type.py │ │ │ ├── ai_external.py │ │ │ ├── alert.py │ │ │ ├── alert_audit.py │ │ │ ├── db/ │ │ │ │ ├── action.py │ │ │ │ ├── ai_external.py │ │ │ │ ├── ai_suggestion.py │ │ │ │ ├── alert.py │ │ │ │ ├── dashboard.py │ │ │ │ ├── enrichment_event.py │ │ │ │ ├── extraction.py │ │ │ │ ├── facet.py │ │ │ │ ├── helpers.py │ │ │ │ ├── incident.py │ │ │ │ ├── maintenance_window.py │ │ │ │ ├── mapping.py │ │ │ │ ├── migrations/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── env.py │ │ │ │ │ ├── script.py.mako │ │ │ │ │ └── versions/ │ │ │ │ │ ├── 2024-07-11-17-10_54c1252b2c8a.py │ │ │ │ │ ├── 2024-07-15-15-10_c37ec8f6db3e.py │ │ │ │ │ ├── 2024-07-16-12-16_37019ca3eb2e.py │ │ │ │ │ ├── 2024-07-17-16-46_dcbd2873dcfd.py │ │ │ │ │ ├── 2024-07-24-13-39_9ba0aeecd4d0.py │ │ │ │ │ ├── 2024-07-25-17-13_67f1efb93c99.py │ │ │ │ │ ├── 2024-07-28-16-24_8e5942040de6.py │ │ │ │ │ ├── 2024-07-29-12-51_c91b348b94f2.py │ │ │ │ │ ├── 2024-07-29-18-10_92f4f93f2140.py │ │ │ │ │ ├── 2024-08-05-13-09_4147d9e706c0.py │ │ │ │ │ ├── 2024-08-11-17-38_9453855f3ba0.py │ │ │ │ │ ├── 2024-08-13-19-22_0832e0d9889a.py │ │ │ │ │ ├── 2024-08-14-18-30_87594ea6d308.py │ │ │ │ │ ├── 2024-08-25-16-40_4ef2c767664c.py │ │ │ │ │ ├── 2024-08-25-16-48_1c650a429672.py │ │ │ │ │ ├── 2024-08-30-09-34_7ed12220a0d3.py │ │ │ │ │ ├── 2024-09-01-14-04_94886bc59c11.py │ │ │ │ │ ├── 2024-09-02-12-07_70671c95028e.py │ │ │ │ │ ├── 2024-09-03-10-08_49e7c02579db.py │ │ │ │ │ ├── 2024-09-03-16-24_1a5eb7069f9a.py │ │ │ │ │ ├── 2024-09-04-13-09_e6653be70b62.py │ │ │ │ │ ├── 2024-09-08-17-51_1aacee84447e.py │ │ │ │ │ ├── 2024-09-13-10-48_938b1aa62d5c.py │ │ │ │ │ ├── 2024-09-17-23-30_c5443d9deb0f.py │ │ │ │ │ ├── 2024-09-18-02-05_772790c2e50a.py │ │ │ │ │ ├── 2024-09-18-14-08_5d7ae55efc6a.py │ │ │ │ │ ├── 2024-09-19-15-26_493f217af6b6.py │ │ │ │ │ ├── 2024-09-22-14-16_01ebe17218c0.py │ │ │ │ │ ├── 2024-10-05-18-37_017d759805d9.py │ │ │ │ │ ├── 
2024-10-08-10-47_bf756df80e9d.py │ │ │ │ │ ├── 2024-10-14-08-34_83c1020be97d.py │ │ │ │ │ ├── 2024-10-22-10-38_8438f041ee0e.py │ │ │ │ │ ├── 2024-10-23-15-21_89b4d3905d26.py │ │ │ │ │ ├── 2024-10-26-17-03_3f056d747d9e.py │ │ │ │ │ ├── 2024-10-29-18-37_991b30bcf0b9.py │ │ │ │ │ ├── 2024-10-31-18-01_273b29f368b7.py │ │ │ │ │ ├── 2024-11-03-10-49_ef0b5b0df41c.py │ │ │ │ │ ├── 2024-11-08-20-58_895fe80117aa.py │ │ │ │ │ ├── 2024-11-10-13-06_620b6c048091.py │ │ │ │ │ ├── 2024-11-20-15-50_192157fd5788.py │ │ │ │ │ ├── 2024-12-01-16-40_3ad5308e7200.py │ │ │ │ │ ├── 2024-12-02-13-36_bdae8684d0b4.py │ │ │ │ │ ├── 2024-12-02-20-42_c6e5594c99f8.py │ │ │ │ │ ├── 2024-12-08-16-24_55cc64020f6d.py │ │ │ │ │ ├── 2024-12-10-19-11_7297ae99cd21.py │ │ │ │ │ ├── 2024-12-17-12-48_3d20d954e058.py │ │ │ │ │ ├── 2024-12-23-17-22_0c5e002094a9.py │ │ │ │ │ ├── 2024-12-23-18-49_4f8c4b185d5b.py │ │ │ │ │ ├── 2025-01-01-09-59_dcb7f88a04da.py │ │ │ │ │ ├── 2025-01-01-15-14_1c117f1accff.py │ │ │ │ │ ├── 2025-01-08-19-20_8a4ec08f2d6b.py │ │ │ │ │ ├── 2025-01-14-18-41_416155f25854.py │ │ │ │ │ ├── 2025-01-16-14-00_e3f33e571c3c.py │ │ │ │ │ ├── 2025-01-19-10-44_d359baaf0836.py │ │ │ │ │ ├── 2025-01-26-15-25_8176d7153747.py │ │ │ │ │ ├── 2025-02-05-15-46_e343054ae740.py │ │ │ │ │ ├── 2025-02-10-12-05_908d95386e29.py │ │ │ │ │ ├── 2025-02-11-12-59_21d314490e6a.py │ │ │ │ │ ├── 2025-02-13-09-54_cfe08cc46950.py │ │ │ │ │ ├── 2025-02-13-17-27_90e2d22edc6a.py │ │ │ │ │ ├── 2025-02-18-18-09_876a424d8f06.py │ │ │ │ │ ├── 2025-02-19-15-32_35ebba262eb0.py │ │ │ │ │ ├── 2025-02-20-23-15_ea25d9402518.py │ │ │ │ │ ├── 2025-02-25-14-20_a82154690f35.py │ │ │ │ │ ├── 2025-03-05-15-55_0b80bda47ee2.py │ │ │ │ │ ├── 2025-03-11-16-54_16309df224d1.py │ │ │ │ │ ├── 2025-03-12-13-22_ab333148350e.py │ │ │ │ │ ├── 2025-03-12-14-36_9f11356d8ed9.py │ │ │ │ │ ├── 2025-03-12-14-46_ca74b4a04371.py │ │ │ │ │ ├── 2025-03-13-14-08_c0e70149c9ec.py │ │ │ │ │ ├── 2025-03-14-15-52_f3ecc7411f38.py │ │ │ │ │ ├── 
2025-03-16-11-08_aff0128aa8f1.py │ │ │ │ │ ├── 2025-03-18-14-54_971abbbf0a2c.py │ │ │ │ │ ├── 2025-03-20-09-37_c0880e315ebe.py │ │ │ │ │ ├── 2025-03-24-14-26_2a6132b443ab.py │ │ │ │ │ ├── 2025-03-30-10-53_e663a98b1142.py │ │ │ │ │ ├── 2025-04-03-12-09_bdf252fbc1be.py │ │ │ │ │ ├── 2025-04-04-21-48_0dafe96ea97f.py │ │ │ │ │ ├── 2025-04-06-12-18_78777e6b12d3.py │ │ │ │ │ ├── 2025-04-08-10-43_59991b568c7d.py │ │ │ │ │ ├── 2025-04-15-15-30_885ff6b12fed.py │ │ │ │ │ ├── 2025-04-21-10-18_819927b7ccfa.py │ │ │ │ │ ├── 2025-05-04-15-02_eddcb77eb6f3.py │ │ │ │ │ ├── 2025-05-06-13-09_7b687c555318.py │ │ │ │ │ ├── 2025-05-12-17-49_c2f78c69e9cf.py │ │ │ │ │ ├── 2025-05-15-00-34_fcef2c58b21c.py │ │ │ │ │ ├── 2025-05-15-14-18_bedb5f07417b.py │ │ │ │ │ ├── 2025-05-16-14-33_aa167915c4d6.py │ │ │ │ │ ├── 2025-05-19-18-48_90e3eababbf0.py │ │ │ │ │ ├── 2025-05-19-20-54_combined_commentmention.py │ │ │ │ │ ├── 2025-06-04-10-43_7c14f776ef6b.py │ │ │ │ │ └── 2025-06-18-17-17_9dd1be4539e0.py │ │ │ │ ├── preset.py │ │ │ │ ├── provider.py │ │ │ │ ├── provider_image.py │ │ │ │ ├── rule.py │ │ │ │ ├── secret.py │ │ │ │ ├── statistics.py │ │ │ │ ├── system.py │ │ │ │ ├── tenant.py │ │ │ │ ├── topology.py │ │ │ │ ├── user.py │ │ │ │ └── workflow.py │ │ │ ├── facet.py │ │ │ ├── incident.py │ │ │ ├── provider.py │ │ │ ├── query.py │ │ │ ├── search_alert.py │ │ │ ├── severity_base.py │ │ │ ├── smtp.py │ │ │ ├── time_stamp.py │ │ │ ├── user.py │ │ │ ├── webhook.py │ │ │ └── workflow.py │ │ ├── observability.py │ │ ├── redis_settings.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── actions.py │ │ │ ├── ai.py │ │ │ ├── alerts.py │ │ │ ├── auth/ │ │ │ │ ├── __init__.py │ │ │ │ ├── groups.py │ │ │ │ ├── permissions.py │ │ │ │ ├── roles.py │ │ │ │ └── users.py │ │ │ ├── cel.py │ │ │ ├── dashboard.py │ │ │ ├── deduplications.py │ │ │ ├── extraction.py │ │ │ ├── facets.py │ │ │ ├── healthcheck.py │ │ │ ├── incidents.py │ │ │ ├── maintenance.py │ │ │ ├── mapping.py │ │ │ ├── metrics.py │ │ │ ├── 
preset.py │ │ │ ├── provider_images.py │ │ │ ├── providers.py │ │ │ ├── pusher.py │ │ │ ├── rules.py │ │ │ ├── settings.py │ │ │ ├── status.py │ │ │ ├── tags.py │ │ │ ├── topology.py │ │ │ ├── whoami.py │ │ │ └── workflows.py │ │ ├── tasks/ │ │ │ ├── __init__.py │ │ │ ├── notification_cache.py │ │ │ ├── process_event_task.py │ │ │ ├── process_incident_task.py │ │ │ ├── process_topology_task.py │ │ │ └── process_watcher_task.py │ │ └── utils/ │ │ ├── alert_utils.py │ │ ├── cel_utils.py │ │ ├── email_utils.py │ │ ├── enrichment_helpers.py │ │ ├── import_ee.py │ │ ├── pagination.py │ │ ├── pluralize.py │ │ ├── tenant_utils.py │ │ └── time_stamp_helpers.py │ ├── cli/ │ │ ├── cli.py │ │ └── click_extensions.py │ ├── conditions/ │ │ ├── __init__.py │ │ ├── assert_condition.py │ │ ├── base_condition.py │ │ ├── condition_factory.py │ │ ├── stddev_condition.py │ │ └── threshold_condition.py │ ├── contextmanager/ │ │ ├── __init__.py │ │ └── contextmanager.py │ ├── entrypoint.sh │ ├── event_subscriber/ │ │ ├── __init__.py │ │ └── event_subscriber.py │ ├── exceptions/ │ │ ├── __init__.py │ │ ├── action_error.py │ │ ├── provider_config_exception.py │ │ ├── provider_connection_failed.py │ │ └── provider_exception.py │ ├── functions/ │ │ ├── __init__.py │ │ └── cyaml.py │ ├── identitymanager/ │ │ ├── authenticatedentity.py │ │ ├── authverifierbase.py │ │ ├── identity_managers/ │ │ │ ├── __init__.py │ │ │ ├── db/ │ │ │ │ ├── __init__.py │ │ │ │ ├── db_authverifier.py │ │ │ │ └── db_identitymanager.py │ │ │ ├── noauth/ │ │ │ │ ├── __init__.py │ │ │ │ ├── noauth_authverifier.py │ │ │ │ └── noauth_identitymanager.py │ │ │ ├── oauth2proxy/ │ │ │ │ ├── __init__.py │ │ │ │ ├── oauth2proxy_authverifier.py │ │ │ │ └── oauth2proxy_identitymanager.py │ │ │ ├── okta/ │ │ │ │ ├── __init__.py │ │ │ │ ├── okta_authverifier.py │ │ │ │ └── okta_identitymanager.py │ │ │ └── onelogin/ │ │ │ ├── __init__.py │ │ │ ├── onelogin_authverifier.py │ │ │ └── onelogin_identitymanager.py │ │ ├── 
identitymanager.py │ │ ├── identitymanagerfactory.py │ │ └── rbac.py │ ├── iohandler/ │ │ └── iohandler.py │ ├── parser/ │ │ └── parser.py │ ├── providers/ │ │ ├── __init__.py │ │ ├── airflow_provider/ │ │ │ ├── __init__.py │ │ │ └── airflow_provider.py │ │ ├── aks_provider/ │ │ │ └── aks_provider.py │ │ ├── amazonsqs_provider/ │ │ │ ├── __init__.py │ │ │ └── amazonsqs_provider.py │ │ ├── anthropic_provider/ │ │ │ ├── __init__.py │ │ │ └── anthropic_provider.py │ │ ├── appdynamics_provider/ │ │ │ ├── __init__.py │ │ │ ├── appdynamics_provider.py │ │ │ └── httpactiontemplate.json │ │ ├── argocd_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── applicationset.yaml │ │ │ └── argocd_provider.py │ │ ├── asana_provider/ │ │ │ ├── __init__.py │ │ │ └── asana_provider.py │ │ ├── auth0_provider/ │ │ │ ├── __init__.py │ │ │ └── auth0_provider.py │ │ ├── axiom_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── axiom_provider.py │ │ ├── azuremonitoring_provider/ │ │ │ ├── __init__.py │ │ │ └── azuremonitoring_provider.py │ │ ├── base/ │ │ │ ├── __init__.py │ │ │ ├── base_provider.py │ │ │ └── provider_exceptions.py │ │ ├── bash_provider/ │ │ │ ├── __init__.py │ │ │ └── bash_provider.py │ │ ├── bigquery_provider/ │ │ │ ├── __init__.py │ │ │ └── bigquery_provider.py │ │ ├── centreon_provider/ │ │ │ ├── __init__.py │ │ │ └── centreon_provider.py │ │ ├── checkly_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── checkly_provider.py │ │ ├── checkmk_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ ├── checkmk_provider.py │ │ │ └── webhook-keep.py │ │ ├── cilium_provider/ │ │ │ ├── __init__.py │ │ │ ├── cilium_provider.py │ │ │ ├── generate_protobuf.py │ │ │ ├── grpc/ │ │ │ │ ├── __init__.py │ │ │ │ ├── flow/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── flow.proto │ │ │ │ │ ├── flow_pb2.py │ │ │ │ │ └── flow_pb2_grpc.py │ │ │ │ ├── google/ │ │ │ │ │ └── protobuf/ │ │ │ │ │ ├── duration.proto │ │ │ │ │ ├── 
timestamp.proto │ │ │ │ │ └── wrappers.proto │ │ │ │ ├── observer.proto │ │ │ │ ├── observer_pb2.py │ │ │ │ ├── observer_pb2_grpc.py │ │ │ │ └── relay/ │ │ │ │ ├── __init__.py │ │ │ │ ├── relay.proto │ │ │ │ ├── relay_pb2.py │ │ │ │ └── relay_pb2_grpc.py │ │ │ └── runtime_version.py │ │ ├── clickhouse_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── clickhouse-secure/ │ │ │ │ ├── certs/ │ │ │ │ │ ├── server.crt │ │ │ │ │ └── server.key │ │ │ │ ├── config.xml │ │ │ │ ├── docker-compose.yml │ │ │ │ └── users.xml │ │ │ └── clickhouse_provider.py │ │ ├── cloudwatch_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── cloudwatch_provider.py │ │ ├── console_provider/ │ │ │ ├── __init__.py │ │ │ └── console_provider.py │ │ ├── coralogix_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── coralogix_provider.py │ │ ├── dash0_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── dash0_provider.py │ │ ├── databend_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── databend_provider.py │ │ ├── datadog_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ ├── datadog_alert_format_description.py │ │ │ ├── datadog_provider.py │ │ │ └── topology_mock.py │ │ ├── deepseek_provider/ │ │ │ ├── __init__.py │ │ │ └── deepseek_provider.py │ │ ├── discord_provider/ │ │ │ ├── __init__.py │ │ │ └── discord_provider.py │ │ ├── dynatrace_provider/ │ │ │ ├── __init__.py │ │ │ └── dynatrace_provider.py │ │ ├── eks_provider/ │ │ │ └── eks_provider.py │ │ ├── elastic_provider/ │ │ │ ├── __init__.py │ │ │ └── elastic_provider.py │ │ ├── flashduty_provider/ │ │ │ ├── __init__.py │ │ │ └── flashduty_provider.py │ │ ├── fluxcd_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── example.yaml │ │ │ ├── fluxcd_provider.py │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ └── test_fluxcd_provider.py │ │ ├── gcpmonitoring_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── gcpmonitoring_provider.py │ │ 
├── gemini_provider/ │ │ │ ├── __init__.py │ │ │ └── gemini_provider.py │ │ ├── github_provider/ │ │ │ ├── __init__.py │ │ │ └── github_provider.py │ │ ├── github_workflows_provider/ │ │ │ ├── __init__.py │ │ │ └── github_workflows_provider.py │ │ ├── gitlab_provider/ │ │ │ ├── __init__.py │ │ │ └── gitlab_provider.py │ │ ├── gitlabpipelines_provider/ │ │ │ ├── __init__.py │ │ │ └── gitlabpipelines_provider.py │ │ ├── gke_provider/ │ │ │ ├── __init__.py │ │ │ └── gke_provider.py │ │ ├── google_chat_provider/ │ │ │ ├── __init__.py │ │ │ └── google_chat_provider.py │ │ ├── grafana_incident_provider/ │ │ │ ├── __init__.py │ │ │ └── grafana_incident_provider.py │ │ ├── grafana_loki_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── docker-compose.auth.yml │ │ │ └── grafana_loki_provider.py │ │ ├── grafana_oncall_provider/ │ │ │ ├── __init__.py │ │ │ └── grafana_oncall_provider.py │ │ ├── grafana_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ ├── docker-compose.yml │ │ │ ├── grafana/ │ │ │ │ ├── grafana.ini │ │ │ │ └── provisioning/ │ │ │ │ ├── access_control/ │ │ │ │ │ └── custom_roles.yml │ │ │ │ ├── alerting/ │ │ │ │ │ ├── alerts.yml │ │ │ │ │ ├── contact_points.yml │ │ │ │ │ └── notification_policies.yml │ │ │ │ ├── dashboards/ │ │ │ │ │ ├── dashboards.yml │ │ │ │ │ └── system.json │ │ │ │ ├── datasources/ │ │ │ │ │ └── datasource.yml │ │ │ │ ├── notifiers/ │ │ │ │ │ └── email.yml │ │ │ │ └── service_accounts/ │ │ │ │ ├── service_accounts.yml │ │ │ │ └── tokens.yml │ │ │ ├── grafana_alert_format_description.py │ │ │ ├── grafana_provider.py │ │ │ └── prometheus/ │ │ │ └── prometheus.yml │ │ ├── graylog_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ ├── docker-compose-v4.yml │ │ │ ├── docker-compose.yml │ │ │ └── graylog_provider.py │ │ ├── grok_provider/ │ │ │ ├── __init__.py │ │ │ └── grok_provider.py │ │ ├── http_provider/ │ │ │ ├── __init__.py │ │ │ └── http_provider.py │ │ ├── 
icinga2_provider/ │ │ │ └── icinga2_provider.py │ │ ├── ilert_provider/ │ │ │ ├── __init__.py │ │ │ └── ilert_provider.py │ │ ├── incidentio_provider/ │ │ │ ├── __init__.py │ │ │ └── incidentio_provider.py │ │ ├── incidentmanager_provider/ │ │ │ ├── __init__.py │ │ │ └── incidentmanager_provider.py │ │ ├── jira_provider/ │ │ │ ├── __init__.py │ │ │ └── jira_provider.py │ │ ├── jiraonprem_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── jiraonprem_provider.py │ │ ├── kafka_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── docker-compose-no-auth.yml │ │ │ ├── docker-compose.yml │ │ │ ├── kafka_provider.py │ │ │ └── kafka_server_jaas.conf │ │ ├── keep_provider/ │ │ │ ├── __init__.py │ │ │ └── keep_provider.py │ │ ├── kibana_provider/ │ │ │ ├── __init__.py │ │ │ └── kibana_provider.py │ │ ├── kubernetes_provider/ │ │ │ ├── __init__.py │ │ │ └── kubernetes_provider.py │ │ ├── libre_nms_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── libre_nms_provider.py │ │ ├── linear_provider/ │ │ │ ├── __init__.py │ │ │ └── linear_provider.py │ │ ├── linearb_provider/ │ │ │ ├── __init__.py │ │ │ └── linearb_provider.py │ │ ├── litellm_provider/ │ │ │ ├── __init__.py │ │ │ └── litellm_provider.py │ │ ├── llamacpp_provider/ │ │ │ ├── __init__.py │ │ │ └── llamacpp_provider.py │ │ ├── mailgun_provider/ │ │ │ ├── __init__.py │ │ │ └── mailgun_provider.py │ │ ├── mattermost_provider/ │ │ │ ├── __init__.py │ │ │ └── mattermost_provider.py │ │ ├── microsoft-planner-provider/ │ │ │ ├── __init__.py │ │ │ └── microsoft-planner-provider.py │ │ ├── mock_provider/ │ │ │ ├── __init__.py │ │ │ └── mock_provider.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── provider_config.py │ │ │ └── provider_method.py │ │ ├── monday_provider/ │ │ │ ├── __init__.py │ │ │ └── monday_provider.py │ │ ├── mongodb_provider/ │ │ │ ├── __init__.py │ │ │ └── mongodb_provider.py │ │ ├── mysql_provider/ │ │ │ ├── __init__.py │ │ │ └── mysql_provider.py │ │ 
├── netbox_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── netbox_provider.py │ │ ├── netdata_provider/ │ │ │ ├── __init__.py │ │ │ └── netdata_provider.py │ │ ├── netxms_provider/ │ │ │ ├── __init__.py │ │ │ └── netxms_provider.py │ │ ├── newrelic_provider/ │ │ │ ├── __init__.py │ │ │ └── newrelic_provider.py │ │ ├── ntfy_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── docker-compose.yml │ │ │ ├── ntfy_provider.py │ │ │ └── server.yml │ │ ├── ollama_provider/ │ │ │ ├── __init__.py │ │ │ └── ollama_provider.py │ │ ├── openai_provider/ │ │ │ ├── __init__.py │ │ │ └── openai_provider.py │ │ ├── openobserve_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerttemplate.json │ │ │ └── openobserve_provider.py │ │ ├── opensearchserverless_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── opensearchserverless_provider.py │ │ ├── openshift_provider/ │ │ │ ├── __init__.py │ │ │ └── openshift_provider.py │ │ ├── opsgenie_provider/ │ │ │ ├── __init__.py │ │ │ └── opsgenie_provider.py │ │ ├── pagerduty_provider/ │ │ │ ├── __init__.py │ │ │ └── pagerduty_provider.py │ │ ├── pagertree_provider/ │ │ │ ├── __init__.py │ │ │ └── pagertree_provider.py │ │ ├── parseable_provider/ │ │ │ ├── __init__.py │ │ │ └── parseable_provider.py │ │ ├── pingdom_provider/ │ │ │ ├── __init__.py │ │ │ └── pingdom_provider.py │ │ ├── planner_provider/ │ │ │ ├── __init__.py │ │ │ └── planner_provider.py │ │ ├── postgres_provider/ │ │ │ ├── __init__.py │ │ │ └── postgres_provider.py │ │ ├── posthog_provider/ │ │ │ ├── __init__.py │ │ │ └── posthog_provider.py │ │ ├── prometheus_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── prometheus_provider.py │ │ ├── providers_factory.py │ │ ├── providers_service.py │ │ ├── pushover_provider/ │ │ │ ├── __init__.py │ │ │ └── pushover_provider.py │ │ ├── python_provider/ │ │ │ ├── __init__.py │ │ │ └── python_provider.py │ │ ├── quickchart_provider/ │ │ │ ├── __init__.py │ │ │ └── 
quickchart_provider.py │ │ ├── redmine_provider/ │ │ │ ├── __init__.py │ │ │ └── redmine_provider.py │ │ ├── resend_provider/ │ │ │ ├── __init__.py │ │ │ └── resend_provider.py │ │ ├── rollbar_provider/ │ │ │ └── rollbar_provider.py │ │ ├── s3_provider/ │ │ │ ├── __init__.py │ │ │ └── s3_provider.py │ │ ├── salesforce_provider/ │ │ │ ├── __init__.py │ │ │ └── salesforce_provider.py │ │ ├── sendgrid_provider/ │ │ │ ├── __init__.py │ │ │ └── sendgrid_provider.py │ │ ├── sentry_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── sentry_provider.py │ │ ├── servicenow_provider/ │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ └── servicenow_provider.py │ │ ├── signalfx_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── signalfx_provider.py │ │ ├── signl4_provider/ │ │ │ ├── __init__.py │ │ │ └── signl4_provider.py │ │ ├── site24x7_provider/ │ │ │ ├── __init__.py │ │ │ └── site24x7_provider.py │ │ ├── slack_provider/ │ │ │ ├── __init__.py │ │ │ └── slack_provider.py │ │ ├── smtp_provider/ │ │ │ ├── __init__.py │ │ │ └── smtp_provider.py │ │ ├── snowflake_provider/ │ │ │ ├── __init__.py │ │ │ └── snowflake_provider.py │ │ ├── splunk_provider/ │ │ │ ├── __init__.py │ │ │ └── splunk_provider.py │ │ ├── squadcast_provider/ │ │ │ ├── __init__.py │ │ │ └── squadcast_provider.py │ │ ├── ssh_provider/ │ │ │ ├── __init__.py │ │ │ └── ssh_provider.py │ │ ├── statuscake_provider/ │ │ │ ├── __init__.py │ │ │ └── statuscake_provider.py │ │ ├── sumologic_provider/ │ │ │ ├── __init__.py │ │ │ ├── connection_template.json │ │ │ └── sumologic_provider.py │ │ ├── teams_provider/ │ │ │ ├── __init__.py │ │ │ └── teams_provider.py │ │ ├── telegram_provider/ │ │ │ ├── __init__.py │ │ │ └── telegram_provider.py │ │ ├── thousandeyes_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ └── thousandeyes_provider.py │ │ ├── trello_provider/ │ │ │ ├── __init__.py │ │ │ └── trello_provider.py │ │ ├── twilio_provider/ │ │ │ └── twilio_provider.py │ │ ├── 
uptimekuma_provider/ │ │ │ ├── __init__.py │ │ │ └── uptimekuma_provider.py │ │ ├── vectordev_provider/ │ │ │ ├── __init__.py │ │ │ └── vectordev_provider.py │ │ ├── victorialogs_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── victorialogs_provider.py │ │ ├── victoriametrics_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── victoriametrics_provider.py │ │ ├── vllm_provider/ │ │ │ ├── __init__.py │ │ │ └── vllm_provider.py │ │ ├── wazuh_provider/ │ │ │ ├── __init__.py │ │ │ ├── alerts_mock.py │ │ │ ├── custom-keep │ │ │ ├── custom-keep.py │ │ │ └── wazuh_provider.py │ │ ├── webhook_provider/ │ │ │ ├── __init__.py │ │ │ └── webhook_provider.py │ │ ├── websocket_provider/ │ │ │ ├── __init__.py │ │ │ └── websocket_provider.py │ │ ├── youtrack_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── youtrack_provider.py │ │ ├── zabbix_provider/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── zabbix_provider.py │ │ │ └── zabbix_provider_script.js │ │ ├── zendesk_provider/ │ │ │ ├── __init__.py │ │ │ └── zendesk_provider.py │ │ ├── zenduty_provider/ │ │ │ ├── __init__.py │ │ │ └── zenduty_provider.py │ │ ├── zoom_chat_provider/ │ │ │ ├── __init__.py │ │ │ └── zoom_chat_provider.py │ │ └── zoom_provider/ │ │ ├── __init__.py │ │ └── zoom_provider.py │ ├── rulesengine/ │ │ ├── __init__.py │ │ └── rulesengine.py │ ├── searchengine/ │ │ └── searchengine.py │ ├── secretmanager/ │ │ ├── __init__.py │ │ ├── awssecretmanager.py │ │ ├── dbsecretmanager.py │ │ ├── filesecretmanager.py │ │ ├── gcpsecretmanager.py │ │ ├── kubernetessecretmanager.py │ │ ├── secretmanager.py │ │ ├── secretmanagerfactory.py │ │ └── vaultsecretmanager.py │ ├── server_jobs_bg.py │ ├── step/ │ │ ├── __init__.py │ │ ├── step.py │ │ └── step_provider_parameter.py │ ├── throttles/ │ │ ├── base_throttle.py │ │ ├── one_until_resolved_throttle.py │ │ └── throttle_factory.py │ ├── topologies/ │ │ ├── topologies_service.py │ │ └── topology_processor.py │ ├── validation/ │ │ ├── 
__init__.py │ │ └── fields.py │ └── workflowmanager/ │ ├── __init__.py │ ├── workflow.py │ ├── workflowmanager.py │ ├── workflowscheduler.py │ └── workflowstore.py ├── keep-ui/ │ ├── .dockerignore │ ├── .eslintignore │ ├── .eslintrc.json │ ├── .gitignore │ ├── .prettierrc │ ├── README.md │ ├── __mocks__/ │ │ ├── @monaco-editor/ │ │ │ └── react.js │ │ └── monaco-editor.js │ ├── app/ │ │ ├── (health)/ │ │ │ ├── health/ │ │ │ │ ├── check.tsx │ │ │ │ ├── modal.tsx │ │ │ │ └── page.tsx │ │ │ └── layout.tsx │ │ ├── (keep)/ │ │ │ ├── [...not-found]/ │ │ │ │ └── page.tsx │ │ │ ├── ai/ │ │ │ │ ├── ai-plugins.tsx │ │ │ │ ├── model.ts │ │ │ │ └── page.tsx │ │ │ ├── alerts/ │ │ │ │ └── [id]/ │ │ │ │ ├── page.tsx │ │ │ │ └── ui/ │ │ │ │ ├── __tests__/ │ │ │ │ │ └── alerts-fingerprint.test.tsx │ │ │ │ ├── alert-table-alert-facets.tsx │ │ │ │ ├── alert-table-facet-dynamic.tsx │ │ │ │ ├── alert-table-facet-types.tsx │ │ │ │ ├── alert-table-facet-utils.tsx │ │ │ │ ├── alert-table-facet-value.tsx │ │ │ │ ├── alert-table-facet.tsx │ │ │ │ ├── alert-table-tab-panel-server-side.tsx │ │ │ │ └── alerts.tsx │ │ │ ├── dashboard/ │ │ │ │ ├── GridItem.tsx │ │ │ │ ├── GridItemContainer.tsx │ │ │ │ ├── GridLayout.tsx │ │ │ │ ├── MenuButton.tsx │ │ │ │ ├── WidgetModal.tsx │ │ │ │ ├── [id]/ │ │ │ │ │ ├── dashboard.tsx │ │ │ │ │ └── page.tsx │ │ │ │ ├── alert-quality-table.tsx │ │ │ │ ├── styles.css │ │ │ │ ├── types.tsx │ │ │ │ └── widget-types/ │ │ │ │ ├── generic-metrics/ │ │ │ │ │ ├── generic-metrics-grid-item.tsx │ │ │ │ │ └── generic-metrics-widget-form.tsx │ │ │ │ ├── metric/ │ │ │ │ │ ├── metric-grid-item.tsx │ │ │ │ │ └── metric-widget-form.tsx │ │ │ │ └── preset/ │ │ │ │ ├── columns-selection.tsx │ │ │ │ ├── constants.ts │ │ │ │ ├── preset-grid-item.tsx │ │ │ │ ├── preset-widget-form.tsx │ │ │ │ ├── widget-alert-count-panel.tsx │ │ │ │ └── widget-alerts-table.tsx │ │ │ ├── deduplication/ │ │ │ │ ├── DeduplicationPlaceholder.tsx │ │ │ │ ├── DeduplicationSidebar.tsx │ │ │ │ ├── 
DeduplicationTable.tsx │ │ │ │ ├── client.tsx │ │ │ │ ├── models.tsx │ │ │ │ └── page.tsx │ │ │ ├── error.ts │ │ │ ├── extraction/ │ │ │ │ ├── [rule_id]/ │ │ │ │ │ └── executions/ │ │ │ │ │ ├── [execution_id]/ │ │ │ │ │ │ └── page.tsx │ │ │ │ │ └── page.tsx │ │ │ │ ├── create-or-update-extraction-rule.tsx │ │ │ │ ├── extraction.tsx │ │ │ │ ├── extractions-table.tsx │ │ │ │ ├── layout.tsx │ │ │ │ ├── model.ts │ │ │ │ ├── page.tsx │ │ │ │ └── run-extraction-modal.tsx │ │ │ ├── incidents/ │ │ │ │ ├── [id]/ │ │ │ │ │ ├── activity/ │ │ │ │ │ │ ├── incident-activity.css │ │ │ │ │ │ ├── incident-activity.tsx │ │ │ │ │ │ ├── lib/ │ │ │ │ │ │ │ └── extractTaggedUsers.ts │ │ │ │ │ │ ├── page.tsx │ │ │ │ │ │ └── ui/ │ │ │ │ │ │ ├── IncidentActivityComment.tsx │ │ │ │ │ │ ├── IncidentActivityItem.tsx │ │ │ │ │ │ ├── IncidentCommentInput.dynamic.tsx │ │ │ │ │ │ ├── IncidentCommentInput.scss │ │ │ │ │ │ └── IncidentCommentInput.tsx │ │ │ │ │ ├── alerts/ │ │ │ │ │ │ ├── ALERT_SIDEBAR_INTEGRATION.md │ │ │ │ │ │ ├── __tests__/ │ │ │ │ │ │ │ ├── incident-alerts-sidebar.test.tsx │ │ │ │ │ │ │ └── incident-alerts.test.tsx │ │ │ │ │ │ ├── incident-alert-action-tray.tsx │ │ │ │ │ │ ├── incident-alert-actions.tsx │ │ │ │ │ │ ├── incident-alert-table-body-skeleton.tsx │ │ │ │ │ │ ├── incident-alerts.tsx │ │ │ │ │ │ └── page.tsx │ │ │ │ │ ├── chat/ │ │ │ │ │ │ ├── incident-chat.css │ │ │ │ │ │ ├── incident-chat.tsx │ │ │ │ │ │ └── page.client.tsx │ │ │ │ │ ├── create-ticket-modal.tsx │ │ │ │ │ ├── enrichments/ │ │ │ │ │ │ ├── EnrichmentEditableField.tsx │ │ │ │ │ │ └── EnrichmentEditableForm.tsx │ │ │ │ │ ├── getIncidentWithErrorHandling.tsx │ │ │ │ │ ├── incident-header-skeleton.tsx │ │ │ │ │ ├── incident-header.tsx │ │ │ │ │ ├── incident-layout-client.tsx │ │ │ │ │ ├── incident-overview.tsx │ │ │ │ │ ├── incident-tabs-navigation.tsx │ │ │ │ │ ├── layout.tsx │ │ │ │ │ ├── link-ticket-modal.tsx │ │ │ │ │ ├── not-found.tsx │ │ │ │ │ ├── route.tsx │ │ │ │ │ ├── ticketing-incident-options.tsx 
│ │ │ │ │ ├── timeline/ │ │ │ │ │ │ ├── incident-timeline.tsx │ │ │ │ │ │ └── page.tsx │ │ │ │ │ ├── topology/ │ │ │ │ │ │ └── page.tsx │ │ │ │ │ └── workflows/ │ │ │ │ │ ├── incident-workflow-empty.tsx │ │ │ │ │ ├── incident-workflow-sidebar.tsx │ │ │ │ │ ├── incident-workflow-table.tsx │ │ │ │ │ └── page.tsx │ │ │ │ ├── incident-overview-skeleton.tsx │ │ │ │ ├── layout.tsx │ │ │ │ ├── page.tsx │ │ │ │ └── predicted-incidents-table.tsx │ │ │ ├── layout.tsx │ │ │ ├── loading.tsx │ │ │ ├── maintenance/ │ │ │ │ ├── create-or-update-maintenance-rule.tsx │ │ │ │ ├── layout.tsx │ │ │ │ ├── maintenance-rules-table.tsx │ │ │ │ ├── maintenance.tsx │ │ │ │ ├── model.ts │ │ │ │ └── page.tsx │ │ │ ├── mapping/ │ │ │ │ ├── [rule_id]/ │ │ │ │ │ └── executions/ │ │ │ │ │ ├── [execution_id]/ │ │ │ │ │ │ └── page.tsx │ │ │ │ │ └── page.tsx │ │ │ │ ├── create-or-edit-mapping.tsx │ │ │ │ ├── layout.tsx │ │ │ │ ├── mapping.tsx │ │ │ │ ├── models.tsx │ │ │ │ ├── page.tsx │ │ │ │ ├── rules-table.tsx │ │ │ │ └── run-mapping-modal.tsx │ │ │ ├── not-found.tsx │ │ │ ├── notifications-hub/ │ │ │ │ ├── layout.tsx │ │ │ │ └── page.tsx │ │ │ ├── page.tsx │ │ │ ├── providers/ │ │ │ │ ├── components/ │ │ │ │ │ ├── providers-categories/ │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ └── providers-categories.tsx │ │ │ │ │ ├── providers-filter-by-label/ │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ └── providers-filter-by-label.tsx │ │ │ │ │ └── providers-search/ │ │ │ │ │ ├── index.ts │ │ │ │ │ └── providers-search.tsx │ │ │ │ ├── filter-context/ │ │ │ │ │ ├── constants.ts │ │ │ │ │ ├── filter-context.tsx │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── types.ts │ │ │ │ │ └── use-filter-context.ts │ │ │ │ ├── form-fields.tsx │ │ │ │ ├── form-validation.ts │ │ │ │ ├── layout.tsx │ │ │ │ ├── oauth2/ │ │ │ │ │ └── [providerType]/ │ │ │ │ │ └── page.tsx │ │ │ │ ├── page.client.tsx │ │ │ │ ├── page.tsx │ │ │ │ ├── provider-form-scopes.css │ │ │ │ ├── provider-form-scopes.tsx │ │ │ │ ├── provider-form.css │ │ │ │ ├── 
provider-form.tsx │ │ │ │ ├── provider-logs.tsx │ │ │ │ ├── provider-semi-automated.tsx │ │ │ │ ├── provider-tile.css │ │ │ │ ├── provider-tile.tsx │ │ │ │ ├── providers-tiles.tsx │ │ │ │ └── providers.css │ │ │ ├── rules/ │ │ │ │ ├── CorrelationPlaceholder.tsx │ │ │ │ ├── CorrelationSidebar/ │ │ │ │ │ ├── AlertsFoundBadge.tsx │ │ │ │ │ ├── CorrelationForm.tsx │ │ │ │ │ ├── CorrelationGroups.tsx │ │ │ │ │ ├── CorrelationSidebarBody.tsx │ │ │ │ │ ├── CorrelationSidebarHeader.tsx │ │ │ │ │ ├── CorrelationSubmission.tsx │ │ │ │ │ ├── DeleteRule.tsx │ │ │ │ │ ├── RuleFields.tsx │ │ │ │ │ ├── RuleGroup.tsx │ │ │ │ │ ├── convert-cel-ast-to-query-builder-ast/ │ │ │ │ │ │ ├── convert-cel-ast-to-query-builder-ast.function.test.ts │ │ │ │ │ │ └── convert-cel-ast-to-query-builder-ast.function.ts │ │ │ │ │ ├── index.tsx │ │ │ │ │ ├── timeframe-constants.ts │ │ │ │ │ ├── types.ts │ │ │ │ │ └── useMatchingAlerts.ts │ │ │ │ ├── CorrelationTable.tsx │ │ │ │ ├── GroupedByCel.tsx │ │ │ │ ├── client.tsx │ │ │ │ ├── flatten-cel-ast.ts │ │ │ │ ├── page.tsx │ │ │ │ └── ui/ │ │ │ │ └── PlaceholderSankey.tsx │ │ │ ├── settings/ │ │ │ │ ├── auth/ │ │ │ │ │ ├── api-key-settings.tsx │ │ │ │ │ ├── api-key-tab.tsx │ │ │ │ │ ├── api-key-table.tsx │ │ │ │ │ ├── groups-sidebar.tsx │ │ │ │ │ ├── groups-tab.tsx │ │ │ │ │ ├── groups-table.tsx │ │ │ │ │ ├── multiselect.css │ │ │ │ │ ├── permissions-sidebar.tsx │ │ │ │ │ ├── permissions-tab.tsx │ │ │ │ │ ├── permissions-table.tsx │ │ │ │ │ ├── roles-sidebar.tsx │ │ │ │ │ ├── roles-tab.tsx │ │ │ │ │ ├── roles-table.tsx │ │ │ │ │ ├── sso-settings.tsx │ │ │ │ │ ├── sso-tab.tsx │ │ │ │ │ ├── types.ts │ │ │ │ │ ├── users-settings.tsx │ │ │ │ │ ├── users-sidebar.tsx │ │ │ │ │ ├── users-tab.tsx │ │ │ │ │ └── users-table.tsx │ │ │ │ ├── create-api-key-modal.tsx │ │ │ │ ├── layout.tsx │ │ │ │ ├── models.tsx │ │ │ │ ├── page.tsx │ │ │ │ ├── provider-images/ │ │ │ │ │ ├── page.tsx │ │ │ │ │ ├── provider-image-list.tsx │ │ │ │ │ ├── provider-image-uploader.tsx │ 
│ │ │ │ └── provider-images-settings.tsx │ │ │ │ ├── settings.client.tsx │ │ │ │ ├── smtp-settings.tsx │ │ │ │ └── webhook-settings.tsx │ │ │ ├── topology/ │ │ │ │ ├── TopologySearchContext.tsx │ │ │ │ ├── api/ │ │ │ │ │ └── index.ts │ │ │ │ ├── layout.tsx │ │ │ │ ├── lib/ │ │ │ │ │ └── badge-colors.ts │ │ │ │ ├── model/ │ │ │ │ │ ├── TopologyPollingContext.tsx │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── models.ts │ │ │ │ │ ├── useTopology.ts │ │ │ │ │ └── useTopologyApplications.ts │ │ │ │ ├── page.tsx │ │ │ │ ├── topology-client.tsx │ │ │ │ └── ui/ │ │ │ │ ├── TopologySearchAutocomplete.tsx │ │ │ │ ├── applications/ │ │ │ │ │ ├── application-card.tsx │ │ │ │ │ ├── application-modal.tsx │ │ │ │ │ ├── applications-list.tsx │ │ │ │ │ └── create-or-update-application-form.tsx │ │ │ │ └── map/ │ │ │ │ ├── AddEditNodeSidePanel.tsx │ │ │ │ ├── application-node.tsx │ │ │ │ ├── getLayoutedElements.ts │ │ │ │ ├── getNodesAndEdgesFromTopologyData.ts │ │ │ │ ├── index.tsx │ │ │ │ ├── manage-selection.tsx │ │ │ │ ├── service-node.tsx │ │ │ │ ├── styles.tsx │ │ │ │ ├── topology-map.tsx │ │ │ │ └── topology.css │ │ │ └── workflows/ │ │ │ ├── [workflow_id]/ │ │ │ │ ├── layout.tsx │ │ │ │ ├── page.tsx │ │ │ │ ├── runs/ │ │ │ │ │ └── [workflow_execution_id]/ │ │ │ │ │ └── page.tsx │ │ │ │ ├── table-filters.tsx │ │ │ │ ├── versions/ │ │ │ │ │ └── [revision]/ │ │ │ │ │ └── page.tsx │ │ │ │ ├── workflow-breadcrumbs.tsx │ │ │ │ ├── workflow-detail-header.tsx │ │ │ │ ├── workflow-detail-page.tsx │ │ │ │ ├── workflow-executions-table.tsx │ │ │ │ ├── workflow-overview-skeleton.tsx │ │ │ │ ├── workflow-overview.tsx │ │ │ │ ├── workflow-providers.tsx │ │ │ │ ├── workflow-secrets.tsx │ │ │ │ ├── workflow-sync-status.tsx │ │ │ │ └── workflow-versions.tsx │ │ │ ├── __tests__/ │ │ │ │ └── existing-workflows-state.test.tsx │ │ │ ├── builder/ │ │ │ │ ├── [workflowId]/ │ │ │ │ │ └── page.tsx │ │ │ │ ├── layout.tsx │ │ │ │ └── page.tsx │ │ │ ├── create-workflow-modal.tsx │ │ │ ├── 
existing-workflows-state.tsx │ │ │ ├── no-workflows-state.tsx │ │ │ ├── noworkflows.tsx │ │ │ ├── page.tsx │ │ │ ├── preview/ │ │ │ │ ├── [workflowId]/ │ │ │ │ │ └── page.tsx │ │ │ │ └── page.tsx │ │ │ ├── upload-workflows-modal.tsx │ │ │ ├── workflow-graph.tsx │ │ │ ├── workflow-menu.tsx │ │ │ ├── workflow-templates/ │ │ │ │ ├── index.ts │ │ │ │ ├── workflow-template-card.tsx │ │ │ │ └── workflow-templates.tsx │ │ │ ├── workflow-tile.css │ │ │ ├── workflow-tile.tsx │ │ │ ├── workflow-utils.ts │ │ │ ├── workflows-steps.tsx │ │ │ └── workflows.page.tsx │ │ ├── (signin)/ │ │ │ ├── error/ │ │ │ │ ├── authEnvUtils.tsx │ │ │ │ ├── error-client.tsx │ │ │ │ └── page.tsx │ │ │ ├── layout.tsx │ │ │ ├── mobile/ │ │ │ │ ├── GithubButton.tsx │ │ │ │ └── page.tsx │ │ │ └── signin/ │ │ │ ├── SignInForm.tsx │ │ │ └── page.tsx │ │ ├── actions/ │ │ │ └── authactions.ts │ │ ├── api/ │ │ │ ├── auth/ │ │ │ │ └── [...nextauth]/ │ │ │ │ └── route.ts │ │ │ ├── aws-marketplace/ │ │ │ │ └── route.ts │ │ │ ├── copilotkit/ │ │ │ │ └── route.ts │ │ │ └── healthcheck/ │ │ │ └── route.ts │ │ ├── auth-provider.tsx │ │ ├── config-provider.tsx │ │ ├── global-error.tsx │ │ ├── globals.css │ │ ├── not-authorized.tsx │ │ ├── posthog-provider.tsx │ │ └── raw/ │ │ └── workflows/ │ │ └── [workflow_filename]/ │ │ └── route.ts │ ├── auth.config.ts │ ├── auth.ts │ ├── components/ │ │ ├── LinkWithIcon.tsx │ │ ├── LogViewer.tsx │ │ ├── SidePanel.tsx │ │ ├── banners/ │ │ │ ├── BannerBase.tsx │ │ │ ├── health-page-banner.tsx │ │ │ └── read-only-banner.tsx │ │ ├── filters/ │ │ │ └── GenericFilters.tsx │ │ ├── icons/ │ │ │ └── index.tsx │ │ ├── navbar/ │ │ │ ├── AILink.tsx │ │ │ ├── AlertsLinks.tsx │ │ │ ├── DashboardLink.tsx │ │ │ ├── DashboardLinks.tsx │ │ │ ├── IncidentLinks.tsx │ │ │ ├── Menu.tsx │ │ │ ├── MinimizeMenuButton.tsx │ │ │ ├── Navbar.css │ │ │ ├── Navbar.tsx │ │ │ ├── NoiseReductionLinks.tsx │ │ │ ├── Search.tsx │ │ │ ├── SetSentryUser.tsx │ │ │ ├── UserAvatar.tsx │ │ │ └── UserInfo.tsx │ │ ├── 
popover/ │ │ │ └── GenericPopover.tsx │ │ ├── table/ │ │ │ ├── ExecutionsTable.tsx │ │ │ ├── GenericTable.tsx │ │ │ └── Pagination.tsx │ │ └── ui/ │ │ ├── AutocompleteInput.tsx │ │ ├── Button.tsx │ │ ├── Calendar.scss │ │ ├── Calendar.tsx │ │ ├── CreatableMultiSelect.tsx │ │ ├── DateRangePicker.tsx │ │ ├── DateRangePickerV2.tsx │ │ ├── DynamicProviderIcon.tsx │ │ ├── EmptyStateImage.tsx │ │ ├── EmptyStateTable.tsx │ │ ├── ImagePreviewTooltip.tsx │ │ ├── Link.tsx │ │ ├── Modal.tsx │ │ ├── ResizableColumns.tsx │ │ ├── RootCauseAnalysis.tsx │ │ ├── ShortNumber.tsx │ │ ├── TextInput.tsx │ │ ├── Textarea.tsx │ │ ├── index.ts │ │ └── useTimeframeState.ts │ ├── docs/ │ │ └── incident-alerts/ │ │ ├── ALERT_SIDEBAR_INTEGRATION.md │ │ └── CI_CD_FIXES.md │ ├── entities/ │ │ ├── alerts/ │ │ │ ├── lib/ │ │ │ │ └── getTabsFromPreset.ts │ │ │ ├── model/ │ │ │ │ ├── constants.ts │ │ │ │ ├── index.ts │ │ │ │ ├── types.ts │ │ │ │ ├── useAlertRowStyle.ts │ │ │ │ ├── useAlertTableTheme.ts │ │ │ │ ├── useAlerts.ts │ │ │ │ ├── useAvailableAlertFields.ts │ │ │ │ └── useSeverityMapping.ts │ │ │ └── ui/ │ │ │ ├── AlertImage/ │ │ │ │ └── AlertImage.tsx │ │ │ ├── AlertName/ │ │ │ │ └── AlertName.tsx │ │ │ ├── alert-severity.tsx │ │ │ └── index.ts │ │ ├── incidents/ │ │ │ ├── api/ │ │ │ │ ├── incidents.ts │ │ │ │ └── index.ts │ │ │ ├── lib/ │ │ │ │ ├── __tests__/ │ │ │ │ │ └── ticketing-utils.test.ts │ │ │ │ ├── ticketing-utils.ts │ │ │ │ └── utils.ts │ │ │ ├── model/ │ │ │ │ ├── index.ts │ │ │ │ ├── models.ts │ │ │ │ └── useIncidentActions.tsx │ │ │ └── ui/ │ │ │ ├── IncidentIconName/ │ │ │ │ ├── IncidentIconName.tsx │ │ │ │ └── index.ts │ │ │ ├── IncidentSeverityBadge.tsx │ │ │ ├── index.ts │ │ │ └── statuses.tsx │ │ ├── presets/ │ │ │ └── model/ │ │ │ ├── constants.ts │ │ │ ├── index.ts │ │ │ ├── types.ts │ │ │ ├── usePresetActions.ts │ │ │ ├── usePresetColumnConfig.ts │ │ │ ├── usePresetColumnState.ts │ │ │ ├── usePresetPolling.ts │ │ │ ├── usePresets.ts │ │ │ └── useSilencedPresets.ts │ 
│ ├── provider-images/ │ │ │ └── model/ │ │ │ └── useProviderImages.ts │ │ ├── providers/ │ │ │ └── model/ │ │ │ └── __mocks__/ │ │ │ └── provider-mocks.ts │ │ ├── users/ │ │ │ ├── model/ │ │ │ │ ├── useUser.ts │ │ │ │ └── useUsers.ts │ │ │ └── ui/ │ │ │ ├── UserStatefulAvatar.tsx │ │ │ └── index.ts │ │ ├── workflow-executions/ │ │ │ └── model/ │ │ │ ├── __tests__/ │ │ │ │ └── useWorkflowExecutionsV2.test.tsx │ │ │ ├── index.ts │ │ │ ├── useWorkflowExecutionDetail.ts │ │ │ ├── useWorkflowExecutions.ts │ │ │ ├── useWorkflowExecutionsRevalidation.ts │ │ │ ├── useWorkflowExecutionsV2.ts │ │ │ └── workflowExecutionsKeys.ts │ │ └── workflows/ │ │ ├── index.ts │ │ ├── lib/ │ │ │ ├── __tests__/ │ │ │ │ ├── extractWorkflowYamlDependencies.test.ts │ │ │ │ ├── getCurrentPath.test.ts │ │ │ │ ├── mustache.test.ts │ │ │ │ ├── parseWorkflowYamlToJSON.test.ts │ │ │ │ ├── parser.test.ts │ │ │ │ ├── validate-mustache-ui-builder.test.ts │ │ │ │ ├── validate-mustache-yaml.test.ts │ │ │ │ ├── validation.test.ts │ │ │ │ └── yaml-utils.test.ts │ │ │ ├── extractWorkflowYamlDependencies.ts │ │ │ ├── generateWorkflowYamlJsonSchema.ts │ │ │ ├── getHumanReadableInterval.ts │ │ │ ├── getLayoutedWorkflowElements.ts │ │ │ ├── getTriggerDescription.ts │ │ │ ├── mustache.ts │ │ │ ├── parser.ts │ │ │ ├── ui-utils.tsx │ │ │ ├── use-query-workflow-template.ts │ │ │ ├── useWorkflowJsonSchema.ts │ │ │ ├── useWorkflowZodSchema.ts │ │ │ ├── validate-definition.ts │ │ │ ├── validate-mustache-ui-builder.ts │ │ │ ├── validate-mustache-yaml.ts │ │ │ └── yaml-utils.ts │ │ ├── model/ │ │ │ ├── __mocks__/ │ │ │ │ └── mock-workflow.ts │ │ │ ├── __tests__/ │ │ │ │ ├── types.test.ts │ │ │ │ ├── useWorkflowActions.test.ts │ │ │ │ ├── useWorkflowRevalidation.test.tsx │ │ │ │ ├── useWorkflowsV2.test.ts │ │ │ │ ├── workflow-store.test.tsx │ │ │ │ └── yaml.schema.test.ts │ │ │ ├── index.ts │ │ │ ├── schema.ts │ │ │ ├── types.ts │ │ │ ├── useWorkflowActions.ts │ │ │ ├── useWorkflowDetail.ts │ │ │ ├── 
useWorkflowRevalidation.ts │ │ │ ├── useWorkflowRevisions.ts │ │ │ ├── useWorkflows.ts │ │ │ ├── useWorkflowsV2.ts │ │ │ ├── workflow-store.ts │ │ │ ├── workflow-yaml-editor-store.ts │ │ │ ├── workflowKeys.ts │ │ │ ├── yaml.schema.ts │ │ │ └── yaml.types.ts │ │ └── ui/ │ │ ├── NodeTriggerIcon.tsx │ │ ├── TriggerIcon.tsx │ │ ├── WorkflowAlertIncidentDependenciesForm.tsx │ │ ├── WorkflowInputFields.tsx │ │ ├── WorkflowPermissionsBadge.tsx │ │ └── WorkflowTriggerBadge.tsx │ ├── entrypoint.sh │ ├── errors.ts │ ├── eslint.config.mjs │ ├── features/ │ │ ├── alerts/ │ │ │ ├── alert-assign-ticket/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-assign-ticket-modal.tsx │ │ │ ├── alert-associate-to-incident/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-associate-incident-modal.tsx │ │ │ ├── alert-call-provider-method/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── alert-method-modal.tsx │ │ │ │ └── alert-method-results-table.tsx │ │ │ ├── alert-change-status/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-change-status-modal.tsx │ │ │ ├── alert-create-incident-ai/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── alert-create-incident-ai-card.tsx │ │ │ │ └── alert-create-incident-ai-modal.tsx │ │ │ ├── alert-detail-sidebar/ │ │ │ │ ├── index.ts │ │ │ │ ├── lib/ │ │ │ │ │ └── alertSidebarFields.tsx │ │ │ │ └── ui/ │ │ │ │ ├── alert-sidebar-incidents.tsx │ │ │ │ ├── alert-sidebar.tsx │ │ │ │ └── alert-timeline.tsx │ │ │ ├── alert-error-event-process/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-error-event-modal.tsx │ │ │ ├── alert-history/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── alert-history-charts.tsx │ │ │ │ └── alert-history-modal.tsx │ │ │ ├── alert-menu/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-menu.tsx │ │ │ ├── alert-note/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-note-modal.tsx │ │ │ ├── change-alert-table-theme/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── AlertTableThemeSelection.tsx │ │ │ 
│ └── __tests__/ │ │ │ │ └── change-alert-table-theme.test.tsx │ │ │ ├── dismiss-alert/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── alert-dismiss-modal.css │ │ │ │ └── alert-dismiss-modal.tsx │ │ │ ├── enrich-alert/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── EnrichAlertSidePanel.tsx │ │ │ ├── severity-mapping/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── SeverityMappingFacet.tsx │ │ │ │ └── SeverityMappingSelection.tsx │ │ │ ├── simulate-alert/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── alert-push-alert-to-server-modal.tsx │ │ │ └── view-raw-alert/ │ │ │ ├── index.ts │ │ │ └── ui/ │ │ │ ├── ViewAlertModal.css │ │ │ └── ViewAlertModal.tsx │ │ ├── cel-input/ │ │ │ ├── __tests__/ │ │ │ │ └── use-cel-state.test.ts │ │ │ ├── cel-input.tsx │ │ │ └── use-cel-state.ts │ │ ├── filter/ │ │ │ ├── add-facet-modal-with-suggestions.tsx │ │ │ ├── add-facet-modal.tsx │ │ │ ├── api.ts │ │ │ ├── facet-panel-server-side.tsx │ │ │ ├── facet-value.tsx │ │ │ ├── facet.tsx │ │ │ ├── facets-panel.tsx │ │ │ ├── hooks.tsx │ │ │ ├── index.ts │ │ │ ├── models.tsx │ │ │ ├── pagination.tsx │ │ │ ├── search-input.tsx │ │ │ └── store/ │ │ │ ├── __tests__/ │ │ │ │ ├── facets-store.test.ts │ │ │ │ ├── use-initial-state-handler.test.ts │ │ │ │ ├── use-queries-handler.test.ts │ │ │ │ └── utils.test.ts │ │ │ ├── create-facets-store.ts │ │ │ ├── index.ts │ │ │ ├── use-facets-config.tsx │ │ │ ├── use-facets-loading-state-handler.ts │ │ │ ├── use-initial-state-handler.ts │ │ │ ├── use-queries-handler.ts │ │ │ ├── use-query-params/ │ │ │ │ ├── __tests__/ │ │ │ │ │ ├── split-facet-values.test.ts │ │ │ │ │ └── use-query-params.test.ts │ │ │ │ ├── split-facet-values.ts │ │ │ │ └── use-query-params.ts │ │ │ ├── use-store.tsx │ │ │ └── utils.ts │ │ ├── incidents/ │ │ │ ├── change-incident-severity/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── incident-change-severity-select.tsx │ │ │ │ └── incident-severity-select.tsx │ │ │ ├── change-incident-status/ │ │ │ │ ├── index.ts │ │ │ │ └── 
ui/ │ │ │ │ └── incident-change-status-select.tsx │ │ │ ├── create-or-update-incident/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── create-or-update-incident-form.tsx │ │ │ │ └── react-quill-override.css │ │ │ ├── incident-list/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── incident-dropdown-menu.tsx │ │ │ │ ├── incident-list-error.tsx │ │ │ │ ├── incident-list-placeholder.tsx │ │ │ │ ├── incident-list.tsx │ │ │ │ ├── incident-table-component.tsx │ │ │ │ ├── incident-table-filters-context.tsx │ │ │ │ ├── incidents-not-found.tsx │ │ │ │ ├── incidents-report/ │ │ │ │ │ ├── generate-report-modal.tsx │ │ │ │ │ ├── incident-severity-metric.tsx │ │ │ │ │ ├── incidents-report.tsx │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── models.ts │ │ │ │ │ ├── pie-chart.tsx │ │ │ │ │ └── use-report-data.ts │ │ │ │ ├── incidents-table.tsx │ │ │ │ └── useIncidentsTableData.tsx │ │ │ ├── merge-incidents/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ └── merge-incidents-modal.tsx │ │ │ ├── same-incidents-in-the-past/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── change-same-incident-in-the-past-form.tsx │ │ │ │ ├── following-incidents.tsx │ │ │ │ ├── index.ts │ │ │ │ └── same-incident-field.tsx │ │ │ └── split-incident-alerts/ │ │ │ ├── index.ts │ │ │ └── ui/ │ │ │ └── split-incident-alerts-modal.tsx │ │ ├── keyboard-shortcuts/ │ │ │ ├── index.ts │ │ │ └── useIsShiftKeyHeld.ts │ │ ├── presets/ │ │ │ ├── create-or-update-preset/ │ │ │ │ ├── index.ts │ │ │ │ └── ui/ │ │ │ │ ├── alerts-count-badge.tsx │ │ │ │ ├── create-or-update-preset-form.tsx │ │ │ │ └── preset-controls.tsx │ │ │ ├── custom-preset-links/ │ │ │ │ ├── index.ts │ │ │ │ ├── model/ │ │ │ │ │ └── usePresetAlertsCount.ts │ │ │ │ └── ui/ │ │ │ │ ├── CustomPresetAlertLink.css │ │ │ │ ├── CustomPresetAlertLinks.tsx │ │ │ │ └── PresetsNoise.tsx │ │ │ └── presets-manager/ │ │ │ ├── index.ts │ │ │ ├── lib/ │ │ │ │ └── eval-with-context.ts │ │ │ └── ui/ │ │ │ ├── __tests__/ │ │ │ │ ├── alert-preset-manager.test.tsx │ │ │ │ └── 
preset-navigation.test.ts │ │ │ ├── alert-preset-manager.tsx │ │ │ └── alerts-rules-builder.tsx │ │ ├── workflow-execution-results/ │ │ │ ├── index.ts │ │ │ ├── lib/ │ │ │ │ └── logs-utils.ts │ │ │ └── ui/ │ │ │ ├── WorkflowExecutionError.tsx │ │ │ ├── WorkflowExecutionLogs.tsx │ │ │ └── WorkflowExecutionResults.tsx │ │ └── workflows/ │ │ ├── ai-assistant/ │ │ │ ├── index.ts │ │ │ ├── lib/ │ │ │ │ ├── constants.ts │ │ │ │ └── utils.ts │ │ │ └── ui/ │ │ │ ├── AddStepUI.tsx │ │ │ ├── AddTriggerOrStepSkeleton.tsx │ │ │ ├── AddTriggerUI.tsx │ │ │ ├── StepPreview.tsx │ │ │ ├── SuggestionStatus.tsx │ │ │ ├── WorkflowBuilderChat.tsx │ │ │ ├── WorkflowBuilderChatSafe.tsx │ │ │ └── chat.css │ │ ├── builder/ │ │ │ ├── index.ts │ │ │ ├── lib/ │ │ │ │ └── utils.tsx │ │ │ └── ui/ │ │ │ ├── Editor/ │ │ │ │ ├── EditorField.tsx │ │ │ │ ├── ReactFlowEditor.tsx │ │ │ │ ├── StepEditor.tsx │ │ │ │ ├── StepTest.tsx │ │ │ │ ├── TriggerEditor.tsx │ │ │ │ └── WorkflowEditor.tsx │ │ │ ├── NodeMenu.tsx │ │ │ ├── ReactFlowBuilder.tsx │ │ │ ├── WorkflowEdge.tsx │ │ │ ├── WorkflowNode.tsx │ │ │ ├── WorkflowToolbox.tsx │ │ │ ├── __tests__/ │ │ │ │ └── ReactFlowBuilder.test.tsx │ │ │ └── workflow-status.tsx │ │ ├── edit-metadata/ │ │ │ ├── index.ts │ │ │ └── ui/ │ │ │ ├── edit-workflow-metadata-form.tsx │ │ │ └── workflow-metadata-modal.tsx │ │ ├── edit-workflow-metadata/ │ │ │ ├── index.ts │ │ │ └── ui/ │ │ │ └── edit-workflow-metadata-form.tsx │ │ ├── enable-disable/ │ │ │ ├── index.ts │ │ │ ├── model/ │ │ │ │ ├── index.ts │ │ │ │ └── useWorkflowToggle.ts │ │ │ └── ui/ │ │ │ └── WorkflowEnabledSwitch.tsx │ │ ├── manual-run-workflow/ │ │ │ ├── index.ts │ │ │ ├── model/ │ │ │ │ ├── WorkflowModalContext.tsx │ │ │ │ ├── types.ts │ │ │ │ └── useWorkflowRun.ts │ │ │ └── ui/ │ │ │ ├── WorkflowInputsForm.tsx │ │ │ ├── WorkflowUnsavedChangesForm.tsx │ │ │ ├── manual-run-workflow-modal.tsx │ │ │ └── workflow-run-with-alert-modal.tsx │ │ └── test-run/ │ │ ├── index.ts │ │ ├── model/ │ │ │ └── 
useWorkflowTestRun.ts │ │ └── ui/ │ │ └── workflow-test-run-button.tsx │ ├── instrumentation.ts │ ├── jest.config.ts │ ├── jest.setup.ts │ ├── middleware.ts │ ├── next-env.d.ts │ ├── next.config.js │ ├── next_build.sh │ ├── next_start.sh │ ├── package.json │ ├── postcss.config.js │ ├── proxyFetch.node.ts │ ├── proxyFetch.ts │ ├── scripts/ │ │ ├── build-monaco-workers-turbopack.js │ │ ├── generate-workflow-yaml-json-schema.ts │ │ └── validate-workflow-examples.ts │ ├── sentry.client.config.ts │ ├── sentry.edge.config.ts │ ├── sentry.server.config.ts │ ├── shared/ │ │ ├── api/ │ │ │ ├── ApiClient.ts │ │ │ ├── KeepApiError.ts │ │ │ ├── __tests__/ │ │ │ │ └── ApiClient.test.ts │ │ │ ├── enrichment-events.ts │ │ │ ├── index.ts │ │ │ ├── providers.ts │ │ │ ├── server/ │ │ │ │ ├── createServerApiClient.ts │ │ │ │ └── index.ts │ │ │ ├── workflow-executions.ts │ │ │ └── workflows.ts │ │ ├── constants.ts │ │ ├── lib/ │ │ │ ├── __tests__/ │ │ │ │ ├── getIconForStatusString.test.tsx │ │ │ │ ├── logs-utils.test.ts │ │ │ │ ├── oauth2proxy-auth.test.ts │ │ │ │ ├── object-utils.test.ts │ │ │ │ ├── provider-utils.test.ts │ │ │ │ ├── regex-utils.test.ts │ │ │ │ ├── severity-utils.test.ts │ │ │ │ └── status-utils.test.ts │ │ │ ├── capture.ts │ │ │ ├── downloadFileFromString.ts │ │ │ ├── encodings.ts │ │ │ ├── getApiUrlFromConfig.ts │ │ │ ├── hooks/ │ │ │ │ ├── __tests__/ │ │ │ │ │ └── useSignOut.test.ts │ │ │ │ ├── useApi.tsx │ │ │ │ ├── useHealth.ts │ │ │ │ ├── useHydratedSession.tsx │ │ │ │ ├── useMounted.tsx │ │ │ │ ├── useSetSentryUser.ts │ │ │ │ └── useSignOut.ts │ │ │ ├── logs-utils.ts │ │ │ ├── oauth2proxy-auth.ts │ │ │ ├── object-utils.ts │ │ │ ├── provider-utils.ts │ │ │ ├── regex-utils.ts │ │ │ ├── server/ │ │ │ │ └── getConfig.ts │ │ │ ├── state-utils.ts │ │ │ ├── status-utils.ts │ │ │ └── tremor-utils.ts │ │ ├── tests/ │ │ │ └── next-auth-mock.tsx │ │ └── ui/ │ │ ├── DateTimeField.tsx │ │ ├── DebugJSON/ │ │ │ ├── DebugJSON.tsx │ │ │ └── index.ts │ │ ├── Drawer/ │ │ │ ├── 
Drawer.tsx │ │ │ ├── TremorDrawer.tsx │ │ │ └── index.ts │ │ ├── DropdownMenu/ │ │ │ ├── DropdownMenu.css │ │ │ ├── DropdownMenu.tsx │ │ │ └── index.ts │ │ ├── EmptyState/ │ │ │ ├── EmptyStateCard.tsx │ │ │ └── index.ts │ │ ├── ErrorComponent/ │ │ │ ├── ErrorComponent.tsx │ │ │ └── index.ts │ │ ├── FieldHeader.tsx │ │ ├── FormattedContent/ │ │ │ └── FormattedContent.tsx │ │ ├── Input/ │ │ │ └── index.tsx │ │ ├── JsonCard/ │ │ │ ├── JsonCard.tsx │ │ │ └── index.ts │ │ ├── KeepLoader/ │ │ │ └── KeepLoader.tsx │ │ ├── KeepLogoError/ │ │ │ ├── KeepLogoError.tsx │ │ │ ├── index.ts │ │ │ └── logo-error.css │ │ ├── MarkdownHTML/ │ │ │ ├── MarkdownHTML.tsx │ │ │ └── index.ts │ │ ├── MonacoCELEditor/ │ │ │ ├── MonacoCel.tsx │ │ │ ├── MonacoCel.turbopack.tsx │ │ │ ├── cel-support.ts │ │ │ ├── editor.scss │ │ │ ├── handle-completions.ts │ │ │ ├── index.ts │ │ │ ├── monaco-cel-editor.tsx │ │ │ └── validation-hook.ts │ │ ├── MonacoEditor/ │ │ │ ├── MonacoEditorCDN.tsx │ │ │ ├── MonacoEditorNPM.tsx │ │ │ ├── index.ts │ │ │ └── index.turbopack.ts │ │ ├── MonacoYAMLEditor/ │ │ │ ├── MonacoYAMLEditor.types.ts │ │ │ ├── editor.client.tsx │ │ │ ├── editor.client.turbopack.tsx │ │ │ ├── index.ts │ │ │ └── index.turbopack.ts │ │ ├── PageSubtitle.tsx │ │ ├── PageTitle.tsx │ │ ├── PostHogPageView.tsx │ │ ├── ResizableColumns/ │ │ │ ├── index.ts │ │ │ └── ui/ │ │ │ └── ResizableColumns.tsx │ │ ├── Select/ │ │ │ ├── Select.tsx │ │ │ └── index.ts │ │ ├── SeverityBorderIcon/ │ │ │ ├── SeverityBorderIcon.tsx │ │ │ └── index.ts │ │ ├── SeverityLabel/ │ │ │ ├── SeverityLabel.tsx │ │ │ └── index.ts │ │ ├── TabLinkNavigation/ │ │ │ ├── TabLinkNavigation.tsx │ │ │ ├── TabNavigationLink.tsx │ │ │ └── index.tsx │ │ ├── TableIndeterminateCheckbox/ │ │ │ ├── TableIndeterminateCheckbox.tsx │ │ │ └── index.ts │ │ ├── TablePagination/ │ │ │ ├── TablePagination.tsx │ │ │ └── index.ts │ │ ├── TableSeverityCell/ │ │ │ ├── TableSeverityCell.tsx │ │ │ └── index.ts │ │ ├── Tooltip/ │ │ │ ├── Tooltip.tsx │ │ │ 
└── index.ts │ │ ├── TraceViewer/ │ │ │ ├── Trace.ts │ │ │ ├── TraceViewer.tsx │ │ │ └── index.ts │ │ ├── VerticalRoundedList/ │ │ │ ├── VerticalRoundedList.tsx │ │ │ ├── index.ts │ │ │ └── vertical-rounded-list.css │ │ ├── WorkflowYAMLEditor/ │ │ │ ├── index.ts │ │ │ ├── lib/ │ │ │ │ ├── useYamlValidation.ts │ │ │ │ └── utils.ts │ │ │ ├── model/ │ │ │ │ └── types.ts │ │ │ └── ui/ │ │ │ ├── WorkflowYAMLEditor.tsx │ │ │ ├── WorkflowYAMLEditorStandalone.tsx │ │ │ ├── WorkflowYAMLEditorToolbar.tsx │ │ │ ├── WorkflowYAMLValidationErrors.tsx │ │ │ └── WorkflowYamlEditorHeader.tsx │ │ ├── WorkflowYAMLEditorWithLogs/ │ │ │ ├── WorkflowYAMLEditorWithLogs.css │ │ │ ├── WorkflowYAMLEditorWithLogs.tsx │ │ │ └── index.tsx │ │ ├── index.ts │ │ ├── theme/ │ │ │ ├── ThemeControl.tsx │ │ │ ├── ThemeScript.tsx │ │ │ ├── WatchUpdateTheme.ts │ │ │ └── index.ts │ │ └── utils/ │ │ ├── favicon.ts │ │ ├── getIconForStatusString.tsx │ │ ├── severity-utils.ts │ │ ├── showErrorToast.tsx │ │ ├── showSuccessToast.tsx │ │ └── table-utils.ts │ ├── styles/ │ │ └── linear.scss │ ├── tailwind.config.js │ ├── tsconfig.json │ ├── tsconfig.scripts.json │ ├── types/ │ │ ├── auth.d.ts │ │ ├── internal-config.ts │ │ └── react-table.d.ts │ ├── utils/ │ │ ├── apiUrl.ts │ │ ├── authenticationType.ts │ │ ├── cel-ast.ts │ │ ├── fatigue.ts │ │ ├── helpers.ts │ │ ├── hooks/ │ │ │ ├── useAI.ts │ │ │ ├── useAlertPolling.ts │ │ │ ├── useAlertQuality.ts │ │ │ ├── useConfig.ts │ │ │ ├── useDashboardMetricWidgets.ts │ │ │ ├── useDashboardPresets.ts │ │ │ ├── useDashboards.ts │ │ │ ├── useDebouncedValue.ts │ │ │ ├── useDeduplicationRules.ts │ │ │ ├── useEnrichmentEvents.ts │ │ │ ├── useExpandedRows.ts │ │ │ ├── useExtractionRules.ts │ │ │ ├── useGroupExpansion.ts │ │ │ ├── useGroups.ts │ │ │ ├── useIncidents.ts │ │ │ ├── useLocalStorage.ts │ │ │ ├── useMaintenanceRules.ts │ │ │ ├── useMappingRules.ts │ │ │ ├── usePermissions.ts │ │ │ ├── useProviderLogs.ts │ │ │ ├── useProviders.ts │ │ │ ├── usePusher.ts │ │ │ ├── 
useRoles.ts │ │ │ ├── useRules.ts │ │ │ ├── useScopes.ts │ │ │ ├── useSearchAlerts.ts │ │ │ ├── useTags.ts │ │ │ ├── useTenantConfiguration.ts │ │ │ └── useWorkflowSecrets.ts │ │ ├── reactFlow.ts │ │ └── type-utils.ts │ └── widgets/ │ ├── alerts-table/ │ │ ├── lib/ │ │ │ ├── alert-table-list-format.tsx │ │ │ ├── alert-table-time-format.tsx │ │ │ └── alert-table-utils.tsx │ │ └── ui/ │ │ ├── ActionTraySelection.tsx │ │ ├── ColumnSelection.tsx │ │ ├── RowStyleSelection.tsx │ │ ├── SettingsSelection.tsx │ │ ├── TitleAndFilters.tsx │ │ ├── __tests__/ │ │ │ ├── alert-assignee.test.tsx │ │ │ ├── alert-grouped-row.test.tsx │ │ │ └── useAlertsTableData.test.ts │ │ ├── alert-actions.tsx │ │ ├── alert-assignee.tsx │ │ ├── alert-extra-payload.tsx │ │ ├── alert-grouped-row.tsx │ │ ├── alert-pagination.tsx │ │ ├── alert-table-column-rename.tsx │ │ ├── alert-table-headers.tsx │ │ ├── alert-table-server-side.tsx │ │ ├── alert-table.tsx │ │ ├── alerts-table-body.tsx │ │ └── useAlertsTableData.ts │ └── workflow-builder/ │ ├── __tests__/ │ │ ├── workflow-builder-widget.test.tsx │ │ └── workflow-builder.test.tsx │ ├── empty-builder-state.tsx │ ├── index.ts │ ├── workflow-builder-card.tsx │ ├── workflow-builder-widget-safe.tsx │ ├── workflow-builder-widget.tsx │ └── workflow-builder.tsx ├── keycloak/ │ ├── Dockerfile.keycloak │ ├── KEYCLOAK_LDAP.md │ ├── docker-compose.yaml │ ├── event_listeners/ │ │ └── last-login-event-listener-0.0.1-SNAPSHOT.jar │ ├── generate_ldap.py │ ├── javascript_providers/ │ │ ├── keep-abac-policy/ │ │ │ ├── META-INF/ │ │ │ │ └── keycloak-scripts.json │ │ │ └── keep-abac-policy.js │ │ └── keep-abac-policy.jar │ ├── jsons/ │ │ ├── group-claim.json │ │ ├── group-mapper.json │ │ └── tenant-ids-js-mapper.json │ ├── keep-realm.json │ ├── keycloak_entrypoint.sh │ ├── ldap.ldif │ ├── ldap_generated.ldif │ ├── ldif/ │ │ ├── ldap_orgs.ldif │ │ └── ldap_orgs_new.ldif │ ├── readme.md │ └── themes/ │ └── keep.jar ├── oauth2proxy/ │ ├── docker-compose-oauth2proxy.yml │ 
└── nginx.conf ├── otel-shared/ │ ├── alertmanager.yml │ ├── grafana-datasources.yaml │ ├── otel-collector-config.yaml │ ├── prometheus.yaml │ ├── tempo.yaml │ └── vector.toml ├── prometheus/ │ └── prometheus.yml ├── proxy/ │ ├── README.md │ ├── docker-compose-proxy.yml │ ├── nginx.conf │ └── squid.conf ├── pyproject.toml ├── render.yaml ├── scripts/ │ ├── docs_generate_api_docs_from_openapi.sh │ ├── docs_get_providers_list.py │ ├── docs_openapi_converter.py │ ├── docs_render_provider_snippets.py │ ├── docs_validate_navigation.sh │ ├── docs_validate_openapi_is_actual.sh │ ├── migrate_to_elastic.py │ ├── save_providers_list.py │ ├── shoot_alerts_from_dump.py │ ├── simulate_alerts.py │ ├── simulate_alerts.sh │ ├── simulate_rules.py │ └── workflow_yaml_generate_json_schema.sh ├── start.sh ├── templates/ │ └── CHANGELOG.md └── tests/ ├── Dockerfile.keycloak.test ├── __init__.py ├── cel_to_sql/ │ ├── cel-to-sql-test-cases.json │ ├── order-by-exp-test-cases.json │ ├── test_cel_to_ast.py │ ├── test_cel_to_sql.py │ └── test_order_by_exp.py ├── conftest.py ├── deduplication/ │ ├── test_deduplications.py │ └── test_deduplications_provisioning.py ├── docker-compose-elastic.yml ├── docker-compose-keycloak.yml ├── docker-compose-mysql.yml ├── e2e_tests/ │ ├── docker-compose-e2e-mysql.yml │ ├── docker-compose-e2e-postgres.yml │ ├── docker-compose-e2e-redis-sentinel-noauth.yml │ ├── docker-compose-e2e-redis.yml │ ├── docker-compose-e2e-sqlite.yml │ ├── docker-entrypoint-initdb.d/ │ │ └── update-postgresql-conf.sh │ ├── grafana.ini │ ├── incidents_alerts_tests/ │ │ ├── incidents_alerts_setup.py │ │ ├── test_filtering_sort_search_on_alerts.py │ │ ├── test_filtering_sort_search_on_incidents.py │ │ ├── test_mentions_in_incident_comments.py │ │ └── test_xss_protection.py │ ├── postgres-custom.conf │ ├── test_end_to_end.py │ ├── test_end_to_end_db_auth.py │ ├── test_end_to_end_theme.py │ ├── test_grafana_provider.py │ ├── test_pushing_prometheus_alerts.py │ ├── 
test_pushing_prometheus_config.yaml │ ├── test_pushing_prometheus_rules.yaml │ ├── test_redis_sentinel_e2e_full.py │ ├── test_topology.py │ ├── utils.py │ ├── workflow-alert-log.yaml │ ├── workflow-incident-log.yaml │ ├── workflow-inputs-alert.yaml │ ├── workflow-inputs.yaml │ ├── workflow-interval.yaml │ ├── workflow-invalid-sample.yaml │ ├── workflow-quotes-sample.yaml │ ├── workflow-sample-npm.yaml │ ├── workflow-sample.yaml │ └── workflow-valid-sample.yaml ├── fixtures/ │ ├── __init__.py │ ├── client.py │ └── workflow_manager.py ├── keycloak-test-realm-export.json ├── providers/ │ └── jira_provider/ │ └── test_jira_priority_fix.py ├── provision/ │ ├── workflows_1/ │ │ ├── provision_example_1.yml │ │ ├── provision_example_2.yml │ │ └── provision_example_3.yml │ ├── workflows_2/ │ │ ├── provision_example_1.yml │ │ └── provision_example_2.yml │ ├── workflows_3/ │ │ └── workflows_with_no_name.yml │ └── workflows_4/ │ └── console_example.yml ├── test.json ├── test_actions.py ├── test_alert_dto.py ├── test_alert_evaluation.py ├── test_alert_tenrary.py ├── test_alert_utils.py ├── test_auth.py ├── test_auth_new.py ├── test_auto_resolve_workflow.py ├── test_batch_enrich_cel.py ├── test_conditions.py ├── test_contextmanager.py ├── test_counting.py ├── test_counting_integration.py ├── test_cyaml.py ├── test_dismissal_expiry_bug.py ├── test_enrichments.py ├── test_extraction_rules.py ├── test_functions.py ├── test_incidents.py ├── test_iohandler.py ├── test_jira_provider.py ├── test_keep_provider_time_delta.py ├── test_maintenance_windows_bl.py ├── test_metrics.py ├── test_pagerduty_provider.py ├── test_parser.py ├── test_provider_factory.py ├── test_provider_reprovisioning.py ├── test_provider_validation_fields.py ├── test_providers_api.py ├── test_providers_yaml_provisioning.py ├── test_provisioning.py ├── test_rules_api.py ├── test_rules_engine.py ├── test_search_alerts.py ├── test_search_alerts_configuration.py ├── test_secretmanager.py ├── test_servicenow_provider.py 
├── test_settings_api.py ├── test_smtp_provider.py ├── test_steps.py ├── test_teams_provider.py ├── test_topology.py ├── test_workflow_api.py ├── test_workflow_cel_filter.py ├── test_workflow_execution.py ├── test_workflow_filters.py ├── test_workflow_severity_comparisons.py ├── test_workflowmanager.py ├── test_workflows.py ├── test_workflows_update.py ├── test_workflowstore.py └── workflows/ ├── db_disk_space_for_testing.yml ├── providers_for_testing.yaml ├── reusable_actions_for_testing.yml ├── reusable_alert_for_testing.yml └── reusable_alert_with_actions_for_testing.yml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .cursor/rules/keep-ui-react-typescript.mdc ================================================ --- description: globs: alwaysApply: true --- --- description: Rules for writing frontend code at Keep (React + Typescript) globs: keep-ui/**/*.tsx, keep-ui/**/*.ts --- You are an expert in TypeScript, React, Next.js, SWR, Tailwind, and UX design. # Architecture Use Feature-Sliced Design Convention with modification: instead of `pages` and `app` we use default Next.js route-based folder structure. Example: - entities/ - incidents/ - api/ - lib/ - model/ - ui/ Top-level folders, called Layers: - widgets - features - entities - shared Each layer has segments, e.g. "entities/users". Each segment has slices - ui — everything related to UI display: UI components, date formatters, styles, etc. - api — backend interactions: request functions, data types, mappers, etc. - model — the data model: schemas, interfaces, stores, and business logic. - lib — library code that other modules on this slice need. - config — configuration files and feature flags. # Code Style and Structure - Write TypeScript with proper typing for all new code - Use functional programming patterns; avoid classes - Prefer iteration and modularization over code duplication.
- Use descriptive variable names with auxiliary verbs (e.g., isLoading, hasError). - Don't use `useEffect` where you can use ref function for dom-dependent things (e.g. ref={el => ...}) - Don't use `useState` where you can infer from props - Use named exports; avoid default exports - If you need to create a new base component, first look at existing ones in `@/shared/ui` # Naming Conventions - Always look around the codebase for naming conventions, and follow the best practices of the environment (e.g. use `camelCase` variables in JS). - Use clear, yet functional names (`searchResults` vs `data`). - React components are PascalCase (`IncidentList`). - Props for components and hooks are PascalCase and end with `Props`, e.g. `WorkflowBuilderWidgetProps`, return value for hooks is PascalCase and ends with `Value`, e.g. `UseIncidentActionsValue` - Name the `.ts` file according to its main export: `IncidentList.ts` or `IncidentList.tsx` or `useIncidents.ts`. Pay attention to the case. - Avoid `index.ts`, `styles.css`, and other generic names, even if this is the only file in a directory. # Data Fetching - Use useSWR for fetching data, create or extend hooks in @/entities/{entity}/model/use{Entity}.ts which encapsulate the fetching logic - Create a dedicated keys file @/entities/{entity}/lib/{entity}Keys.ts to manage SWR cache keys.
Structure it as an object with methods for different operations: ```export const entityKeys = { all: "entityName", list: (query: QueryParams) => [...], detail: (id: string) => [...], getListMatcher: () => (key: any) => boolean }``` - For query-based endpoints, construct cache keys by joining parameters with "::", filtering out falsy values: ```list: (query: QueryParams) => [ entityKeys.all, "list", query.param1, query.param2 ].filter(Boolean).join("::")``` - For create, update, delete actions: - Create or extend hook in @/entities/{entity}/model/use{Entity}Actions.ts - Create a dedicated revalidation hook (e.g., use{Entity}Revalidation.ts) to handle cache invalidation - Revalidate both specific items and list queries after mutations - Include success/error toast notifications for user feedback - Handle file uploads and other complex operations within the actions hook # UI and Styling - Use Tailwind CSS as the primary styling solution - For non-Tailwind cases: - Use CSS with component-specific files - Namespace under component class (.DropdownMenu) - Follow BEM for modals (.DropdownMenu__modal) - Import styles directly (import './DropdownMenu.css') - Replace custom CSS with Tailwind when possible ================================================ FILE: .cursor/rules/keep-ui-tests.mdc ================================================ --- description: globs: alwaysApply: true --- --- description: Rules and guidelines for writing and running React tests globs: *.spec.tsx, *.test.tsx, *.test.ts, *.spec.ts --- # Writing frontend tests Place tests in the __tests__ folder in the module, e.g.
tests for file `/features/workflows/model/useWorkflows.tsx` should be `/features/workflows/model/__tests__/useWorkflows.test.tsx` # Running frontend tests Please run tests with the command `npm run test` in the keep-ui folder. For example: cd keep-ui && npm run test ================================================ FILE: .dockerignore ================================================ docs/* keep-ui/node_modules keep-ui/.next/* keep-ui/.env.local .venv/ .vercel/ .vscode/ .github/ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: "[🐛 Bug]: " labels: "" assignees: "" --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: Support url: https://github.com/keephq/keep/discussions about: Get help! Ask questions, get support, and share ideas. - name: Chat url: https://slack.keephq.dev about: Engage with the Keep team and other community members over Slack. - name: Twitter url: https://twitter.com/keepalerting about: Follow us and stay up to date with Keep.
================================================ FILE: .github/ISSUE_TEMPLATE/documentation.md ================================================ --- name: Documentation issue about: Any issue related to Keep's documentation title: "[📃 Docs]: " labels: "Documentation" assignees: "" --- **Describe the documentation change** Add any context about the documentation change you aim to make. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: "[➕ Feature]: " labels: "" assignees: "" --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. ================================================ FILE: .github/ISSUE_TEMPLATE/new_provider_request.md ================================================ --- name: New provider request about: Suggest a new provider for Keep title: "[🔌 Provider]: " labels: "Provider" assignees: "" --- **Describe the provider you want to add** Add any context about the tool and the kind of data you would want to pull/push from the provider. **Describe your use case** Will this integration help you use Keep? **Are you already using Keep?** Yes/No **Additional context** Add any other context or screenshots about the provider request here. ================================================ FILE: .github/ISSUE_TEMPLATE/use_case.md ================================================ --- name: Use case about: Tell us how you use Keep and we will add it to the docs.
title: '' labels: '' assignees: '' --- **What do you use Keep for?** A clear and concise description of what you do with Keep. ================================================ FILE: .github/workflows/auto-release.yml ================================================ name: Auto Release on Version Change on: push: branches: - main paths: - "pyproject.toml" jobs: check-and-release: runs-on: ubuntu-latest permissions: contents: write steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.11" - name: Extract version from pyproject.toml id: get_version run: | VERSION=$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Check if release exists id: check_release run: | TAG_EXISTS=$(git tag -l "v${{ steps.get_version.outputs.version }}") if [ -z "$TAG_EXISTS" ]; then echo "exists=false" >> $GITHUB_OUTPUT else echo "exists=true" >> $GITHUB_OUTPUT fi - name: Create Release if: steps.check_release.outputs.exists == 'false' uses: softprops/action-gh-release@v1 with: tag_name: v${{ steps.get_version.outputs.version }} name: Release v${{ steps.get_version.outputs.version }} generate_release_notes: true draft: false prerelease: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/auto-resolve-keep.yml ================================================ name: Auto resolve Keep incident/alert on: workflow_dispatch: inputs: incident_id: description: "Keep incident ID to resolve" required: false type: string alert_fingerprint: description: "Keep alert fingerprint to resolve" required: false type: string status: description: "Status to set" required: false type: string default: "resolved" pull_request: types: [closed] branches: - main jobs: auto-resolve-keep: runs-on: ubuntu-latest steps: - name: Extract Keep ID from PR description if: github.event_name == 
'pull_request' id: extract_id run: | PR_DESC="${{ github.event.pull_request.body }}" INCIDENT_ID=$(echo "$PR_DESC" | grep -ioP 'close keep incident:\s*\K[a-f0-9-]+' || true) ALERT_FINGERPRINT=$(echo "$PR_DESC" | grep -ioP 'close keep alert:\s*\K[a-f0-9-]+' || true) echo "incident_id=$INCIDENT_ID" >> $GITHUB_OUTPUT echo "alert_fingerprint=$ALERT_FINGERPRINT" >> $GITHUB_OUTPUT - name: Set final IDs id: set_ids run: | FINAL_INCIDENT_ID="${{ inputs.incident_id || steps.extract_id.outputs.incident_id }}" FINAL_ALERT_FINGERPRINT="${{ inputs.alert_fingerprint || steps.extract_id.outputs.alert_fingerprint }}" echo "final_incident_id=$FINAL_INCIDENT_ID" >> $GITHUB_OUTPUT echo "final_alert_fingerprint=$FINAL_ALERT_FINGERPRINT" >> $GITHUB_OUTPUT - name: Auto resolve Keep incident if: | (github.event_name == 'pull_request' && github.event.pull_request.merged == true && steps.set_ids.outputs.final_incident_id != '') || (github.event_name == 'workflow_dispatch' && inputs.incident_id != '') uses: fjogeleit/http-request-action@v1 with: url: "https://api.keephq.dev/incidents/${{ steps.set_ids.outputs.final_incident_id }}/status" method: "POST" customHeaders: '{"X-API-KEY": "${{ secrets.KEEP_API_KEY }}", "Content-Type": "application/json"}' data: '{"status": "${{ inputs.status || ''resolved'' }}"}' - name: Auto enrich Keep incident if: | (github.event_name == 'pull_request' && github.event.pull_request.merged == true && steps.set_ids.outputs.final_incident_id != '') || (github.event_name == 'workflow_dispatch' && inputs.incident_id != '') uses: fjogeleit/http-request-action@v1 with: url: "https://api.keephq.dev/incidents/${{ steps.set_ids.outputs.final_incident_id }}/enrich" method: "POST" customHeaders: '{"X-API-KEY": "${{ secrets.KEEP_API_KEY }}", "Content-Type": "application/json"}' data: '{"enrichments":{"incident_title":"${{ github.event.pull_request.title || ''Manual resolution'' }}","incident_url":"${{ github.event.pull_request.html_url || github.server_url }}//${{ 
github.repository }}/actions/runs/${{ github.run_id }}", "incident_id": "${{ github.run_id }}", "incident_provider": "github"}}' - name: Auto resolve Keep alert if: | (github.event_name == 'pull_request' && github.event.pull_request.merged == true && steps.set_ids.outputs.final_alert_fingerprint != '') || (github.event_name == 'workflow_dispatch' && inputs.alert_fingerprint != '') uses: fjogeleit/http-request-action@v1 with: url: "https://api.keephq.dev/alerts/enrich?dispose_on_new_alert=true" method: "POST" customHeaders: '{"Content-Type": "application/json", "X-API-KEY": "${{ secrets.KEEP_API_KEY }}"}' data: '{"enrichments":{"status":"${{ inputs.status || ''resolved'' }}","dismissed":false,"dismissUntil":"","note":"${{ github.event.pull_request.title || ''Manual resolution'' }}","ticket_url":"${{ github.event.pull_request.html_url || github.server_url }}//${{ github.repository }}/actions/runs/${{ github.run_id }}"},"fingerprint":"${{ steps.set_ids.outputs.final_alert_fingerprint }}"}' ================================================ FILE: .github/workflows/but-to-project.yml ================================================ name: Add bugs to project board on: issues: types: - labeled jobs: add-to-project: name: Add bug to project board runs-on: ubuntu-latest if: github.event.label.name == 'Bug' steps: - uses: actions/add-to-project@v0.5.0 with: project-url: https://github.com/orgs/keephq/projects/11 github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} ================================================ FILE: .github/workflows/developer-onboarding-notification.yml ================================================ name: Celebrating Contributions on: pull_request_target: types: [closed] permissions: pull-requests: write jobs: comment_on_merged_pull_request: if: github.event.pull_request.merged == true runs-on: ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v4 - name: Set Environment Variables env: AUTHOR: ${{ github.event.pull_request.user.login }} 
REPO: ${{ github.event.repository.name }} OWNER: ${{ github.event.repository.owner.login }} run: | echo "AUTHOR=${AUTHOR}" >> $GITHUB_ENV echo "REPO=${REPO}" >> $GITHUB_ENV echo "OWNER=${OWNER}" >> $GITHUB_ENV - name: Count Merged Pull Requests id: count_merged_pull_requests uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | try { const author = process.env.AUTHOR; const repo = process.env.REPO; const owner = process.env.OWNER; const { data } = await github.rest.search.issuesAndPullRequests({ q: `repo:${owner}/${repo} type:pr state:closed author:${author}` }); const prCount = data.items.filter(pr => pr.pull_request.merged_at).length; core.exportVariable('PR_COUNT', prCount); } catch (error) { core.setFailed(`Error counting merged pull requests: ${error.message}`); } - name: Comment on the Merged Pull Request uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | try { const prCount = parseInt(process.env.PR_COUNT); const author = process.env.AUTHOR; const prNumber = context.payload.pull_request.number; const repo = process.env.REPO; function getRandomEmoji() { const emojis = ['🎉', '🚀', '💪', '🌟', '🏆', '🎊', '🔥', '👏', '🌈', '🚂']; return emojis[Math.floor(Math.random() * emojis.length)]; } function getMessage(count) { const emoji = getRandomEmoji(); switch(count) { case 1: return `${emoji} **Fantastic work @${author}!** Your very first PR to ${repo} has been merged! 🎉🥳\n\n` + `You've just taken your first step into open-source, and we couldn't be happier to have you onboard. 🙌\n` + `If you're feeling adventurous, why not dive into another issue and keep contributing? The community would love to see more from you! 🚀\n\n` + `For any support, feel free to reach out on the community: https://slack.keephq.dev. Happy coding! 👩‍💻👨‍💻`; case 2: return `${emoji} **Well done @${author}!** Two PRs merged already! 
🎉🥳\n\n` + `With your second PR, you're on a roll, and your contributions are already making a difference. 🌟\n` + `Looking forward to seeing even more contributions from you. See you in Slack https://slack.keephq.dev 🚀`; case 3: return `${emoji} **You're on fire, @${author}!** Three PRs merged and counting! 🔥🎉\n\n` + `Your consistent contributions are truly impressive. You're becoming a valued member of our community! 💖\n` + `Have you considered taking on some more challenging issues? We'd love to see what you can do! 💪\n\n` + `Remember, the team is always here to support you. Keep blazing that trail! 🚀`; case 5: return `${emoji} **High five, @${author}!** You've hit the incredible milestone of 5 merged PRs! 🖐️✨\n\n` + `Your dedication to ${repo} is outstanding. You're not just contributing code; you're shaping the future of this project! 🌠\n` + `We'd love to hear your thoughts on the project. Any ideas for new features or improvements? 🤔\n\n` + `The whole team applaud your efforts. You're a superstar! 🌟`; case 10: return `${emoji} **Double digits, @${author}!** 10 merged PRs is a massive achievement! 🏆🎊\n\n` + `Your impact on ${repo} is undeniable. You've become a pillar of our community! 🏛️\n` + `We'd be thrilled to have you take on a mentorship role for newer contributors. Interested? 🧑‍🏫\n\n` + `Everyone here are in awe of your contributions. You're an open source hero! 
🦸‍♀️🦸‍♂️`; default: return ""; } } const message = getMessage(prCount); if (message) { await github.rest.issues.createComment({ owner: process.env.OWNER, repo: process.env.REPO, issue_number: prNumber, body: message }); } } catch (error) { core.setFailed(`Error creating comment: ${error.message}`); } ================================================ FILE: .github/workflows/lint-pr.yml ================================================ name: "Lint PR" on: pull_request_target: types: - opened - edited - synchronize - reopened permissions: pull-requests: write # Add explicit permissions for PR comments jobs: main: name: Validate PR title runs-on: ubuntu-latest steps: - name: lint_pr_title id: lint_pr_title uses: amannn/action-semantic-pull-request@v5.1.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - uses: marocchino/sticky-pull-request-comment@v2 # When the previous steps fails, the workflow would stop. By adding this # condition you can continue the execution with the populated error message. if: always() && (steps.lint_pr_title.outputs.error_message != null) with: header: pr-title-lint-error message: | Hey there and thank you for opening this pull request! 👋🏼 We require pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted. 
Details: ``` ${{ steps.lint_pr_title.outputs.error_message }} ``` # Delete a previous comment when the issue has been resolved - if: ${{ steps.lint_pr_title.outputs.error_message == null }} uses: marocchino/sticky-pull-request-comment@v2 with: header: pr-title-lint-error delete: true links: runs-on: ubuntu-latest name: Validate PR to Issue link permissions: issues: read pull-requests: write steps: - uses: nearform-actions/github-action-check-linked-issues@v1 id: check-linked-issues with: exclude-branches: "release/**, dependabot/**" # OPTIONAL: Use the output from the `check-linked-issues` step - name: Get the output run: echo "How many linked issues? ${{ steps.check-linked-issues.outputs.linked_issues_count }}" ================================================ FILE: .github/workflows/release-workflow-schema.yml ================================================ name: Release JSON Schema on: push: branches: - main paths: - ".github/workflows/release-workflow-schema.yml" - "pyproject.toml" - "keep/providers/**" - "keep-ui/entities/workflows/model/yaml.schema.ts" pull_request: paths: - ".github/workflows/release-workflow-schema.yml" - "pyproject.toml" - "keep/providers/**" - "keep-ui/entities/workflows/model/yaml.schema.ts" workflow_dispatch: env: PYTHON_VERSION: 3.11 STORAGE_MANAGER_DIRECTORY: /tmp/storage-manager SCHEMA_REPO_NAME: keephq/keep-workflow-schema jobs: generate-schema: runs-on: ubuntu-latest permissions: contents: read outputs: version: ${{ steps.get_version.outputs.version }} steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - name: Extract version from pyproject.toml id: get_version run: | VERSION=$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: "3.11" - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - 
name: Cache dependencies id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev - name: Save providers list run: | PYTHONPATH="${{ github.workspace }}" poetry run python ./scripts/save_providers_list.py - name: Set up Node.js 20 uses: actions/setup-node@v3 with: node-version: 20 cache: "npm" cache-dependency-path: keep-ui/package-lock.json - name: Install Node dependencies working-directory: keep-ui run: npm ci - name: Generate JSON Schema working-directory: keep-ui run: npm run build:workflow-yaml-json-schema - name: Upload schema artifact uses: actions/upload-artifact@v4 with: name: workflow-schema path: workflow-yaml-json-schema.json release-schema: runs-on: ubuntu-latest needs: generate-schema if: ${{ github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork }} steps: - name: Download schema artifact uses: actions/download-artifact@v4 with: name: workflow-schema path: . 
- name: Checkout schema repository uses: actions/checkout@v4 with: repository: ${{ env.SCHEMA_REPO_NAME }} token: ${{ secrets.SCHEMA_REPO_PAT }} path: schema-repo # Branch names are untrusted text on pull requests, so they are handed to the shell through env vars instead of being expanded into the script - name: Set target branch variable id: set_branch env: HEAD_REF: ${{ github.head_ref }} REF_NAME: ${{ github.ref_name }} run: | if [ "${{ github.event_name }}" = "pull_request" ]; then echo "branch=$HEAD_REF" >> $GITHUB_OUTPUT else echo "branch=$REF_NAME" >> $GITHUB_OUTPUT fi - name: Create or switch to target branch in schema repo working-directory: schema-repo env: TARGET_BRANCH: ${{ steps.set_branch.outputs.branch }} run: | git fetch origin # The checkout above only creates a local branch for the ref it checked out, so look the target up on the remote; plain `git checkout` then creates a local branch tracking it if git show-ref --verify --quiet "refs/remotes/origin/$TARGET_BRANCH"; then git checkout "$TARGET_BRANCH" else git checkout -b "$TARGET_BRANCH" fi - name: Copy schema to target repository run: | cp workflow-yaml-json-schema.json schema-repo/schema.json # Update schema with version info jq --arg version "${{ needs.generate-schema.outputs.version }}" \ --arg id "https://raw.githubusercontent.com/${{ env.SCHEMA_REPO_NAME }}/v${{ needs.generate-schema.outputs.version }}/schema.json" \ '.
+ {version: $version, "$id": $id}' \ schema-repo/schema.json > schema-repo/schema.tmp.json mv schema-repo/schema.tmp.json schema-repo/schema.json - name: Check if schema changed id: check_changes working-directory: schema-repo run: | git add schema.json if git diff --cached --quiet schema.json; then echo "changed=false" >> $GITHUB_OUTPUT else echo "changed=true" >> $GITHUB_OUTPUT fi # The branch name reaches the script via TARGET_BRANCH (env) so it is never interpreted as shell code; tags are only created for pushes to main - name: Commit and push schema if: steps.check_changes.outputs.changed == 'true' working-directory: schema-repo env: TARGET_BRANCH: ${{ steps.set_branch.outputs.branch }} run: | git config user.name "Keep Schema Bot" git config user.email "no-reply@keephq.dev" git commit -m "Release schema v${{ needs.generate-schema.outputs.version }}" git push origin "$TARGET_BRANCH" if [ "$TARGET_BRANCH" = "main" ]; then git tag "v${{ needs.generate-schema.outputs.version }}" git push origin "v${{ needs.generate-schema.outputs.version }}" fi - name: Create GitHub Release if: steps.check_changes.outputs.changed == 'true' && steps.set_branch.outputs.branch == 'main' uses: softprops/action-gh-release@v1 with: repository: ${{ env.SCHEMA_REPO_NAME }} tag_name: v${{ needs.generate-schema.outputs.version }} name: Release v${{ needs.generate-schema.outputs.version }} body: | Automated release of schema version v${{ needs.generate-schema.outputs.version }}.
env: GITHUB_TOKEN: ${{ secrets.SCHEMA_REPO_PAT }} ================================================ FILE: .github/workflows/release.yml ================================================ name: Keep Release on: workflow_dispatch: jobs: release: runs-on: ubuntu-latest concurrency: release permissions: id-token: write contents: write pull-requests: write steps: - uses: actions/checkout@v3 with: fetch-depth: 0 persist-credentials: false ref: main - name: Release Keep id: release-step uses: python-semantic-release/python-semantic-release@v9.8.7 with: git_committer_name: Keep Release Bot git_committer_email: no-reply@keephq.dev github_token: ${{ secrets.GITHUB_TOKEN }} push: false tag: true commit: true - name: Open PR for release branch id: pr-step uses: peter-evans/create-pull-request@v6.1.0 with: committer: Keep Release Bot title: "Release - ${{ steps.release-step.outputs.version }}" branch: release/${{ steps.release-step.outputs.version }} body: "This PR contains the latest release changes." draft: false base: main - uses: peter-evans/enable-pull-request-automerge@v3 with: token: ${{ secrets.GITHUB_TOKEN }} pull-request-number: ${{ steps.pr-step.outputs.pull-request-number }} - name: Create release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} tag: "v${{ steps.release-step.outputs.version }}" run: | gh release create "$tag" \ --repo="$GITHUB_REPOSITORY" \ --title="v${{ steps.release-step.outputs.version }}" \ --target="release/${{ steps.release-step.outputs.version }}" \ --generate-notes ================================================ FILE: .github/workflows/run-e2e-tests.yml ================================================ on: workflow_call: inputs: db-type: required: true type: string redis_enabled: required: true type: boolean python-version: required: true type: string is-fork: required: true type: boolean backend-image-name: required: true type: string frontend-image-name: required: true type: string jobs: # Run tests with all services in one job run-tests: 
runs-on: ubuntu-latest permissions: contents: read packages: write env: REDIS: ${{ inputs.redis_enabled }} REDIS_HOST: keep-redis REDIS_PORT: 6379 BACKEND_IMAGE: ${{ inputs.backend-image-name }} FRONTEND_IMAGE: ${{ inputs.frontend-image-name }} steps: - name: Checkout uses: actions/checkout@v3 - name: Login to GitHub Container Registry if: ${{ inputs.is-fork != true }} uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set up Python ${{ inputs.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ inputs.python-version }} - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: Restore dependencies cache id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} # Always install dependencies to ensure venv is valid # When cached, this completes quickly; when broken, this fixes it - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev - name: Get Playwright version from poetry.lock id: playwright-version run: | PLAYWRIGHT_VERSION=$(grep "playwright" poetry.lock -A 5 | grep "version" | head -n 1 | cut -d'"' -f2) echo "version=$PLAYWRIGHT_VERSION" >> $GITHUB_OUTPUT - name: Cache Playwright browsers id: playwright-cache uses: actions/cache@v4.2.0 with: path: ~/.cache/ms-playwright key: playwright-${{ steps.playwright-version.outputs.version }} # The cache step above only restores ~/.cache/ms-playwright (the browser binaries); on a cache hit the OS packages are still installed on this runner via install-deps - name: Install Playwright and dependencies run: | if [ "${{ steps.playwright-cache.outputs.cache-hit }}" != "true" ]; then poetry run playwright install --with-deps else poetry run playwright install-deps fi # For forks: Build images locally again since they don't persist between jobs - name: Set up Docker Buildx if: ${{ inputs.is-fork == true }} id: buildx uses: docker/setup-buildx-action@v2 - name: Rebuild frontend image locally for fork PRs if: ${{ inputs.is-fork == true }} uses: docker/build-push-action@v4 with: context: keep-ui file:
./docker/Dockerfile.ui push: false load: true tags: | keep-frontend:local cache-from: type=gha cache-to: type=gha,mode=max build-args: | BUILDKIT_INLINE_CACHE=1 - name: Rebuild backend image locally for fork PRs if: ${{ inputs.is-fork == true }} uses: docker/build-push-action@v4 with: context: . file: ./docker/Dockerfile.api push: false load: true tags: | keep-backend:local cache-from: type=gha cache-to: type=gha,mode=max build-args: | BUILDKIT_INLINE_CACHE=1 # Create a modified compose file with our built images - name: Create modified docker-compose file with built images run: | cp tests/e2e_tests/docker-compose-e2e-${{ inputs.db-type }}.yml tests/e2e_tests/docker-compose-modified.yml # Replace image placeholders with actual image references sed -i "s|%KEEPFRONTEND_IMAGE%|${{ env.FRONTEND_IMAGE }}|g" tests/e2e_tests/docker-compose-modified.yml sed -i "s|%KEEPBACKEND_IMAGE%|${{ env.BACKEND_IMAGE }}|g" tests/e2e_tests/docker-compose-modified.yml # cat the modified file for debugging cat tests/e2e_tests/docker-compose-modified.yml # Start ALL services in one go - name: Start ALL services run: | echo "Starting ALL services for ${{ inputs.db-type }}..." # Pull the required images first (only needed for non-fork builds) if [[ "${{ inputs.is-fork }}" != "true" ]]; then docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml pull fi # Start all services together docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml up -d # Show running containers docker ps # Show the images sha of the running containers docker images # Wait for all services to be ready - name: Wait for services to be ready run: | # Function for exponential backoff function wait_for_service() { local service_name=$1 local check_command=$2 local max_attempts=$3 local compose_service=$4 # Docker Compose service name local attempt=0 local wait_time=1 echo "Waiting for $service_name to be ready..." 
until eval "$check_command"; do if [ "$attempt" -ge "$max_attempts" ]; then echo "Max attempts reached, exiting..." # Show final logs before exiting if [ ! -z "$compose_service" ]; then echo "===== FINAL LOGS FOR ON ERROR EXIT $compose_service =====" docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs $compose_service echo "==========================================" fi exit 1 fi echo "Waiting for $service_name... (Attempt: $((attempt+1)), waiting ${wait_time}s)" # Print logs using docker compose if [ ! -z "$compose_service" ]; then echo "===== RECENT LOGS FOR $compose_service =====" docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs $compose_service --tail 100 echo "==========================================" fi attempt=$((attempt+1)) sleep $wait_time # Exponential backoff with max of 8 seconds wait_time=$((wait_time * 2 > 8 ? 8 : wait_time * 2)) done echo "$service_name is ready!" # last time, print logs using docker compose if [ ! -z "$compose_service" ]; then echo "===== FINAL LOGS FOR $compose_service =====" docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs $compose_service --tail 100 echo "==========================================" fi } # Database checks if [ "${{ inputs.db-type }}" == "mysql" ]; then wait_for_service "MySQL Database" "docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml exec -T keep-database mysqladmin ping -h \"localhost\" --silent" 10 "keep-database" wait_for_service "MySQL Database (DB AUTH)" "docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml exec -T keep-database-db-auth mysqladmin ping -h \"localhost\" --silent" 10 "keep-database-db-auth" elif [ "${{ inputs.db-type }}" == "postgres" ]; then wait_for_service "Postgres Database" "docker compose -p keep --project-directory . 
-f tests/e2e_tests/docker-compose-modified.yml exec -T keep-database pg_isready -h localhost -U keepuser" 10 "keep-database" wait_for_service "Postgres Database (DB AUTH)" "docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml exec -T keep-database-db-auth pg_isready -h localhost -U keepuser" 10 "keep-database-db-auth" fi # Wait for services with health checks wait_for_service "Keep backend" "curl --output /dev/null --silent --fail http://localhost:8080/healthcheck" 15 "keep-backend" wait_for_service "Keep backend (DB AUTH)" "curl --output /dev/null --silent --fail http://localhost:8081/healthcheck" 15 "keep-backend-db-auth" wait_for_service "Keep frontend" "curl --output /dev/null --silent --fail http://localhost:3000/" 15 "keep-frontend" wait_for_service "Keep frontend (DB AUTH)" "curl --output /dev/null --silent --fail http://localhost:3001/" 15 "keep-frontend-db-auth" # Give Prometheus and Grafana extra time to initialize # (using direct curl commands instead of container exec) echo "Waiting for Prometheus to be ready..." MAX_ATTEMPTS=15 for i in $(seq 1 $MAX_ATTEMPTS); do if curl --output /dev/null --silent --fail http://localhost:9090/-/healthy; then echo "Prometheus is ready!" break elif [ $i -eq $MAX_ATTEMPTS ]; then echo "Prometheus did not become ready in time, but continuing..." docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs prometheus-server-for-test-target --tail 50 else echo "Waiting for Prometheus... Attempt $i/$MAX_ATTEMPTS" sleep 5 fi done echo "Waiting for Grafana to be ready..." MAX_ATTEMPTS=15 for i in $(seq 1 $MAX_ATTEMPTS); do if curl --output /dev/null --silent --fail http://localhost:3002/api/health; then echo "Grafana is ready!" break elif [ $i -eq $MAX_ATTEMPTS ]; then echo "Grafana did not become ready in time, but continuing..." docker compose -p keep --project-directory . 
-f tests/e2e_tests/docker-compose-modified.yml logs grafana --tail 50 else echo "Waiting for Grafana... Attempt $i/$MAX_ATTEMPTS" sleep 5 fi done # Give everything a bit more time to stabilize echo "Giving services additional time to stabilize..." sleep 10 # Debug the environment before running tests - name: Debug environment run: | echo "Checking all container status..." docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml ps echo "Network information:" docker network ls docker network inspect keep_default || true echo "Testing Prometheus API..." curl -v http://localhost:9090/api/v1/status/config || echo "Prometheus API not responding, but continuing..." echo "Testing Grafana API..." curl -v http://localhost:3002/api/health || echo "Grafana API not responding, but continuing..." echo "Test Keep Frontend..." curl -v http://localhost:3000/ || echo "Keep Frontend not responding, but continuing..." echo "Test Keep Frontend with DB Auth..." curl -v http://localhost:3001/ || echo "Keep Frontend with DB Auth not responding, but continuing..." echo "Listing available ports:" # Purely diagnostic: a grep miss (or a missing netstat) must not fail the step, like the other probes above netstat -tuln | grep -E '3000|3001|3002|8080|8081|9090' || true # Run e2e tests - name: Run e2e tests and report coverage run: | echo "Running tests..." poetry run coverage run --branch -m pytest -v tests/e2e_tests/ -n 4 --dist=loadfile echo "Tests completed!" - name: Convert coverage results to JSON (for CodeCov support) run: poetry run coverage json --omit="keep/providers/*" - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 with: fail_ci_if_error: false files: coverage.json verbose: true # Collect logs - name: Dump logs if: always() run: | docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs keep-backend > backend_logs-${{ inputs.db-type }}.txt docker compose -p keep --project-directory .
-f tests/e2e_tests/docker-compose-modified.yml logs keep-frontend > frontend_logs-${{ inputs.db-type }}.txt docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs keep-backend-db-auth > backend_logs-${{ inputs.db-type }}-db-auth.txt docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs keep-frontend-db-auth > frontend_logs-${{ inputs.db-type }}-db-auth.txt docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs prometheus-server-for-test-target > prometheus_logs-${{ inputs.db-type }}.txt docker compose -p keep --project-directory . -f tests/e2e_tests/docker-compose-modified.yml logs grafana > grafana_logs-${{ inputs.db-type }}.txt continue-on-error: true # Upload artifacts - name: Upload test artifacts on failure if: always() uses: actions/upload-artifact@v4.4.3 with: name: test-artifacts-db-${{ inputs.db-type }}-redis-${{ inputs.redis_enabled }} path: | playwright_dump_*.html playwright_dump_*.png playwright_dump_*.txt playwright_dump_*.json backend_logs-${{ inputs.db-type }}.txt frontend_logs-${{ inputs.db-type }}.txt backend_logs-${{ inputs.db-type }}-db-auth.txt frontend_logs-${{ inputs.db-type }}-db-auth.txt prometheus_logs-${{ inputs.db-type }}.txt grafana_logs-${{ inputs.db-type }}.txt continue-on-error: true # Tear down environment - name: Tear down environment if: always() run: | docker compose -p keep --project-directory . 
-f tests/e2e_tests/docker-compose-modified.yml down ================================================ FILE: .github/workflows/sync-keep-workflows.yml ================================================ # A workflow that syncs the Keep workflows in examples/workflows by applying them with the Keep CLI name: "Sync Keep Workflows" on: workflow_dispatch: inputs: keep_api_key: description: 'Keep API Key' required: false keep_api_url: description: 'Keep API URL' required: false default: 'https://api.keephq.dev' # push: # paths: # - 'examples/workflows/**' jobs: sync-workflows: name: Sync workflows to Keep runs-on: ubuntu-latest # Use the Keep CLI image container: image: us-central1-docker.pkg.dev/keephq/keep/keep-cli:latest env: KEEP_API_KEY: ${{ secrets.KEEP_API_KEY || github.event.inputs.keep_api_key }} KEEP_API_URL: ${{ secrets.KEEP_API_URL || github.event.inputs.keep_api_url }} steps: - name: Check out the repo uses: actions/checkout@v2 - name: Run Keep CLI run: | keep workflow apply -f examples/workflows ================================================ FILE: .github/workflows/test-docs.yml ================================================ name: Test docs on: push: paths: - 'keep/providers/**' - 'docs/**' - 'examples/**' pull_request: paths: - 'keep/providers/**' - 'docs/**' - 'examples/**' workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.head_ref }}-${{ github.job }} cancel-in-progress: true env: PYTHON_VERSION: 3.11 STORAGE_MANAGER_DIRECTORY: /tmp/storage-manager jobs: tests-docs: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - uses: chartboost/ruff-action@v1 with: src: "./keep" - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: cache deps id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies
using poetry run: poetry install --no-interaction --no-root --with dev - name: Validate docs/providers/overview.mdx run: | cd scripts; poetry run python ./docs_get_providers_list.py --validate - name: Validate snippets for providers run: | poetry run python ./scripts/docs_render_provider_snippets.py --validate - name: Validate broken links and navigation run: | npm i -g mintlify; cd docs && mintlify broken-links; cd ../scripts; ./docs_validate_navigation.sh; # Todo: validate if openapi schema is matching with the code ================================================ FILE: .github/workflows/test-pr-e2e.yml ================================================ name: Tests (E2E) on: workflow_dispatch: pull_request: paths: - "keep/**" - "keep-ui/**" - "tests/**" # Add permissions for GitHub Container Registry permissions: contents: read packages: write concurrency: group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true env: PYTHON_VERSION: 3.11 STORAGE_MANAGER_DIRECTORY: /tmp/storage-manager # MySQL server environment variables MYSQL_ROOT_PASSWORD: keep MYSQL_DATABASE: keep # Postgres environment variables POSTGRES_USER: keepuser POSTGRES_PASSWORD: keeppassword POSTGRES_DB: keepdb # To test if imports are working properly EE_ENABLED: true # Docker Compose project name COMPOSE_PROJECT_NAME: keep # Check if PR is from fork (external contributor) IS_FORK: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork }} jobs: # Prepare test environment in parallel with Docker builds prepare-test-environment: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: Cache dependencies id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: 
pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev - name: Get Playwright version from poetry.lock id: playwright-version run: | PLAYWRIGHT_VERSION=$(grep "playwright" poetry.lock -A 5 | grep "version" | head -n 1 | cut -d'"' -f2) echo "version=$PLAYWRIGHT_VERSION" >> $GITHUB_OUTPUT - name: Cache Playwright browsers id: playwright-cache uses: actions/cache@v4.2.0 with: path: ~/.cache/ms-playwright key: playwright-${{ steps.playwright-version.outputs.version }} - name: Install Playwright and dependencies run: | if [ "${{ steps.playwright-cache.outputs.cache-hit }}" != "true" ]; then poetry run playwright install --with-deps else poetry run playwright install-deps fi # Build images in parallel build-frontend: runs-on: ubuntu-latest outputs: image_name: ${{ steps.set-image-name.outputs.image_name }} permissions: contents: read packages: write steps: - name: Set image name id: set-image-name run: | if [[ "${{ env.IS_FORK }}" == "true" ]]; then echo "image_name=keep-frontend:local" >> $GITHUB_OUTPUT else echo "image_name=ghcr.io/${{ github.repository_owner }}/keep-frontend:${{ github.sha }}" >> $GITHUB_OUTPUT fi - name: Login to GitHub Container Registry if: ${{ env.IS_FORK != 'true' }} uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Checkout uses: actions/checkout@v3 - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v2 - name: Set cache key variables id: cache-keys env: BRANCH_REF: ${{ github.head_ref || github.ref_name }} run: | # Create a safe branch name for cache key (replace / with - and remove special chars) # The ref is read from an env var because branch names are untrusted input and must not be expanded into the script text SAFE_BRANCH=$(echo "$BRANCH_REF" | sed 's/\//-/g' | sed 's/[^a-zA-Z0-9._-]//g') echo "SAFE_BRANCH_NAME=${SAFE_BRANCH}" >> $GITHUB_OUTPUT # Create a hash ONLY of the dependencies section of package.json (package-lock.json is not hashed) # This ensures the hash only changes when dependencies change
DEPS_HASH=$(jq '.dependencies' keep-ui/package.json | sha256sum | cut -d ' ' -f 1) echo "DEPS_HASH=${DEPS_HASH:0:8}" >> $GITHUB_OUTPUT # Prints build context for troubleshooting; the raw branch name is passed via env so it is never interpreted as shell code - name: Debug repository and cache info env: BRANCH_REF: ${{ github.head_ref || github.ref_name }} run: | echo "Repository: ${{ github.repository }}" echo "Repository owner: ${{ github.repository_owner }}" echo "Branch: $BRANCH_REF" echo "Safe branch name: ${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" echo "Dependencies hash: ${{ steps.cache-keys.outputs.DEPS_HASH }}" echo "Is fork: ${{ env.IS_FORK }}" # Pre-check if branch cache exists (only for non-forks) - name: Check if branch cache exists id: branch-cache-exists if: ${{ env.IS_FORK != 'true' }} continue-on-error: true run: | BRANCH_CACHE_TAG="ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" if docker buildx imagetools inspect "$BRANCH_CACHE_TAG" &>/dev/null; then echo "Branch cache exists: $BRANCH_CACHE_TAG" echo "cache_exists=true" >> $GITHUB_OUTPUT else echo "Branch cache does not exist: $BRANCH_CACHE_TAG" echo "cache_exists=false" >> $GITHUB_OUTPUT fi - name: Log frontend cache status if: ${{ env.IS_FORK != 'true' }} run: | if [ "${{ steps.branch-cache-exists.outputs.cache_exists }}" == "true" ]; then echo "FRONTEND CACHE HIT ✅" echo "Cache tag: ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" else echo "FRONTEND CACHE MISS ❌" echo "Will attempt to use main branch cache and create a new branch cache" fi # For non-forks: Build and push to registry - name: Build and push frontend image with registry cache if: ${{ env.IS_FORK != 'true' }} uses: docker/build-push-action@v4 with: context: keep-ui file: ./docker/Dockerfile.ui push: true tags: | ghcr.io/${{ github.repository_owner }}/keep-frontend:${{ github.sha }} # Use registry-based caching with branch-specific tags cache-from: | type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{
steps.cache-keys.outputs.SAFE_BRANCH_NAME }} type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{ steps.cache-keys.outputs.DEPS_HASH }} type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-main cache-to: | type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }},mode=max type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-frontend:cache-${{ steps.cache-keys.outputs.DEPS_HASH }},mode=max # Add build args for better caching build-args: | BUILDKIT_INLINE_CACHE=1 # Verbose output outputs: type=image,push=true build-backend: runs-on: ubuntu-latest outputs: image_name: ${{ steps.set-image-name.outputs.image_name }} permissions: contents: read packages: write steps: - name: Set image name id: set-image-name run: | if [[ "${{ env.IS_FORK }}" == "true" ]]; then echo "image_name=keep-backend:local" >> $GITHUB_OUTPUT else echo "image_name=ghcr.io/${{ github.repository_owner }}/keep-backend:${{ github.sha }}" >> $GITHUB_OUTPUT fi - name: Login to GitHub Container Registry if: ${{ env.IS_FORK != 'true' }} uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Checkout uses: actions/checkout@v3 - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v2 - name: Set cache key variables id: cache-keys env: BRANCH_REF: ${{ github.head_ref || github.ref_name }} run: | # Create a safe branch name for cache key (replace / with - and remove special chars) # The ref is read from an env var because branch names are untrusted input and must not be expanded into the script text SAFE_BRANCH=$(echo "$BRANCH_REF" | sed 's/\//-/g' | sed 's/[^a-zA-Z0-9._-]//g') echo "SAFE_BRANCH_NAME=${SAFE_BRANCH}" >> $GITHUB_OUTPUT # Create a hash of poetry files for version-specific caching DEPS_HASH=$(cat poetry.lock pyproject.toml | sha256sum | cut -d ' ' -f 1) echo "DEPS_HASH=${DEPS_HASH:0:8}" >> $GITHUB_OUTPUT # Prints build context for troubleshooting; the raw branch name is passed via env so it is never interpreted as shell code - name: Debug repository and cache info env: BRANCH_REF: ${{ github.head_ref || github.ref_name }} run: | echo "Repository: ${{ github.repository }}" echo "Repository owner: ${{
github.repository_owner }}" echo "Branch: $BRANCH_REF" echo "Safe branch name: ${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" echo "Dependencies hash: ${{ steps.cache-keys.outputs.DEPS_HASH }}" echo "Is fork: ${{ env.IS_FORK }}" # Pre-check if branch cache exists (only for non-forks) - name: Check if branch cache exists id: branch-cache-exists if: ${{ env.IS_FORK != 'true' }} continue-on-error: true run: | BRANCH_CACHE_TAG="ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" if docker buildx imagetools inspect "$BRANCH_CACHE_TAG" &>/dev/null; then echo "Branch cache exists: $BRANCH_CACHE_TAG" echo "cache_exists=true" >> $GITHUB_OUTPUT else echo "Branch cache does not exist: $BRANCH_CACHE_TAG" echo "cache_exists=false" >> $GITHUB_OUTPUT fi - name: Log backend cache status if: ${{ env.IS_FORK != 'true' }} run: | if [ "${{ steps.branch-cache-exists.outputs.cache_exists }}" == "true" ]; then echo "BACKEND CACHE HIT ✅" echo "Cache tag: ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }}" else echo "BACKEND CACHE MISS ❌" echo "Will attempt to use main branch cache and create a new branch cache" fi # For non-forks: Build and push to registry - name: Build and push backend image with registry cache if: ${{ env.IS_FORK != 'true' }} uses: docker/build-push-action@v4 with: context: .
file: ./docker/Dockerfile.api push: true tags: | ghcr.io/${{ github.repository_owner }}/keep-backend:${{ github.sha }} # Use registry-based caching with branch-specific tags cache-from: | type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.DEPS_HASH }} type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }} cache-to: | type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.DEPS_HASH }},mode=max type=registry,ref=ghcr.io/${{ github.repository_owner }}/keep-backend:cache-${{ steps.cache-keys.outputs.SAFE_BRANCH_NAME }},mode=max # Add build args for better caching build-args: | BUILDKIT_INLINE_CACHE=1 # Verbose output outputs: type=image,push=true # Run tests with all services in one job run-mysql-with-redis: needs: [build-frontend, build-backend, prepare-test-environment] uses: ./.github/workflows/run-e2e-tests.yml with: db-type: mysql redis_enabled: true python-version: 3.11 is-fork: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork }} backend-image-name: ${{ needs.build-backend.outputs.image_name }} frontend-image-name: ${{ needs.build-frontend.outputs.image_name }} run-postgresql-without-redis: needs: [build-frontend, build-backend, prepare-test-environment] uses: ./.github/workflows/run-e2e-tests.yml with: db-type: postgres redis_enabled: false python-version: 3.11 is-fork: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork }} backend-image-name: ${{ needs.build-backend.outputs.image_name }} frontend-image-name: ${{ needs.build-frontend.outputs.image_name }} run-sqlite-without-redis: needs: [build-frontend, build-backend, prepare-test-environment] uses: ./.github/workflows/run-e2e-tests.yml with: db-type: sqlite redis_enabled: false python-version: 3.11 is-fork: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.repo.fork }} backend-image-name: ${{ needs.build-backend.outputs.image_name }} frontend-image-name: ${{ needs.build-frontend.outputs.image_name }} ================================================ FILE: .github/workflows/test-pr-integrations.yml ================================================ name: Integration Tests on: push: branches: - main paths: - "keep/**" - "tests/**" pull_request: paths: - "keep/**" - "tests/**" workflow_dispatch: permissions: actions: write concurrency: group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true env: PYTHON_VERSION: 3.11 STORAGE_MANAGER_DIRECTORY: /tmp/storage-manager MYSQL_ROOT_PASSWORD: keep MYSQL_DATABASE: keep ELASTIC_PASSWORD: keeptests jobs: integration-tests: runs-on: ubuntu-latest services: mysql: image: mysql:5.7 env: MYSQL_ROOT_PASSWORD: ${{ env.MYSQL_ROOT_PASSWORD }} MYSQL_DATABASE: ${{ env.MYSQL_DATABASE }} ports: - 3306:3306 options: >- --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4 ports: - 9200:9200 env: ELASTIC_PASSWORD: ${{ env.ELASTIC_PASSWORD }} bootstrap.memory_lock: "true" discovery.type: "single-node" ES_JAVA_OPTS: "-Xms2g -Xmx2g" xpack.security.enabled: "true" keycloak: image: us-central1-docker.pkg.dev/keephq/keep/keep-keycloak-test env: KC_DB: dev-mem KC_HTTP_RELATIVE_PATH: /auth KEYCLOAK_ADMIN: keep_kc_admin KEYCLOAK_ADMIN_PASSWORD: keep_kc_admin ports: - 8787:8080 options: >- --health-cmd="/opt/keycloak/bin/kcadm.sh config credentials --server http://localhost:8080/auth --realm master --user keep_kc_admin --password keep_kc_admin || exit 1" --health-interval=10s --health-timeout=5s --health-retries=4 steps: - name: Checkout uses: actions/checkout@v3 - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Poetry uses: 
snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: cache deps id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev - name: Run integration tests and report coverage run: | until nc -z 127.0.0.1 3306; do echo "waiting for MySQL..." sleep 1 done echo "MySQL is up and running!" poetry run coverage run --omit="*/test*" --branch -m pytest --integration --ignore=tests/e2e_tests/ - name: Convert coverage results to JSON run: poetry run coverage json --omit="keep/providers/*" - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 with: fail_ci_if_error: false files: coverage.json verbose: true ================================================ FILE: .github/workflows/test-pr-ut-ui.yml ================================================ name: Frontend Tests on: push: branches: - main paths: - "keep-ui/**" pull_request: paths: - "keep-ui/**" workflow_dispatch: permissions: actions: write concurrency: group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true env: NODE_VERSION: 20 jobs: frontend-tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - name: Set up Node.js ${{ env.NODE_VERSION }} uses: actions/setup-node@v3 with: node-version: ${{ env.NODE_VERSION }} cache: 'npm' cache-dependency-path: keep-ui/package-lock.json - name: Install dependencies working-directory: keep-ui run: npm ci - name: Run frontend tests working-directory: keep-ui run: npm run test # Optional: Add coverage reporting if your test setup supports it # Uncomment and adjust if you have coverage reporting configured # - name: Upload coverage reports to Codecov # uses: codecov/codecov-action@v3 # with: # fail_ci_if_error: false # directory: keep-ui/coverage # flags: frontend # verbose: true 
================================================ FILE: .github/workflows/test-pr-ut.yml ================================================ name: Unit Tests on: push: branches: - main paths: - "keep/**" - "tests/**" pull_request: paths: - "keep/**" - "tests/**" workflow_dispatch: permissions: actions: write concurrency: group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true env: PYTHON_VERSION: 3.11 SQLALCHEMY_WARN_20: 1 jobs: unit-tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - uses: chartboost/ruff-action@v1 with: src: "./keep" - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: cache deps id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev - name: Run unit tests and report coverage run: | poetry run coverage run --omit="*/test*" --branch -m pytest --timeout 20 -n auto --non-integration --ignore=tests/e2e_tests/ - name: Convert coverage results to JSON run: poetry run coverage json --omit="keep/providers/*" - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 with: fail_ci_if_error: false files: coverage.json verbose: true ================================================ FILE: .github/workflows/test-workflow-examples.yml ================================================ name: Test workflow examples on: push: paths: - 'keep/providers/**' - 'examples/workflows/**' - 'keep-ui/entities/workflows/model/yaml.schema.ts' - 'keep-ui/scripts/validate-workflow-examples.ts' pull_request: paths: - 'keep/providers/**' - 'examples/workflows/**' - 'keep-ui/entities/workflows/model/yaml.schema.ts' - 
'keep-ui/scripts/validate-workflow-examples.ts' workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.head_ref }}-${{ github.job }} cancel-in-progress: true env: NODE_VERSION: 20 PYTHON_VERSION: 3.11 STORAGE_MANAGER_DIRECTORY: /tmp/storage-manager jobs: test-workflow-examples: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - uses: chartboost/ruff-action@v1 with: src: "./keep" - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Poetry uses: snok/install-poetry@v1 with: virtualenvs-create: true virtualenvs-in-project: true - name: cache deps id: cache-deps uses: actions/cache@v4.2.0 with: path: .venv key: pydeps-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies using poetry run: poetry install --no-interaction --no-root --with dev # Save list of providers to providers_list.json, because we don't have backend endpoint to get it - name: Save providers list run: | PYTHONPATH="${{ github.workspace }}" poetry run python ./scripts/save_providers_list.py - name: Set up Node.js ${{ env.NODE_VERSION }} uses: actions/setup-node@v3 with: node-version: ${{ env.NODE_VERSION }} cache: 'npm' cache-dependency-path: keep-ui/package-lock.json - name: Install dependencies working-directory: keep-ui run: npm ci - name: Run workflow examples validation working-directory: keep-ui run: npm run test:workflow-examples ================================================ FILE: .gitignore ================================================ # .DS_STORE .DS_Store **/.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # .csv files *.csv # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # 
before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.lcov coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ # vscode .vscode/ # keep configuration file keep.yaml .keep.yaml providers.yaml .vercel keepstate.json # keep single tenant id e1faa321-35df-486b-8fa8-3601ee714011* # sqlite db *.sqlite3 state/* .terraform* examples/alerts/dd.yml keep-ui/node_modules keep-ui/node_modules/* cov.xml keep.db keepdd.db RANDOM_USER_ID storage # otel files tempo-data/ # docs docs/node_modules/ oauth2.cfg scripts/automatic_extraction_rules.py playwright_dump_*.html playwright_dump_*.png playwright_dump_*.txt playwright_dump_*.json ee/experimental/ai_temp/* ,e!ee/experimental/ai_temp/.gitkeep oauth2.cfg scripts/keep_slack_bot.py *.db providers_cache.json providers_list.json workflow-yaml-json-schema.json tests/provision/* !tests/provision/workflows* grafana/* !grafana/provisioning/ !grafana/dashboards/ keep/providers/grafana_provider/grafana/png/* topology.sh posthog.py ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: local hooks: - id: black name: black entry: black language: system types: [python] require_serial: true - id: end-of-file-fixer name: Fix 
End of Files entry: end-of-file-fixer language: system types: [text] stages: [commit, push, manual] - id: isort name: isort entry: isort require_serial: true language: system types_or: [cython, pyi, python] args: ["--filter-files", "--profile", "black"] - id: trailing-whitespace name: Trim Trailing Whitespace entry: trailing-whitespace-fixer language: system types: [text] stages: [commit, push, manual] - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.1.6 hooks: # Run the linter. - id: ruff args: [--fix] - repo: https://github.com/compilerla/conventional-pre-commit rev: v2.1.1 hooks: - id: conventional-pre-commit stages: [commit-msg] args: [] # optional: list of Conventional Commits types to allow e.g. [feat, fix, ci, chore, test] - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.0.3 hooks: - id: prettier types_or: [javascript, jsx, ts, tsx, json, yaml, css, scss, html, markdown] args: [--write] ================================================ FILE: .python-version ================================================ 3.11.1 ================================================ FILE: CHANGELOG.md ================================================ # CHANGELOG {% if context.history.unreleased | length > 0 %} {# UNRELEASED #} ## Unreleased {% for type_, commits in context.history.unreleased | dictsort %} ### {{ type_ | capitalize }} {% for commit in commits %}{% if type_ != "unknown" %} * {{ commit.commit.message.rstrip() }} ([`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }})) {% else %} * {{ commit.commit.message.rstrip() }} ([`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }})) {% endif %}{% endfor %}{% endfor %} {% endif %} {# RELEASED #} {% for version, release in context.history.released.items() %} ## {{ version.as_tag() }} ({{ release.tagged_date.strftime("%Y-%m-%d") }}) {% for type_, commits in release["elements"] | dictsort %} ### {{ type_ | capitalize }} {% for commit in 
commits %}{% if type_ != "unknown" %} * {{ commit.commit.message.rstrip() }} ([`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }})) {% else %} * {{ commit.commit.message.rstrip() }} ([`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }})) {% endif %}{% endfor %}{% endfor %}{% endfor %} ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Keep We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's: - Reporting a bug - Discussing the current state of the code - Submitting a fix - Proposing new features - Becoming a maintainer ## We Develop with GitHub We use GitHub to host code, to track issues and feature requests, as well as to accept pull requests. ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. Issue that pull request! ## Any contributions you make will be under the MIT Software License In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. Feel free to contact the maintainers if that's a concern. ## Report bugs using GitHub's [issues](https://github.com/keephq/keep/issues) We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/keephq/keep/issues/new); it's that easy!
**Great Bug Reports** tend to have: - A quick summary and/or background - Steps to reproduce - Be specific! - Give sample code if you can. - What you expected would happen - What actually happens - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) People *love* thorough bug reports. I'm not even kidding. ## Use a Consistent Coding Style Follow PEP8, use `black` for formatting and `isort` to sort imports. ## License By contributing, you agree that your contributions will be licensed under its MIT License. ## References This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) ================================================ FILE: LICENSE ================================================ Copyright (c) 2024 Keep Portions of this software are licensed as follows: * All content that resides under the "ee/" directory of this repository, if that directory exists, is licensed under the license defined in "ee/LICENSE". * Content outside of the above mentioned directories or restrictions above is available under the "MIT" license as defined below. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

The open-source AIOps and alert management platform


Single pane of glass, alert deduplication, enrichment, filtering and correlation, bi-directional integrations, workflows, dashboards.

Docs · Try it out · Report Bug · Book a Demo · Website

Sneak preview screenshot

- 🔍 **Single pane of glass** - Best-in-class customizable UI for all your alerts and incidents - 🛠️ **Swiss Army Knife for alerts** - Deduplication, correlation, filtering and enrichment - 🔄 **Deep integrations** - Bi-directional syncs with monitoring tools, customizable workflows - ⚡ **[Automation](#workflows)** - GitHub Actions for your monitoring tools - 🤖 **AIOps 2.0** - AI-powered correlation and summarization
> See full [platform documentation](https://docs.keephq.dev).
## Supported Integrations > View the full list in our [documentation](https://docs.keephq.dev/providers/documentation) > Missing a provider? [Submit a new provider request](https://github.com/keephq/keep/issues/new?assignees=&labels=provider&projects=&template=new_provider_request.md&title=) and we'll add it quickly! ### AI Backends for Enrichments, Correlations and Incident Context Gathering
Anthropic
Anthropic
OpenAI
OpenAI
DeepSeek
DeepSeek
Ollama
Ollama
LlamaCPP
LlamaCPP
Grok
Grok
Gemini
Gemini
### Observability Tools
AppDynamics
AppDynamics
Axiom
Axiom
Azure Monitoring
Azure Monitoring
Centreon
Centreon
Checkmk
Checkmk
Cilium
Cilium
Checkly
Checkly
CloudWatch
CloudWatch
Coralogix
Coralogix
Dash0
Dash0
Datadog
Datadog
Dynatrace
Dynatrace
Elastic
Elastic
GCP Monitoring
GCP Monitoring
Grafana
Grafana
Grafana Loki
Grafana Loki
Graylog
Graylog
Icinga2
Icinga2
Kibana
Kibana
LibreNMS
LibreNMS
NetBox
NetBox
Netdata
Netdata
New Relic
New Relic
OpenSearch Serverless
OpenSearch Serverless
Parseable
Parseable
Pingdom
Pingdom
Prometheus
Prometheus
Rollbar
Rollbar
Sentry
Sentry
SignalFX
SignalFX
OpenObserve
OpenObserve
Site24x7
Site24x7
Splunk
Splunk
StatusCake
StatusCake
SumoLogic
SumoLogic
ThousandEyes
ThousandEyes
UptimeKuma
UptimeKuma
VictoriaLogs
VictoriaLogs
VictoriaMetrics
VictoriaMetrics
Wazuh
Wazuh
Zabbix
Zabbix
### Databases & Data Warehouses
BigQuery
BigQuery
ClickHouse
ClickHouse
Databend
Databend
MongoDB
MongoDB
MySQL
MySQL
PostgreSQL
PostgreSQL
Snowflake
Snowflake
### Communication Platforms
Discord
Discord
Google Chat
Google Chat
Mailgun
Mailgun
Mattermost
Mattermost
Ntfy.sh
Ntfy.sh
Pushover
Pushover
Resend
Resend
SendGrid
SendGrid
Slack
Slack
SMTP
SMTP
Telegram
Telegram
Twilio
Twilio
Teams
Teams
Zoom
Zoom
Zoom Chat
Zoom Chat
### Incident Management
Grafana Incident
Grafana Incident
Grafana OnCall
Grafana OnCall
Ilert
Ilert
Incident.io
Incident.io
AWS Incident Manager
AWS Incident Manager
OpsGenie
OpsGenie
PagerDuty
PagerDuty
Pagertree
Pagertree
SIGNL4
SIGNL4
Squadcast
Squadcast
Zenduty
Zenduty
Flashduty
Flashduty
### Ticketing Tools
Asana
Asana
GitHub
GitHub
GitLab
GitLab
Jira
Jira
Linear
Linear
LinearB
LinearB
Microsoft Planner
Microsoft Planner
Monday
Monday
Redmine
Redmine
ServiceNow
ServiceNow
Trello
Trello
YouTrack
YouTrack
### Container Orchestration Platforms
Azure AKS
Azure AKS
ArgoCD
ArgoCD
Flux CD
Flux
GKE
GKE
Kubernetes
Kubernetes
OpenShift
OpenShift
### Data Enrichment
Bash
Bash
OpenAI
OpenAI
Python
Python
QuickChart
QuickChart
SSH
SSH
Webhook
Webhook
### Workflow Orchestration
Airflow
Airflow
### Queues
Amazon SQS
Amazon SQS
Kafka
Kafka
## Workflows Keep is GitHub Actions for your monitoring tools. A Keep Workflow is a declarative YAML file that automates your alert and incident management. Each workflow consists of: - **Triggers** - What starts the workflow (alerts, incidents, schedule or manual) - **Steps** - Read or fetch data (enrichment, context) - **Actions** - Execute operations (update tickets, send notifications, restart servers) Here's a simple workflow that handles every `critical` alert from `sentry` for the `payments` and `ftp` services: it notifies the payments team on Slack and creates a Jira ticket for `ftp` alerts. For more workflows, see [here](https://github.com/keephq/keep/tree/main/examples/workflows). ```yaml workflow: id: sentry-alerts description: create ticket alerts for critical alerts from sentry triggers: - type: alert # customize the filter to run only on critical alert from sentry filters: - key: source value: sentry - key: severity value: critical # regex to match specific services - key: service value: r"(payments|ftp)" actions: - name: send-slack-message-team-payments # if the alert is on the payments service, slack the payments team if: "'{{ alert.service }}' == 'payments'" provider: type: slack # control which Slack configuration you want to use config: " {{ providers.team-payments-slack }} " # customize the alert message with context from {{ alert }} or any other {{ step }} with: message: | "A new alert from Sentry: Alert: {{ alert.name }} - {{ alert.description }} {{ alert}}" - name: create-jira-ticket-oncall-board # control the workflow flow with "if" and "foreach" statements if: "'{{ alert.service }}' == 'ftp' and not '{{ alert.ticket_id }}'" provider: type: jira config: " {{ providers.jira }} " with: board_name: "Oncall Board" custom_fields: customfield_10201: "Critical" issuetype: "Task" # customize the summary summary: "{{ alert.name }} - {{ alert.description }} (created by Keep)" description: | "This ticket was created by Keep. 
Please check the alert details below: {code:json} {{ alert }} {code}" # enrich the alerts with more context. from now on, the alert will be assigned with the ticket id, type and url enrich_alert: - key: ticket_type value: jira - key: ticket_id value: results.issue.key - key: ticket_url value: results.ticket_url ``` ## Enterprise Ready - **Developer First** - Modern REST APIs, native SDKs, and comprehensive documentation for seamless integration - **[Enterprise Security](https://docs.keephq.dev/deployment/authentication/overview)** - Full authentication support (SSO, SAML, OIDC, LDAP) with granular access control (RBAC, ABAC) and team management - **Flexible Deployment** - Deploy on-premises or in air-gapped environments with cloud-agnostic architecture - **[Production Scale](https://docs.keephq.dev/deployment/stress-testing)** - High availability, performance-tested infrastructure supporting horizontal scaling for enterprise workloads ## Getting Started > Need help? Can't find your environment listed? Reach out on Slack and we'll help you quickly. Keep can run in various environments and configurations. The easiest way to start is with Keep's Docker Compose. - Running Keep [locally](https://docs.keephq.dev/development/getting-started). - Running Keep on [Kubernetes](https://docs.keephq.dev/deployment/kubernetes/installation). - Running Keep with [Docker](https://docs.keephq.dev/deployment/docker). - Running Keep on [AWS ECS](https://docs.keephq.dev/deployment/ecs). - Running Keep on [OpenShift](https://docs.keephq.dev/deployment/kubernetes/openshift). ## 🫵 Keepers ### Top Contributors A special thanks to our top contributors who help us make Keep great. You are more than awesome! - [Furkan](https://github.com/pehlicd) - [Asharon](https://github.com/asharonbaltazar) Want to become a top contributor? Join our Slack and DM Tal, Shahar, or Furkan. 
### Contributors Thank you for contributing and continuously making Keep better, you're awesome 🫶 ================================================ FILE: docker/Dockerfile.api ================================================ FROM python:3.13.5-alpine as base # Install bash and runtime dependencies for grpc RUN apk add --no-cache bash libstdc++ ENV PYTHONFAULTHANDLER=1 \ PYTHONHASHSEED=random \ PYTHONUNBUFFERED=1 # THIS IS FOR DEBUGGING PURPOSES # RUN apt-get update && \ # apt-get install -y --no-install-recommends \ # iproute2 \ # net-tools \ # procps && \ # rm -rf /var/lib/apt/lists/* RUN addgroup -g 1000 keep && \ adduser -u 1000 -G keep -s /bin/sh -D keep WORKDIR /app FROM base as builder # Install build dependencies for Alpine RUN apk add --no-cache \ gcc \ g++ \ musl-dev \ libffi-dev \ openssl-dev \ postgresql-dev \ mysql-client \ build-base \ linux-headers \ git ENV PIP_DEFAULT_TIMEOUT=100 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ PIP_NO_CACHE_DIR=1 \ POETRY_VERSION=1.3.2 RUN pip install "poetry==$POETRY_VERSION" RUN python -m venv /venv COPY pyproject.toml poetry.lock ./ RUN poetry export -f requirements.txt --output requirements.txt --without-hashes --only main && \ /venv/bin/python -m pip install --upgrade -r requirements.txt && \ pip uninstall -y poetry COPY keep keep COPY ee keep/ee COPY examples examples COPY keep-ui/public/icons/unknown-icon.png unknown-icon.png RUN /venv/bin/pip install --use-deprecated=legacy-resolver . 
&& \ rm -rf /root/.cache/pip && \ find /venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \ find /venv -type f -name "*.pyc" -delete 2>/dev/null || true FROM base as final ENV PATH="/venv/bin:${PATH}" ENV VIRTUAL_ENV="/venv" ENV EE_PATH="ee" COPY --from=builder /venv /venv COPY --from=builder /app/examples /examples COPY --from=builder /app/unknown-icon.png unknown-icon.png # as per Openshift guidelines, https://docs.openshift.com/container-platform/4.11/openshift_images/create-images.html#use-uid_create-images RUN chgrp -R 0 /app && chmod -R g=u /app && \ chown -R keep:keep /app && \ chown -R keep:keep /venv USER keep ENTRYPOINT ["/venv/lib/python3.13/site-packages/keep/entrypoint.sh"] CMD ["gunicorn", "keep.api.api:get_app", "--bind" , "0.0.0.0:8080" , "--workers", "4" , "-k" , "uvicorn.workers.UvicornWorker", "-c", "/venv/lib/python3.13/site-packages/keep/api/config.py", "--preload"] ================================================ FILE: docker/Dockerfile.cli ================================================ FROM python:3.11.6-slim as base ENV PYTHONFAULTHANDLER=1 \ PYTHONHASHSEED=random \ PYTHONUNBUFFERED=1 WORKDIR /app FROM base as builder ENV PIP_DEFAULT_TIMEOUT=100 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ PIP_NO_CACHE_DIR=1 \ POETRY_VERSION=1.3.2 RUN pip install "poetry==$POETRY_VERSION" RUN python -m venv /venv COPY . . 
RUN poetry build && /venv/bin/pip install --use-deprecated=legacy-resolver dist/*.whl FROM base as final ENV PATH="/venv/bin:${PATH}" ENV VIRTUAL_ENV="/venv" COPY --from=builder /venv /venv ================================================ FILE: docker/Dockerfile.dev.api ================================================ FROM python:3.11.6-slim as base ENV PYTHONFAULTHANDLER=1 \ PYTHONHASHSEED=random \ PYTHONUNBUFFERED=1 WORKDIR /app # Creating a virtual environment and installing dependencies ENV PIP_DEFAULT_TIMEOUT=100 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ PIP_NO_CACHE_DIR=1 \ POETRY_VERSION=1.3.2 RUN pip install "poetry==$POETRY_VERSION" RUN python -m venv /venv COPY pyproject.toml ./ RUN . /venv/bin/activate && poetry install --no-root COPY keep keep COPY ee keep/ee # Setting the virtual environment path ENV PYTHONPATH="/app:${PYTHONPATH}" ENV PATH="/venv/bin:${PATH}" ENV VIRTUAL_ENV="/venv" ENV POSTHOG_DISABLED="true" ENTRYPOINT ["/app/keep/entrypoint.sh"] CMD ["gunicorn", "keep.api.api:get_app", "--bind" , "0.0.0.0:8080" , "--workers", "1" , "-k" , "uvicorn.workers.UvicornWorker", "-c", "./keep/api/config.py", "--reload"] ================================================ FILE: docker/Dockerfile.dev.ui ================================================ # Use node alpine as it's a small node image FROM node:alpine # Create the directory on the node image # where our Next.js app will live RUN mkdir -p /app # Set /app as the working directory WORKDIR /app # Copy package.json and package-lock.json # to the /app working directory COPY keep-ui/package*.json /app/ # Copy the rest of our Next.js folder into /app COPY ./keep-ui/ /app # Install dependencies in /app RUN npm install # Install next globally and create a symlink RUN npm install -g next RUN ln -s /usr/local/lib/node_modules/next/dist/bin/next /usr/local/bin/next || echo "next binary already linked to bin" # Ensure port 3000 is accessible to our system EXPOSE 3000 CMD ["npm", "run", "dev"] 
================================================ FILE: docker/Dockerfile.ui ================================================ FROM node:20-alpine AS base # Install dependencies only when needed FROM base AS deps # Check https://github.com/nodejs/docker-node/tree/b4117f9333da4138b03a546ec926ef50a31506c3#nodealpine to understand why libc6-compat might be needed. RUN apk add --no-cache libc6-compat WORKDIR /app # Install dependencies based on the preferred package manager COPY package.json package-lock.json ./ RUN npm ci --noproxy registry.npmjs.org --maxsockets 1 # Rebuild the source code only when needed FROM base AS builder WORKDIR /app COPY --from=deps /app/node_modules ./node_modules COPY . . # Next.js collects completely anonymous telemetry data about general usage. # Learn more here: https://nextjs.org/telemetry # Uncomment the following line in case you want to disable telemetry during the build. ENV NEXT_TELEMETRY_DISABLED 1 # If using npm comment out above and use below instead ENV API_URL http://localhost:8080 RUN NODE_OPTIONS=--max-old-space-size=8192 npm run build # Production image, copy all the files and run next FROM base AS runner ARG GIT_COMMIT_HASH=local ARG KEEP_VERSION=local ARG KEEP_INCLUDE_SOURCES=false WORKDIR /app # Inject the git commit hash into the build # This is being injected from the build script ENV GIT_COMMIT_HASH=${GIT_COMMIT_HASH} ENV KEEP_VERSION=${KEEP_VERSION} ENV KEEP_INCLUDE_SOURCES=${KEEP_INCLUDE_SOURCES} ENV NODE_ENV production # Uncomment the following line in case you want to disable telemetry during runtime. 
ENV NEXT_TELEMETRY_DISABLED 1 RUN addgroup --system --gid 1001 nodejs RUN adduser --system --uid 1001 nextjs COPY --from=builder /app/public ./public # Automatically leverage output traces to reduce image size # https://nextjs.org/docs/advanced-features/output-file-tracing COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./ COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static COPY entrypoint.sh /app/entrypoint.sh # as per Openshift guidelines, https://docs.openshift.com/container-platform/4.11/openshift_images/create-images.html#use-uid_create-images RUN chgrp -R 0 /app && chmod -R g=u /app USER nextjs EXPOSE 3000 ENV PORT 3000 ENV POSTHOG_KEY=phc_muk9qE3TfZsX3SZ9XxX52kCGJBclrjhkP9JxAQcm1PZ ENV POSTHOG_HOST=https://ingest.keephq.dev ENV PUSHER_HOST=localhost ENV PUSHER_PORT=6001 ENV PUSHER_APP_KEY=keepappkey ENV NEXT_PUBLIC_SENTRY_DSN=https://0d4d59e3105ffe8afa27dcb95a222009@o4505515398922240.ingest.us.sentry.io/4508258058764288 ENTRYPOINT ["/app/entrypoint.sh"] ================================================ FILE: docker-compose-with-arq.yml ================================================ services: keep-frontend: extends: file: docker-compose.common.yml service: keep-frontend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-ui environment: - AUTH_TYPE=NO_AUTH - API_URL=http://keep-backend:8080 volumes: - ./state:/state depends_on: - keep-backend keep-backend: extends: file: docker-compose.common.yml service: keep-backend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-api environment: - AUTH_TYPE=NO_AUTH - REDIS=true - REDIS_HOST=keep-arq-redis - REDIS_PORT=6379 volumes: - ./state:/state depends_on: - keep-arq-redis keep-websocket-server: extends: file: docker-compose.common.yml service: keep-websocket-server-common keep-arq-redis: image: redis/redis-stack ports: - "6379:6379" - "8081:8001" keep-arq-dashboard: image: us-central1-docker.pkg.dev/keephq/keep/keep-arq-dashboard ports: - "8082:8000" entrypoint: 
- "uvicorn" - "--host" - "0.0.0.0" - "arq_dashboard:app" environment: - ARQ_DASHBOARD_REDIS_URL=redis://keep-arq-redis:6379 ================================================ FILE: docker-compose-with-auth.yml ================================================ services: keep-frontend: extends: file: docker-compose.common.yml service: keep-frontend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-ui environment: - AUTH_TYPE=DB - NEXTAUTH_SECRET=verysecretkey - API_URL=http://keep-backend:8080 volumes: - ./state:/state depends_on: - keep-backend keep-backend: extends: file: docker-compose.common.yml service: keep-backend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-api environment: - AUTH_TYPE=DB - KEEP_JWT_SECRET=verysecretkey - KEEP_DEFAULT_USERNAME=keep - KEEP_DEFAULT_PASSWORD=keep volumes: - ./state:/state keep-websocket-server: extends: file: docker-compose.common.yml service: keep-websocket-server-common ================================================ FILE: docker-compose-with-otel.yaml ================================================ services: loki: image: grafana/loki:latest profiles: - otel ports: - "3100:3100" command: ["-config.file=/etc/loki/local-config.yaml"] tempo: image: grafana/tempo:latest profiles: - otel command: ["-config.file=/etc/tempo.yaml"] volumes: - ./otel-shared/tempo.yaml:/etc/tempo.yaml - ./tempo-data:/tmp/tempo ports: - "14268:14268" # jaeger ingest - "3200:3200" # tempo - "9095:9095" # tempo grpc - "4317:4317" # otlp grpc - "4318:4318" # otlp http - "9411:9411" # zipkin prometheus: image: prom/prometheus:latest profiles: - otel command: - --config.file=/etc/prometheus.yaml - --web.enable-remote-write-receiver - --enable-feature=exemplar-storage volumes: - ./otel-shared/prometheus.yaml:/etc/prometheus.yaml ports: - "9090:9090" alertmanager: image: prom/alertmanager profiles: - otel container_name: alertmanager volumes: - ./otel-shared/alertmanager.yml:/etc/alertmanager/alertmanager.yml command: - 
"--config.file=/etc/alertmanager/alertmanager.yml" grafana: image: grafana/grafana:10.0.3 profiles: - otel depends_on: - loki - tempo - prometheus volumes: - ./otel-shared/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml environment: - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_DISABLE_LOGIN_FORM=false - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor ports: - "3001:3000" # OpenTelemetry collector. Make sure you set USERID and GOOGLE_APPLICATION_CREDENTIALS # environment variables for your container to authenticate correctly otel-collector: image: otel/opentelemetry-collector-contrib:0.81.0 profiles: - otel ports: - "9100:9100" depends_on: - tempo - loki volumes: - ./otel-shared/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml keep-frontend-dev: extends: file: docker-compose.common.yml service: keep-frontend-common environment: - API_URL=http://keep-backend-dev:8080 build: dockerfile: docker/Dockerfile.dev.ui volumes: - ./keep-ui:/app - /app/node_modules - /app/.next depends_on: - keep-backend-dev keep-backend-dev: extends: file: docker-compose.common.yml service: keep-backend-common build: dockerfile: docker/Dockerfile.dev.api environment: - OTEL_SERVICE_NAME=keephq - OTLP_ENDPOINT=http://otel-collector:4317 - METRIC_OTEL_ENABLED=true volumes: - .:/app - ./state:/state keep-websocket-server: extends: file: docker-compose.common.yml service: keep-websocket-server-common log_collector: image: timberio/vector:0.32.2-debian profiles: - otel volumes: - ./otel-shared/vector.toml:/etc/vector/vector.toml - /var/run/docker.sock:/var/run/docker.sock volumes: certs: driver: local esdata01: driver: local kibanadata: driver: local db_data: ================================================ FILE: docker-compose.common.yml ================================================ services: keep-frontend-common: ports: - "3000:3000" environment: - NEXTAUTH_SECRET=secret - NEXTAUTH_URL=http://localhost:3000 - 
NEXT_PUBLIC_API_URL=http://localhost:8080 - POSTHOG_KEY=phc_muk9qE3TfZsX3SZ9XxX52kCGJBclrjhkP9JxAQcm1PZ - POSTHOG_HOST=https://ingest.keephq.dev - NEXT_PUBLIC_SENTRY_DSN=https://0d4d59e3105ffe8afa27dcb95a222009@o4505515398922240.ingest.us.sentry.io/4508258058764288 - PUSHER_HOST=localhost - PUSHER_PORT=6001 - PUSHER_APP_KEY=keepappkey keep-backend-common: ports: - "8080:8080" environment: - PORT=8080 - SECRET_MANAGER_TYPE=FILE - SECRET_MANAGER_DIRECTORY=/state - DATABASE_CONNECTION_STRING=sqlite:////state/db.sqlite3?check_same_thread=False - OPENAI_API_KEY=$OPENAI_API_KEY - PUSHER_APP_ID=1 - PUSHER_APP_KEY=keepappkey - PUSHER_APP_SECRET=keepappsecret - PUSHER_HOST=keep-websocket-server - PUSHER_PORT=6001 - USE_NGROK=false keep-websocket-server-common: image: quay.io/soketi/soketi:1.4-16-debian ports: - "6001:6001" - "9601:9601" environment: - SOKETI_USER_AUTHENTICATION_TIMEOUT=3000 - SOKETI_DEBUG=1 - SOKETI_DEFAULT_APP_ID=1 - SOKETI_DEFAULT_APP_KEY=keepappkey - SOKETI_DEFAULT_APP_SECRET=keepappsecret ================================================ FILE: docker-compose.dev.yml ================================================ services: keep-frontend-dev: extends: file: docker-compose.common.yml service: keep-frontend-common environment: - API_URL=http://keep-backend-dev:8080 - SENTRY_DISABLED=true build: dockerfile: docker/Dockerfile.dev.ui volumes: - ./keep-ui:/app - /app/node_modules - /app/.next depends_on: - keep-backend-dev keep-backend-dev: extends: file: docker-compose.common.yml service: keep-backend-common build: dockerfile: docker/Dockerfile.dev.api volumes: - .:/app - ./state:/state keep-websocket-server: extends: file: docker-compose.common.yml service: keep-websocket-server-common ================================================ FILE: docker-compose.yml ================================================ services: keep-frontend: extends: file: docker-compose.common.yml service: keep-frontend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-ui 
environment: - AUTH_TYPE=NO_AUTH - API_URL=http://keep-backend:8080 volumes: - ./state:/state depends_on: - keep-backend keep-backend: extends: file: docker-compose.common.yml service: keep-backend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-api environment: - AUTH_TYPE=NO_AUTH - PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus - KEEP_METRICS=true volumes: - ./state:/state keep-websocket-server: extends: file: docker-compose.common.yml service: keep-websocket-server-common grafana: image: grafana/grafana:latest profiles: - grafana ports: - "3001:3000" volumes: - ./grafana:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning - ./grafana/dashboards:/etc/grafana/dashboards environment: - GF_SECURITY_ADMIN_USER=admin - GF_SECURITY_ADMIN_PASSWORD=admin - GF_USERS_ALLOW_SIGN_UP=false depends_on: - prometheus prometheus: image: prom/prometheus:latest profiles: - grafana ports: - "9090:9090" volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml command: - "--config.file=/etc/prometheus/prometheus.yml" depends_on: - keep-backend ================================================ FILE: docs/README.md ================================================ How to run docs locally: ``` npm i -g mintlify mintlify dev ``` Read more: https://mintlify.com/docs/development ================================================ FILE: docs/alertevaluation/examples/victoriametricsmulti.mdx ================================================ --- title: "VictoriaMetrics Multi Alert Example" --- This example demonstrates a simple CPU usage multi-alert based on a metric: ```yaml workflow: # Unique identifier for this workflow id: query-victoriametrics-multi # Display name shown in the UI name: victoriametrics-multi-alert-example # Brief description of what this workflow does description: victoriametrics triggers: # This workflow can be triggered manually from the UI - type: manual steps: # Query VictoriaMetrics for CPU metrics - name: victoriametrics-step provider: # 
Use the VictoriaMetrics provider configuration config: "{{ providers.vm }}" type: victoriametrics with: # Query that returns the sum of CPU usage for each job # Example response: # [ # {'metric': {'job': 'victoriametrics'}, 'value': [1737808021, '0.022633333333333307']}, # {'metric': {'job': 'vmagent'}, 'value': [1737808021, '0.009299999999999998']} # ] query: sum(rate(process_cpu_seconds_total)) by (job) queryType: query actions: # Create an alert in Keep based on the query results - name: create-alert provider: type: keep with: # Only create alert if CPU usage is above threshold if: "{{ value.1 }} > 0.01 " # Alert must persist for 1 minute for: 1m # Use job label to create unique fingerprint for each alert fingerprint_fields: - labels.job alert: # Alert name includes the specific job name: "High CPU Usage on {{ metric.job }}" description: "CPU usage is high on the VM (created from VM metric)" # Set severity based on CPU usage thresholds: # > 0.9 = critical # > 0.7 = warning # else = info severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? 
"warning" : "info"' labels: # Job label is required for alert fingerprinting job: "{{ metric.job }}" # Additional context labels environment: production app: myapp service: api team: devops owner: alice ``` ================================================ FILE: docs/alertevaluation/examples/victoriametricssingle.mdx ================================================ --- title: "VictoriaMetrics Single Alert Example" --- This example demonstrates a simple CPU usage alert based on a metric: ```yaml # This workflow queries VictoriaMetrics metrics and creates alerts based on CPU usage workflow: # Unique identifier for this workflow id: query-victoriametrics # Display name shown in the UI name: victoriametrics-alert-example # Brief description of what this workflow does description: Monitors CPU usage metrics from VictoriaMetrics and creates alerts when thresholds are exceeded # Define how the workflow is triggered triggers: - type: manual # Can be triggered manually from the UI # Steps to execute in order steps: - name: victoriametrics-step provider: # Use VictoriaMetrics provider config defined in providers.vm config: "{{ providers.vm }}" type: victoriametrics with: # Query average CPU usage rate query: avg(rate(process_cpu_seconds_total)) queryType: query # Actions to take based on the query results actions: - name: create-alert provider: type: keep with: # Create alert if CPU usage exceeds threshold if: "{{ value.1 }} > 0.0040" alert: name: "High CPU Usage" description: "[Single] CPU usage is high on the VM (created from VM metric)" # Set severity based on CPU usage thresholds severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? 
"warning" : "info"' # Alert labels for filtering and routing labels: environment: production app: myapp service: api team: devops owner: alice ``` ================================================ FILE: docs/alertevaluation/overview.mdx ================================================ --- title: "Overview" --- The Keep Alert Evaluation Engine is a flexible system that enables you to create alerts based on any data source and define evaluation rules. Unlike traditional monitoring solutions that are tied to specific metrics, Keep's engine allows you to combine data from multiple sources and apply complex logic to determine when and how alerts should be triggered. ## Core Features ### Generic Data Source Support - Query any data source (databases, APIs, metrics systems) - Combine multiple data sources in a single alert rule - Apply custom transformations to the data ### Flexible Alert Evaluation - Define custom conditions using templated expressions - Support for complex boolean logic and mathematical operations - State management for alert transitions (pending->firing->resolved) - Deduplication and alert instance tracking ### Customizable Alert Definition - Full control over alert metadata (name, description, severity) - Dynamic labels based on evaluation context - Template support for all alert fields - Custom fingerprinting for alert grouping ## Core Components ### Alert States - **Pending**: Initial state when alert condition is met (relevant only if `for` supplied) - **Firing**: Active alert that has met its duration condition - **Resolved**: Alert that is no longer active ### Alert Rule Components 1. **Data Collection**: Query steps to gather data from any source 2. **Condition (`if`)**: Expression that determines when to create/update an alert 3. **Duration (`for`)**: Optional time period the condition must be true before firing 4. 
**Alert Definition**: Complete control over how the alert looks and behaves: - Name and description - Severity levels - Labels for routing - Custom fields and annotations ### State Management - **Fingerprinting**: Unique identifier for alert deduplication and state tracking - **Keep-Firing**: Control how long alerts remain active - **State Transitions**: Rules for how alerts move between states ## Examples The following examples demonstrate different ways to use the alert evaluation engine: - [Single Metric Alert](/alertevaluation/examples/victoriametricssingle) - Basic example showing metrics-based alerting - [Multiple Metrics Alert](/alertevaluation/examples/victoriametricsmulti) - Advanced example with multiple alert instances ================================================ FILE: docs/alerts/actionmenu.mdx ================================================ --- title: "Action Menu" --- The Action Menu in Keep provides quick access to common actions that can be performed on alerts. This menu enables teams to efficiently manage and interact with alerts directly from the table. ### (1) Run Workflow Trigger predefined workflows directly from the Action Menu. This allows automation of actions such as escalating alerts or notifying specific teams. ### (2) Create a New Workflow Quickly create a new workflow tailored to the selected alert. This is useful for handling unique cases that require a custom response. ### (3) View Alert History Access the full history of the alert, including changes to its status, comments, and any actions performed. This provides a clear timeline of the alert's lifecycle. ### (4) Manually Enrich Alert Add custom metadata or details to an alert manually. This can include additional context or information that assists with resolution. ### (5) Self Assign Assign the selected alert to yourself. This is ideal for team members who are taking ownership of specific alerts. 
### (6) View Alert Open the alert details in the sidebar or dedicated alert view for a deeper dive into its metadata and context. ### (7) Source-Specific Actions Perform actions that are specific to the source of the alert. For example, linking directly to the monitoring tool or executing source-specific workflows. ### (8) Dismiss Alert Mark the alert as dismissed to indicate that no further action is required. This helps in managing and decluttering the alert table. ### (9) Change Status Update the status of the alert (e.g., from "firing" to "acknowledged"). This keeps the team informed about the current state of the alert. --- ================================================ FILE: docs/alerts/overview.mdx ================================================ --- title: "Overview" --- **Alert Management** empowers teams to effectively manage, monitor, and act on critical alerts. With a robust and user-friendly interface, Keep allows users to gain deep insights into their alerts, filter through large volumes of data, and take swift actions to maintain system health. Everything related to Alert Management can be customized: 1. **Alert table** - view and manage the alerts. 2. **Search Bar** - use CEL to filter alerts; filters can be saved as "Customized Presets". 3. **Facets** - slice and dice alerts. 4. **Columns and Time** - customize columns and theme for your preset. ================================================ FILE: docs/alerts/presets.mdx ================================================ --- title: "Customized Presets" --- You can think of a preset like a "Slack Channel" for your alerts - a logical container to follow only alerts that matter to you. With Keep's introduction of CEL (Common Expression Language) for alert filtering, users gain the flexibility to define more complex and precise alert filtering logic. This feature allows the creation of customizable filters using CEL expressions to refine alert visibility based on specific criteria. ## How It Works 1.
**CEL Expression Creation**: Users craft CEL expressions that define the filtering criteria for alerts. 2. **Preset Definition**: These expressions can be saved as presets for easy application to different alert streams. 3. **Alert Filtering**: When applied, the CEL expressions evaluate each alert against the defined criteria, filtering the alert stream in real-time. ## Creating a CEL Expression There are two ways of creating a CEL expression in Keep. ### Manually creating CEL query Use the [CEL Language Definition](https://github.com/google/cel-spec/blob/master/doc/langdef.md) documentation to better understand the capabilities of the Common Expression Language. This is an example of how to query all the alerts that came from `Sentry`: If the CEL syntax you typed in is invalid, an error message will show up (in this case, we used invalid `''` instead of `""`): ### Importing from an SQL query 1. Click on the "Import from SQL" button. 2. Write/Paste your SQL query and hit the "Convert to CEL" button. Which in turn will generate and apply a valid CEL query: ## Save Presets You can save your CEL queries into a `Preset` using the "Save current filter as a view" button. You can name your `Preset` and configure whether it is "Private" (only the creating user will see this Preset) or available account-wide. The `Preset` will then be created and available for you to quickly navigate to and use. ## Practical Example For instance, a user could create a CEL expression to filter alerts by severity and service, such as `severity == "critical" && service.contains("database")`, ensuring only critical alerts from database services are displayed. ## Best Practices - **Specificity in Expressions**: Craft expressions that precisely target the desired alerts to avoid filtering out relevant alerts. - **Presets Management**: Regularly review and update your presets to align with evolving alerting needs.
- **Testing Expressions**: Before applying, test CEL expressions to ensure they correctly filter the desired alerts. ## Useful Links - [Common Expression Language](https://github.com/google/cel-spec?tab=readme-ov-file) - [CEL Language Definition](https://github.com/google/cel-spec/blob/master/doc/langdef.md) ================================================ FILE: docs/alerts/sidebar.mdx ================================================ --- title: "Alert Sidebar" --- The Alert Sidebar in Keep provides a detailed view of a selected alert, offering in-depth context and information to aid in alert management and resolution. This feature is designed to give users a comprehensive understanding of the alert without leaving the main interface. ### (1) Alert Name Displays the name of the alert, which typically summarizes the issue or event being reported. This is the primary identifier for the alert. ### (2) Alert Related Service Shows the service associated with the alert. This helps teams quickly understand which part of the infrastructure or application is affected. ### (3) Alert Source Indicates the source of the alert, such as the monitoring tool or system that generated it (e.g., Prometheus, Datadog). This provides context on where the alert originated. ### (4) Alert Description A detailed description of the alert, including specifics about the issue. This section helps provide a deeper understanding of what triggered the alert. ### (5) Alert Fingerprint A unique identifier for the alert. The fingerprint is used to correlate alerts and track their lifecycle across systems. ### (6) Alert Timeline Displays a chronological history of the alert, including when it was created, acknowledged, updated, or resolved. The timeline provides insights into how the alert has been managed. ### (7) Alert Topology View Offers a visual representation of the alert's impact on the system's topology. 
This view helps identify affected components and their relationships to other parts of the infrastructure. --- ================================================ FILE: docs/alerts/sound.mdx ================================================ --- title: "Sound Notifications" --- Sound notifications ensure you never miss important updates or alerts. ## How It Works 1. **Preset Notifications**: Mark a preset as "noisy," and any alert linked to it will play a sound. Alternatively, set individual alerts as `isNoisy=true` to trigger sounds through linked presets. 2. **Real-Time Alerts**: With WebSocket enabled, alerts arrive instantly. The server notifies the browser, which retrieves and processes new alerts immediately. ## Who Hears Notifications? Users with Keep open in their browser and the noisy preset visible in their navigation bar. Presets can be filtered to control notifications. ### Customizing 1. **Change the Default Sound**: Replace the `alert.mp3` file with a custom audio file of your choice. --- ================================================ FILE: docs/alerts/table.mdx ================================================ --- title: "Alert Table" --- The Alert Table is the central interface for viewing and managing alerts in Keep. It provides a comprehensive view of all alerts with powerful filtering, sorting, and interaction capabilities. ### (1) Columns Columns in the alert table can be customized to display the most relevant data. Users can select which columns to display and reorder them using drag-and-drop functionality. ### (2) Alert Bulk Action Easily select one or more alerts for bulk actions. Actions include options like "assign to incident," "dismiss," or other available workflows. ### (3) Alert Actions Menu The actions menu provides quick access to various operations for each alert, such as linking to incidents, creating tickets, or escalating. ### (4) Alert Link Each alert includes a badge that links directly to the original alert in the monitoring tool. 
Clicking this badge opens the alert in its source system for further investigation. ### (5) Alert Ticket You can assign a ticket to an alert. If an alert is associated with a ticket, a ticket badge will be displayed. Clicking on this badge navigates directly to the assigned ticket in the ticketing tool. ### (6) Alert Comment Users can add comments to any alert to provide additional context or share insights with team members. This improves collaboration and ensures all relevant information is available. ### (7) Alert Related Workflows View and trigger related workflows for an alert directly from the table. This allows seamless integration with predefined processes like escalation, suppression, or custom automation. ### (8) Sorting The table supports sorting by any column using the "sort" icon. This makes it easy to prioritize or organize alerts based on specific criteria. --- ================================================ FILE: docs/applications/github.mdx ================================================ --- title: "GitHub Application" sidebarTitle: "GitHub" description: "The Keep GitHub Application is a powerful tool that enhances your workflow by monitoring file changes under the parent `.keep/` directory in your repositories' pull requests. It automates the process of generating AI-generated alerts from plain English and allows you to seamlessly deploy these alerts to your provider using comments." --- ## Getting Started To start using the Keep GitHub Application, follow these simple steps: 1. Sign up and log in to **[Keep's platform](https://platform.keephq.dev)**. 2. Install the **Keep GitHub Application** either through the onboarding screen or by visiting **[this link](https://github.com/apps/keephq)**. The installation process is straightforward and user-friendly. 3. Connect your preferred provider, such as Datadog, by linking it to Keep's platform. This step allows Keep to seamlessly generate and deploy alerts to your chosen provider. 4.
You are now ready to go! The Keep GitHub Application is successfully integrated into your GitHub workflow. ## How does it work? The Keep GitHub Application operates seamlessly in the background, ensuring that you stay informed about relevant changes in your repositories. Whenever a pull request is opened or updated, the application monitors the files under the .keep/ directory. Once a change is detected, the GitHub application sends an HTTP request to Keep's API smart AI layer. The AI layer analyzes the content of the changed files and together with context from the provider (existing alerts, sample logs, etc.) generates an alert based on the user provided plain English description. The AI-powered alert generation ensures accuracy and relevance. After the alert is generated, the Keep GitHub Application automatically comments the alert on the respective file within the pull request. This allows you, as the user, to conveniently review and verify the generated alert. If the generated alert meets your requirements and is ready to be deployed, you can simply leave a comment on the file. The comment should include one of the predefined emojis, such as 🚀 or 🆗 (refer to the ["Deploying Alerts with Emojis"](#deploying-alerts-with-emojis) section). The Keep GitHub Application recognizes these emojis as commands to proceed with the deployment process. This intuitive workflow streamlines the alert generation and deployment process, providing you with a seamless experience and allowing you to focus on the core aspects of your project. ## Monitoring Files Under .keep/ Directory The Keep GitHub Application actively monitors the files residing within the `.keep/` directory located at the parent level of your repository. Any changes or updates made to these files will trigger the alert generation process. This allows you to focus on the essential aspects of your project while ensuring that relevant changes are promptly identified and acted upon. 
## Alert File Structure Each file under the `.keep/` directory represents a single alert. The structure of an alert file follows the YAML format. Below is an example of an alert file: ```yaml title=alert-example.yaml # The alert text in plain English alert: | Count the error rate (4xx-5xx) this service has in the last 10 minutes. Alert when the threshold is above 5% out of total requests. Send a Slack message to the #alerts-playground channel and include all the context you have" # The provider you've previously connected and want this alert to be generated for provider: datadog # You can use this to override Keep's managed API and have the GitHub application # use the API that you run locally (using the NGROK URL) # api_url: https://OVERRIDE-KEEP-MANAGED-API ``` The alert file consists of the following components: 1. **Alert Text**: This section contains the plain English description of the alert. Write a clear and concise explanation of the conditions or criteria that should trigger the alert. You can include any relevant context to facilitate understanding and resolution. 2. **Provider**: Specify the provider to which you want the alert to be generated. This ensures that the alert seamlessly integrates with your existing monitoring and notification infrastructure. In the example above, the alert is configured to be generated for Datadog. 3. **API Override**: Optionally, you can include the api_url field to override Keep's managed API. This allows you to use your locally hosted API for advanced customization and integration purposes. **ngrok?** Imagine you have a secret hideout in your backyard, but you don't want anyone to know where it is. So, you build a tunnel from your hideout to a tree in your friend's backyard. This way, you can go into the tunnel in your yard and magically come out at the tree in your friend's yard. Now, let's say you have a cool website or a game that you want to show your friend, but it's running on your computer at home. 
Your friend is far away and can't come to your house. So, you need a way to show them your website or game over the internet. This is where ngrok comes in! Ngrok is like a magical tunnel, just like the one you built in your backyard. It creates a secure connection between your computer and the internet. It gives your computer a special address that people can use to reach your website or game, even though it's on your computer at home. When you start ngrok, it opens up a tunnel between your computer and the internet. It assigns a special address to your computer, like a secret door to your website or game. When your friend enters that address in their web browser, it's as if they're walking through the tunnel and reaching your website or game on your computer. So, ngrok is like a magical tunnel that helps you share your website or game with others over the internet, just like the secret tunnel you built to reach your friend's backyard! **How to start Keep with ngrok** ngrok is controlled by the `USE_NGROK` environment variable.
Simply run Keep's API using the following command to start with ngrok: `USE_NGROK=true keep api` {" "} `USE_NGROK` is enabled by default when running with `docker-compose` **How to obtain ngrok URL?** When `USE_NGROK` is set, Keep will start with ngrok in the background.
You can find your private ngrok URL by looking for the log line containing "`ngrok tunnel`": ```json { "asctime": "0000-00-00 00:00:00,000", "message": "ngrok tunnel: https://fab5-213-57-123-130.ngrok.io", ... } ``` The URL (https://fab5-213-57-123-130.ngrok.io in the example above) is a publicly accessible URL to your Keep API service running locally.
{" "} You can check that the ngrok tunnel is working properly by sending a simple HTTP GET request to `/healthcheck` Try: `curl -v https://fab5-213-57-123-130.ngrok.io/healthcheck` in our example.
## Deploying Alerts with Emojis To deploy an alert to the specified provider, you can simply leave a comment on the respective file using the 🚀 or 🆗 emojis. The Keep GitHub Application recognizes these emojis as commands and will initiate the deployment process accordingly. This streamlined approach ensures a smooth and intuitive experience when deploying alerts. For example, by leaving a comment with the 🚀 emoji, you can signal the Keep GitHub Application to deploy the alert to the specified provider (Datadog in our example above). The Keep GitHub Application will either mark the comment with 👍, meaning the alert was successfully deployed, or mark it with 👎 and add another comment with the failure reason if the alert was not deployed. The Keep GitHub Application has a retry mechanism that automatically tries to fix the alert in case it was not successfully deployed to the provider. If the alert that is deployed is different from the originally generated one, the Keep GitHub Application will post the updated alert as a new comment. ================================================ FILE: docs/authentication/okta.md ================================================ # Okta Integration Guide This document provides comprehensive information about the Okta integration in Keep, including configuration, deployment, maintenance, and testing.
## Overview Keep supports Okta as an authentication provider, enabling: - Single Sign-On (SSO) via Okta - JWT token validation with JWKS - User and group management through Okta - Role-based access control - Token refresh capabilities ## Environment Variables ### Backend Environment Variables | Variable | Description | Example | |----------|-------------|---------| | `AUTH_TYPE` | Set to `"okta"` to enable Okta authentication | `okta` | | `OKTA_DOMAIN` | Your Okta domain | `company.okta.com` | | `OKTA_API_TOKEN` | Admin API token for Okta management | `00aBcD3f4GhIJkl5m6NoPQr` | | `OKTA_ISSUER` | The issuer URL for your Okta application | `https://company.okta.com/oauth2/default` | | `OKTA_CLIENT_ID` | Client ID of your Okta application | `0oa1b2c3d4e5f6g7h8i9j` | | `OKTA_CLIENT_SECRET` | Client Secret of your Okta application | `abcd1234efgh5678ijkl9012` | | `OKTA_AUDIENCE` | (Optional) The audience for token validation | `api://keep` | ### Frontend Environment Variables | Variable | Description | Example | |----------|-------------|---------| | `AUTH_TYPE` | Set to `"OKTA"` to enable Okta authentication | `OKTA` | | `OKTA_CLIENT_ID` | Client ID of your Okta application | `0oa1b2c3d4e5f6g7h8i9j` | | `OKTA_CLIENT_SECRET` | Client Secret of your Okta application | `abcd1234efgh5678ijkl9012` | | `OKTA_ISSUER` | The issuer URL for your Okta application | `https://company.okta.com/oauth2/default` | | `OKTA_DOMAIN` | Your Okta domain | `company.okta.com` | ## Okta Configuration ### Creating an Okta Application 1. Sign in to your Okta Admin Console 2. Navigate to **Applications** > **Applications** 3. Click **Create App Integration** 4. Select **OIDC - OpenID Connect** as the Sign-in method 5. Choose **Web Application** as the Application type 6. Click **Next** ### Application Settings 1. **Name**: Enter a name for your application (e.g., "Keep") 2. **Grant type**: Select Authorization Code 3. 
**Sign-in redirect URIs**: Enter your app's callback URL, e.g., `https://your-keep-domain.com/api/auth/callback/okta` 4. **Sign-out redirect URIs**: Enter your app's sign-out URL, e.g., `https://your-keep-domain.com/signin` 5. **Assignments**: - **Skip group assignment for now** or assign to appropriate groups 6. Click **Save** ### Create API Token 1. Navigate to **Security** > **API** 2. Select the **Tokens** tab 3. Click **Create Token** 4. Name your token (e.g., "Keep Integration") 5. Copy the generated token value (this will be your `OKTA_API_TOKEN`) ### Configure OIDC Claims (Optional but Recommended) 1. Navigate to your application 2. Go to the **Sign On** tab 3. Under **OpenID Connect ID Token**, click **Edit** 4. Add custom claims: - `keep_tenant_id`: The tenant ID in Keep - `keep_role`: The user's role in Keep ## Deployment Instructions ### Docker Deployment Add the required environment variables to your docker-compose file or Kubernetes deployment: ```yaml environment: - AUTH_TYPE=okta - OKTA_DOMAIN=your-company.okta.com - OKTA_API_TOKEN=your-api-token - OKTA_ISSUER=https://your-company.okta.com/oauth2/default - OKTA_CLIENT_ID=your-client-id - OKTA_CLIENT_SECRET=your-client-secret ``` ### Next.js Frontend Configure environment variables in your `.env.local` file: ``` AUTH_TYPE=OKTA OKTA_CLIENT_ID=your-client-id OKTA_CLIENT_SECRET=your-client-secret OKTA_ISSUER=https://your-company.okta.com/oauth2/default OKTA_DOMAIN=your-company.okta.com ``` ### Vercel Deployment Add the environment variables in your Vercel project settings. ## User and Group Management ### Users The system automatically maps Okta users to Keep users. Key mappings: - Okta email → Keep email - Okta firstName → Keep name - Okta groups → Keep groups - Custom claim `keep_role` → Keep role (defaults to "user" if not specified) ### Groups Groups in Okta are synchronized with Keep. Groups with names starting with `keep_` are treated as roles. 
### Roles Roles are implemented as Okta groups with the prefix `keep_`. For example: - `keep_admin` → Admin role in Keep - `keep_user` → User role in Keep ## Authentication Flow 1. User accesses Keep application 2. User is redirected to Okta login page 3. After successful authentication, Okta returns an ID token and access token 4. Keep validates the token using Okta's JWKS endpoint 5. Keep extracts user information and permissions from the token 6. When tokens expire, Keep automatically refreshes them using the refresh token ## Token Refresh The refresh token flow is handled automatically by the application: 1. The system detects when an access token is about to expire 2. It uses the refresh token to obtain a new access token from Okta 3. The new token is stored and used for subsequent requests ## Testing Strategies ### Unit Tests 1. **AuthVerifier Tests**: Test token validation with mock tokens ```python def test_okta_verify_bearer_token(): # Create a mock token with the expected claims # Initialize the OktaAuthVerifier # Verify the token is validated correctly ``` 2. **IdentityManager Tests**: Test user and group management ```python def test_okta_create_user(): # Mock Okta API responses # Test creating a user # Verify the correct API calls are made ``` ### Integration Tests 1. **End-to-End Authentication Flow**: - Create a test user in Okta - Attempt to log in to the application - Verify successful authentication 2. **Token Refresh Test**: - Obtain an access token and refresh token - Wait for token expiration - Verify token refresh occurs automatically 3. **Role-Based Access Control**: - Create users with different roles - Verify access to different endpoints based on roles ### Load Tests 1. **Token Validation Performance**: - Simulate multiple concurrent requests with tokens - Measure response time and system load - Verify JWKS caching is working correctly 2. 
**User Management Scaling**: - Test with a large number of users and groups - Measure performance of group and user operations ## Troubleshooting ### Common Issues 1. **Invalid Token Errors**: - Check that `OKTA_ISSUER` matches the issuer in your Okta application - Verify that token signing algorithm (RS256) is supported - Check for clock skew between your server and Okta 2. **API Request Failures**: - Verify that `OKTA_API_TOKEN` is valid and has sufficient permissions - Check rate limiting on Okta API 3. **User Not Found**: - Verify that the user exists in Okta - Check user status (active/deactivated) ### Debugging 1. Enable debug logging: ``` AUTH_DEBUG=true ``` 2. Check Okta API logs in the Okta Admin Console ## Maintenance Considerations ### Token Rotation - Rotate the `OKTA_API_TOKEN` periodically for security - Update the application with the new token without downtime ### JWKS Caching - The implementation caches JWKS keys for 24 hours - Adjust the cache duration if needed based on key rotation policy ### Custom Claims - When adding new custom claims, update both Okta configuration and code ### API Rate Limits - Be aware of Okta API rate limits - Implement retry logic for rate limit errors ## Code Structure ### Backend Components - **`keep/identitymanager/identity_managers/okta/okta_authverifier.py`**: Handles JWT validation with JWKS - **`keep/identitymanager/identity_managers/okta/okta_identitymanager.py`**: Manages users, groups, and roles via Okta API ### Frontend Components - **`auth.config.ts`**: NextAuth.js configuration for Okta - **`authenticationType.ts`**: Defines Okta as an authentication type ## Security Considerations 1. **Secure Storage of Secrets**: - Store `OKTA_CLIENT_SECRET` and `OKTA_API_TOKEN` securely - Never commit secrets to version control 2. **Token Validation**: - Always validate tokens with proper signature verification - Verify token audience and issuer 3. 
**Scoped API Tokens**: - Use the principle of least privilege for API tokens ## Future Improvements 1. **Enhanced Group Mapping**: - Implement more sophisticated group-to-role mappings - Support nested groups in Okta 2. **Custom Authorization Servers**: - Support multiple Okta authorization servers - Allow tenant-specific authorization servers 3. **Custom Scope Handling**: - Better integrate Okta scopes with Keep permissions ## Support and Resources - [Okta Developer Documentation](https://developer.okta.com/docs/reference/) - [NextAuth.js Okta Provider Documentation](https://next-auth.js.org/providers/okta) - [JWT Debugging Tools](https://jwt.io/) ================================================ FILE: docs/cli/commands/alert-enrich.mdx ================================================ --- sidebarTitle: "keep alert enrich" --- Enrich an alert. ## Usage ``` Usage: keep alert enrich [OPTIONS] [PARAMS]... ``` ## Options ## CLI Help ``` Usage: keep alert enrich [OPTIONS] [PARAMS]... Enrich an alert. Options: --fingerprint TEXT The fingerprint of the alert to enrich. [required] --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/alert-get.mdx ================================================ --- sidebarTitle: "keep alert get" --- Get an alert. ## Usage ``` Usage: keep alert get [OPTIONS] FINGERPRINT ``` ## Options ## CLI Help ``` Usage: keep alert get [OPTIONS] FINGERPRINT Options: --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/alert-list.mdx ================================================ --- sidebarTitle: "keep alert list" --- List alerts. ## Usage ``` Usage: keep alert list [OPTIONS] ``` ## Options * `filter`: * Type: STRING * Default: `none` * Usage: `--filter -f` Filter alerts based on specific attributes. E.g., --filter source=datadog * `export`: * Type: Path * Default: `none` * Usage: `--export` Export alerts to a specified JSON file. 
* `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep alert list [OPTIONS] List alerts. Options: -f, --filter TEXT Filter alerts based on specific attributes. E.g., --filter source=datadog --export PATH Export alerts to a specified JSON file. --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-alert.mdx ================================================ # cli alert Manage alerts. ## Usage ``` Usage: cli alert [OPTIONS] COMMAND [ARGS]... ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: cli alert [OPTIONS] COMMAND [ARGS]... Manage alerts. Options: --help Show this message and exit. Commands: enrich Enrich an alert. get list List alerts. ``` ================================================ FILE: docs/cli/commands/cli-api.mdx ================================================ --- title: "api" sidebarTitle: "keep api" --- Start the API. ## Usage ``` Usage: keep api [OPTIONS] ``` ## Options * `multi_tenant`: * Type: BOOL * Default: `false` * Usage: `--multi-tenant` Enable multi-tenant mode * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep api [OPTIONS] Start the API. Options: --multi-tenant Enable multi-tenant mode --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-config-new.mdx ================================================ --- sidebarTitle: "keep config new" --- Create new config. ## Usage ``` Usage: keep config new [OPTIONS]... ``` ## Options * `interactive`: * Type: BOOL * Default: `True` * Usage: `--interactive` Create config interactively. * `url`: * Type: STRING * Default: `http://localhost:8080` * Usage: `--url` The URL of the Keep backend server. 
* `api-key`: * Type: STRING * Default: `` * Usage: `--api-key` The api key for authenticating over keep. * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep config new [OPTIONS] create new config. Options: -u, --url TEXT The url of the keep api -a, --api-key TEXT The api key for keep -i, --interactive Interactive mode creating keep config (default True) --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-config-show.mdx ================================================ --- sidebarTitle: "keep config show" --- Show keep configuration. ## Usage ``` Usage: keep config show [OPTIONS]... ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep config show [OPTIONS] show the current config. Options: --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-config.mdx ================================================ --- title: "config" sidebarTitle: "keep config" --- Set keep configuration. ## Usage ``` Usage: keep config [OPTIONS] COMMAND [ARGS]... ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep config [OPTIONS] COMMAND [ARGS]... Manage the config. Options: --help Show this message and exit. Commands: new create new config. show show the current config. ``` ================================================ FILE: docs/cli/commands/cli-provider.mdx ================================================ # cli provider Manage providers. ## Usage ``` Usage: cli provider [OPTIONS] COMMAND [ARGS]... ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: cli provider [OPTIONS] COMMAND [ARGS]... Manage providers. Options: --help Show this message and exit. 
Commands: connect delete list List providers. ``` ================================================ FILE: docs/cli/commands/cli-run.mdx ================================================ --- title: "run" sidebarTitle: "keep run" --- Run the alert. ## Usage ``` Usage: keep run [OPTIONS] ``` ## Options * `alerts_directory`: * Type: STRING * Default: `none` * Usage: `--alerts-directory --alerts-file -af` The path to the alert yaml/alerts directory * `alert_url`: * Type: STRING * Default: `none` * Usage: `--alert-url -au` A url that can be used to download an alert yaml NOTE: This argument is mutually exclusive with alerts_directory * `interval`: * Type: INT * Default: `0` * Usage: `--interval -i` When interval is set, Keep will run the alert every INTERVAL seconds * `providers_file`: * Type: STRING * Default: `providers.yaml` * Usage: `--providers-file -p` The path to the providers yaml * `tenant_id`: * Type: STRING * Default: `singletenant` * Usage: `--tenant-id -t` The tenant id * `api_key`: * Type: STRING * Default: `none` * Usage: `--api-key` The API key for keep's API * `api_url`: * Type: STRING * Default: `https://s.keephq.dev` * Usage: `--api-url` The URL for keep's API * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep run [OPTIONS] Run the alert. Options: -af, --alerts-directory, --alerts-file PATH The path to the alert yaml/alerts directory -au, --alert-url TEXT A url that can be used to download an alert yaml NOTE: This argument is mutually exclusive with alerts_directory -i, --interval INTEGER When interval is set, Keep will run the alert every INTERVAL seconds -p, --providers-file PATH The path to the providers yaml -t, --tenant-id TEXT The tenant id --api-key TEXT The API key for keep's API --api-url TEXT The URL for keep's API --help Show this message and exit. 
``` ================================================ FILE: docs/cli/commands/cli-version.mdx ================================================ --- title: "version" sidebarTitle: "keep version" --- Get the library version. ## Usage ``` Usage: keep version [OPTIONS] ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep version [OPTIONS] Get the library version. Options: --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-whoami.mdx ================================================ --- title: "whoami" sidebarTitle: "keep whoami" --- Verify the api key auth. ## Usage ``` Usage: keep whoami [OPTIONS] ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: keep whoami [OPTIONS] Verify the api key auth. Options: --help Show this message and exit. ``` ================================================ FILE: docs/cli/commands/cli-workflow.mdx ================================================ # cli workflow Manage workflows. ## Usage ``` Usage: cli workflow [OPTIONS] COMMAND [ARGS]... ``` ## Options * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: cli workflow [OPTIONS] COMMAND [ARGS]... Manage workflows. Options: --help Show this message and exit. Commands: apply Apply a workflow. list List workflows. run Run a workflow with a specified ID and fingerprint. runs Manage workflows executions. ``` ================================================ FILE: docs/cli/commands/cli.mdx ================================================ # cli Run Keep CLI. ## Usage ``` Usage: cli [OPTIONS] COMMAND [ARGS]... ``` ## Options * `verbose`: * Type: IntRange(0, None) * Default: `0` * Usage: `--verbose -v` Enable verbose output. * `json`: * Type: BOOL * Default: `false` * Usage: `--json -j` Enable json output. 
* `keep_config`: * Type: STRING * Default: `keep.yaml` * Usage: `--keep-config -c` The path to the keep config file (default keep.yaml) * `help`: * Type: BOOL * Default: `false` * Usage: `--help` Show this message and exit. ## CLI Help ``` Usage: cli [OPTIONS] COMMAND [ARGS]... Run Keep CLI. Options: -v, --verbose Enable verbose output. -j, --json Enable json output. -c, --keep-config TEXT The path to the keep config file (default keep.yaml) --help Show this message and exit. Commands: alert Manage alerts. api Start the API. config Get the config. provider Manage providers. run Run a workflow. version Get the library version. whoami Verify the api key auth. workflow Manage workflows. ``` ================================================ FILE: docs/cli/commands/extraction-create.mdx ================================================ --- sidebarTitle: "keep extraction create" --- Create a extraction rule. ## Usage ``` Usage: keep extraction create [OPTIONS] ``` ## Options * `name` * Type: STRING * Default: `` * Usage: `--name ` The name of the extraction. * `description` * Type: STRING * Default: `` * Usage: `--description ` The description of the extraction. * `priority` * Type: INTEGER RANGE * Default: `0` * Usage: `--priority ` The priority of the extraction, higher priority means this rule will execute first. `0<=x<=100`. * `pre` * Type: BOOL * Default: `false` * Usage: `--pre
`

  Whether this rule should be applied before or after the alert is standardized

* `attribute`
  * Type: STRING
  * Default: ``
  * Usage: `--attribute `

  Event attribute name to extract from.

* `regex`
  * Type: STRING
  * Default: ``
  * Usage: `--regex `

  The regex rule to extract by. Regex format should be like python regex pattern for group matching.

* `condition`
  * Type: STRING
  * Default: ``
  * Usage: `--condition `

  CEL based condition.

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.

## CLI Help

```
Usage: cli.py extraction create [OPTIONS]

  Create a extraction rule.

Options:
  -n, --name TEXT               The name of the extraction.  [required]
  -d, --description TEXT        The description of the extraction.
  -p, --priority INTEGER RANGE  The priority of the extraction, higher
                                priority means this rule will execute first.
                                [0<=x<=100]
  --pre BOOLEAN                 Whether this rule should be applied before or
                                after the alert is standardized.
  -a, --attribute TEXT          Event attribute name to extract from.
                                [required]
  -r, --regex TEXT              The regex rule to extract by. Regex format
                                should be like python regex pattern for group
                                matching.  [required]
  -c, --condition TEXT          CEL based condition.  [required]
  --help                        Show this message and exit.
```


================================================
FILE: docs/cli/commands/extraction-delete.mdx
================================================
---
sidebarTitle: "keep extraction delete"
---

Delete an extraction with a specified ID.

## Usage

```
Usage: keep extraction delete [OPTIONS]
```

## Options

* `extraction-id`
  * Type: INTEGER
  * Default: ``
  * Usage: `--extraction-id `

  The ID of the extraction to delete.

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: cli.py extraction delete [OPTIONS]

  Delete a extraction with a specified ID.

Options:
  --extraction-id INTEGER  The ID of the extraction to delete.  [required]
  --help                   Show this message and exit.
```


================================================
FILE: docs/cli/commands/extractions-list.mdx
================================================
---
sidebarTitle: "keep extraction list"
---

List extractions.

## Usage

```
Usage: keep extraction list [OPTIONS]
```

List extractions.

## Options

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.

## CLI Help

```
Usage: cli.py extraction list [OPTIONS]

  List extractions.

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/mappings-create.mdx
================================================
---
sidebarTitle: "keep mappings create"
---

Create a mapping rule.

## Usage

```
Usage: keep mappings create [OPTIONS]
```

## Options

* `name`
  * Type: STRING
  * Default: ``
  * Usage: `--name `

  The name of the mapping.

* `description`
  * Type: STRING
  * Default: ``
  * Usage: `--description `

  The description of the mapping.

* `file`
  * Type: STRING
  * Default: ``
  * Usage: `--file `

  The mapping file. Must be a CSV file.

* `matchers`
  * Type: STRING
  * Default: ``
  * Usage: `--matchers `

  The matchers of the mapping, as a comma-separated list of strings.

* `priority`
  * Type: INTEGER RANGE
  * Default: `0`
  * Usage: `--priority `

  The priority of the mapping, higher priority means this rule will execute first. `0<=x<=100`.

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.

## CLI Help

```
Usage: keep mappings create [OPTIONS]

  Create a mapping rule.

Options:
  -n, --name TEXT               The name of the mapping.  [required]
  -d, --description TEXT        The description of the mapping.
  -f, --file PATH               The mapping file. Must be a CSV file.
                                [required]
  -m, --matchers TEXT           The matchers of the mapping, as a comma-
                                separated list of strings.  [required]
  -p, --priority INTEGER RANGE  The priority of the mapping, higher priority
                                means this rule will execute first.
                                [0<=x<=100]
  --help                        Show this message and exit.
```


================================================
FILE: docs/cli/commands/mappings-delete.mdx
================================================
---
sidebarTitle: "keep mappings delete"
---

Delete a mapping with a specified ID.

## Usage

```
Usage: keep mappings delete [OPTIONS]
```

## Options

* `mapping-id`
  * Type: INTEGER
  * Default: ``
  * Usage: `--mapping-id `

  The ID of the mapping to delete.

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep mappings delete [OPTIONS]

  Delete a mapping with a specified ID

Options:
  --mapping-id INTEGER  The ID of the mapping to delete.  [required]
  --help                Show this message and exit.
```


================================================
FILE: docs/cli/commands/mappings-list.mdx
================================================
---
sidebarTitle: "keep mappings list"
---

List mappings.

## Usage

```
Usage: keep mappings list [OPTIONS]
```

List mappings.

## Options

* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.

## CLI Help

```
Usage: keep mappings list [OPTIONS]

  List mappings.

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/provider-connect.mdx
================================================
---
sidebarTitle: "keep provider connect"
---

Connect a provider.

## Usage

```
Usage: keep provider connect [OPTIONS] PROVIDER_TYPE [PARAMS]...
```

## Options


## CLI Help

```
Usage: keep provider connect [OPTIONS] PROVIDER_TYPE [PARAMS]...

Options:
  -h, --help                Help on how to install this provider.
  -n, --provider-name TEXT  Every provider shuold have a name.
```


================================================
FILE: docs/cli/commands/provider-delete.mdx
================================================
---
sidebarTitle: "keep provider delete"
---

Delete a provider.

## Usage

```
Usage: keep provider delete [OPTIONS] [PROVIDER_ID]
```

## Options


## CLI Help

```
Usage: keep provider delete [OPTIONS] [PROVIDER_ID]

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/provider-list.mdx
================================================
---
sidebarTitle: "keep provider list"
---

List providers.

## Usage

```
Usage: keep provider list [OPTIONS]
```

## Options
* `available`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--available -a`

  List provider that you can install.


* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep provider list [OPTIONS]

  List providers.

Options:
  -a, --available  List provider that you can install.
  --help           Show this message and exit.
```


================================================
FILE: docs/cli/commands/runs-list.mdx
================================================
---
sidebarTitle: "keep workflow runs list"
---

List workflow executions.

## Usage

```
Usage: keep workflow runs list [OPTIONS]
```

## Options
* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep workflow runs list [OPTIONS]

  List workflow executions.

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/runs-logs.mdx
================================================
---
sidebarTitle: "keep workflow runs logs"
---

Get workflow execution logs.

## Usage

```
Usage: keep workflow runs logs [OPTIONS] WORKFLOW_EXECUTION_ID
```

## Options


## CLI Help

```
Usage: keep workflow runs logs [OPTIONS] WORKFLOW_EXECUTION_ID

  Get workflow execution logs.

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/workflow-apply.mdx
================================================
---
sidebarTitle: "keep workflow apply"
---


Apply a workflow.

## Usage

```
Usage: keep workflow apply [OPTIONS]
```

## Options
* `file` (REQUIRED):
  * Type: Path
  * Default: `none`
  * Usage: `--file -f`

  The workflow file


* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep workflow apply [OPTIONS]

  Apply a workflow.

Options:
  -f, --file PATH  The workflow file  [required]
  --help           Show this message and exit.
```


================================================
FILE: docs/cli/commands/workflow-list.mdx
================================================
---
sidebarTitle: "keep workflow list"
---

List workflows.

## Usage

```
Usage: keep workflow list [OPTIONS]
```

## Options
* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep workflow list [OPTIONS]

  List workflows.

Options:
  --help  Show this message and exit.
```


================================================
FILE: docs/cli/commands/workflow-run.mdx
================================================
---
sidebarTitle: "keep workflow run"
---

Run a workflow with a specified ID and fingerprint.

## Usage

```
Usage: keep workflow run [OPTIONS]
```

## Options
* `workflow_id` (REQUIRED):
  * Type: STRING
  * Default: `none`
  * Usage: `--workflow-id`

  The ID (UUID or name) of the workflow to run


* `fingerprint` (REQUIRED):
  * Type: STRING
  * Default: `none`
  * Usage: `--fingerprint`

  The fingerprint to query the payload


* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: keep workflow run [OPTIONS]

  Run a workflow with a specified ID and fingerprint.

Options:
  --workflow-id TEXT  The ID (UUID or name) of the workflow to run  [required]
  --fingerprint TEXT  The fingerprint to query the payload  [required]
  --help              Show this message and exit.
```


================================================
FILE: docs/cli/commands/workflow-runs.mdx
================================================
---
sidebarTitle: "keep workflow runs"
---

Manage workflows executions.

## Usage

```
Usage: cli workflow runs [OPTIONS] COMMAND [ARGS]...
```

## Options
* `help`:
  * Type: BOOL
  * Default: `false`
  * Usage: `--help`

  Show this message and exit.



## CLI Help

```
Usage: cli workflow runs [OPTIONS] COMMAND [ARGS]...

  Manage workflows executions.

Options:
  --help  Show this message and exit.

Commands:
  list  List workflow executions.
  logs  Get workflow execution logs.
```


================================================
FILE: docs/cli/github-actions.mdx
================================================
---
title: "Sync Keep Workflows With GitHub Actions"
---

This documentation provides a detailed guide on how to use the Keep CLI within a GitHub Actions workflow to synchronize and manage Keep workflows from a directory. This setup automates the process of uploading workflows to Keep, making it easier to maintain and update them.





### Configuration
To set up this workflow in your repository:

- Add the workflow YAML file to your repository under `.github/workflows/`.
- Set your Keep API Key and URL as secrets in your repository settings if you haven't already.
- Make changes to your workflows in the specified directory or trigger the workflow manually through the GitHub UI.
- Change `examples/workflows/**` to the directory where you store your Keep workflows.


### GitHub Action Workflow
This GitHub Actions workflow automatically synchronizes workflows from a specified directory to Keep whenever there are changes. It also allows for manual triggering with optional parameters.

```yaml
# A workflow that syncs Keep workflows from a directory
name: "Sync Keep Workflows"

on:
    push:
        paths:
          - 'examples/workflows/**'
    workflow_dispatch:
        inputs:
            keep_api_key:
              description: 'Keep API Key'
              required: false
            keep_api_url:
              description: 'Keep API URL'
              required: false
              default: 'https://api.keephq.dev'

jobs:
    sync-workflows:
        name: Sync workflows to Keep
        runs-on: ubuntu-latest
        container:
            image: us-central1-docker.pkg.dev/keephq/keep/keep-cli:latest
        env:
            KEEP_API_KEY: ${{ secrets.KEEP_API_KEY || github.event.inputs.keep_api_key }}
            KEEP_API_URL: ${{ secrets.KEEP_API_URL || github.event.inputs.keep_api_url }}

        steps:
        - name: Check out the repo
          uses: actions/checkout@v2

        - name: Run Keep CLI
          run: |
            keep workflow apply -f examples/workflows

```


================================================
FILE: docs/cli/installation.mdx
================================================
---
title: "Installation"
---
Missing an installation? Submit a new installation request and we will add it as soon as we can.


We recommend installing Keep CLI with Python version 3.11 for optimal compatibility and performance.
This choice ensures seamless integration with all dependencies, including pyarrow, which currently does not support Python 3.12.


Need Keep CLI on other versions? Feel free to contact us! 

## Clone and install (Option 1)

### Install
First, clone Keep repository:

```shell
git clone https://github.com/keephq/keep.git && cd keep
```

Install Keep CLI with `pip`:

```shell
# MacOS if python or pip not present:
# brew install python@3.11
# brew install postgresql
pip3.11 install .
```
or with `poetry`:

```shell
poetry install
```

From now on, Keep should be installed locally and accessible from your CLI. Test it by executing:

```
keep version
```

### Configuration

To get an API key, go to Keep UI -> your username (bottom left) -> Settings -> API Keys. Then create the CLI config:
```
keep config new --url http://backend.my_keep.my_awesome_org.com:backend_port --api-key your_personal_api_key
```

### Test

Now, apply a workflow to verify that everything works:
```
keep workflow apply -f examples/workflows/query_clickhouse.yml
```

Congrats 🥳 Check your UI for the new workflow uploaded from the YAML file.


## Docker image (Option 2)
### Install

```
docker run -v ${PWD}:/app -v ~/.keep.yaml:/root/.keep.yaml -it us-central1-docker.pkg.dev/keephq/keep/keep-cli keep config new --url http://backend.my_keep.my_awesome_org.com:backend_port --api-key your_personal_api_key
```

### Test
```
docker run -v ${PWD}:/app -v ~/.keep.yaml:/root/.keep.yaml -it us-central1-docker.pkg.dev/keephq/keep/keep-cli workflow apply -f examples/workflows/query_clickhouse.yml
```


## Enable Auto Completion
Keep's CLI supports shell auto-completion, which can make your life a whole lot easier 😌
If you're using zsh:

```shell title=~/.zshrc
eval "$(_KEEP_COMPLETE=zsh_source keep)"
```

If you're using bash:

```bash title=~/.bashrc
eval "$(_KEEP_COMPLETE=bash_source keep)"
```

Using eval means that the command is invoked and evaluated every time a shell is started, which can delay shell responsiveness. To speed it up, write the generated script to a file, then source that.


================================================
FILE: docs/cli/overview.mdx
================================================
---
title: "Overview"
---

Keep CLI allows you to manage Keep from the command line.

Start by [installing](/cli/installation) Keep CLI and [running a workflow](/cli/commands/cli-run).

### Env variables

| Env var | Purpose | Required | Default Value | Valid options |
|:-------------------:|:-------:|:----------:|:-------------:|:-------------:|
| **KEEP_CLI_IGNORE_SSL** | Ignore SSL while connecting to the KEEP API | No | false | "true" or "false" |


================================================
FILE: docs/deployment/authentication/auth0-auth.mdx
================================================
---
title: "Auth0 Authentication"
---


Keep Cloud: ✅ 
Keep Enterprise On-Premises: ✅
Keep Open Source: ⛔️
Keep supports multi-tenant environments through Auth0, enabling separate tenants to operate independently within the same Keep platform. ### When to Use - **Already using Auth0:** If you are already using Auth0 in your organization, you can leverage it as Keep authentication provider. - **SSO/SAML:** Auth0 supports various Single Sign-On (SSO) and SAML protocols, allowing you to integrate Keep with your existing identity management systems. ### Setup Instructions To start Keep with Auth0 authentication, set the following environment variables: #### Frontend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'AUTH0' for Auth0 authentication | Yes | - | | AUTH0_DOMAIN | Your Auth0 domain | Yes | - | | AUTH0_CLIENT_ID | Your Auth0 client ID | Yes | - | | AUTH0_CLIENT_SECRET | Your Auth0 client secret | Yes | - | | AUTH0_ISSUER | Your Auth0 API issuer | Yes | - | #### Backend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'AUTH0' for Auth0 authentication | Yes | - | | AUTH0_MANAGEMENT_DOMAIN | Your Auth0 management domain | Yes | - | | AUTH0_CLIENT_ID | Your Auth0 client ID | Yes | - | | AUTH0_CLIENT_SECRET | Your Auth0 client secret | Yes | - | | AUTH0_AUDIENCE | Your Auth0 API audience | Yes | - | ### Example configuration Use the `docker-compose-with-auth0.yml` for an easy setup, which includes necessary environment variables for enabling Auth0 authentication. ================================================ FILE: docs/deployment/authentication/azuread-auth.mdx ================================================ --- title: "Azure AD Authentication" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: ⛔️
Keep supports enterprise authentication through Azure Entre ID (formerly known as Azure AD), enabling organizations to use their existing Microsoft identity platform for secure access management. ## When to Use - **Microsoft Environment:** If your organization uses Microsoft 365 or Azure services, Azure AD integration provides seamless authentication. - **Enterprise SSO:** Leverage Azure AD's Single Sign-On capabilities for unified access management. ## Setup Instructions (on Azure AD) ### Creating an Azure AD Application 1. Sign in to the [Azure Portal](https://portal.azure.com) 2. Navigate to **Microsoft Entra ID** > **App registrations** > **New registration** Azure AD App Registration 3. Configure the application: - Name: "Keep" Note that we are using "Register an application to integrate with Microsoft Entra ID (App you're developing)" since you're self-hosting Keep and need direct control over the authentication flow and permissions for your specific instance - unlike the cloud/managed version where Keep's team has already configured a centralized application registration. Azure AD App Registration 4. Configure the application (continue) - Supported account types: "Single tenant" We recommend using "Single tenant" for enhanced security as it restricts access to users within your organization only. While multi-tenant configuration is possible, it would allow users from any Azure AD directory to access your Keep instance, which could pose security risks unless you have specific cross-organization requirements. - Redirect URI: "Web" + your redirect URI We use "Web" platform instead of "Single Page Application (SPA)" because Keep's backend handles the authentication flow using client credentials/secrets, which is more secure than the implicit flow used in SPAs. This prevents exposure of tokens in the browser and provides stronger security through server-side token validation and refresh token handling. 
For localhost, the redirect would be http://localhost:3000/api/auth/callback/microsoft-entra-id For production, it should be something like http://your_keep_frontend_domain/api/auth/callback/microsoft-entra-id Azure AD App Registration 5. Finally, click "register" ### Configure Authentication After we created the application, let's configure the authentication. 1. Go to "App Registrations" -> "All applications" Azure AD Authentication Configuration 2. Click on your application -> "Add a certificate or secret" Azure AD Authentication Configuration 3. Click on "New client secret" and give it a name Azure AD Authentication Configuration 4. Keep the "Value", we will use it soon as `KEEP_AZUREAD_CLIENT_SECRET` Azure AD Authentication Configuration ### Configure Groups Keep maps Azure AD groups to roles with two default groups: 1. Admin Group (read + write) 2. NOC Group (read only) To create those groups, go to Groups -> All groups and create two groups: Azure AD Authentication Configuration Keep the Object id of these groups and use it as `KEEP_AZUREAD_ADMIN_GROUP_ID` and `KEEP_AZUREAD_NOC_GROUP_ID`. ### Configure Group Claims 1. Navigate to **Token configuration** Azure AD Authentication Configuration 2. Add groups claim: - Select "Security groups" and "Groups assigned to the application" - Choose "Group ID" as the claim value Azure AD Authentication Configuration Azure AD Authentication Configuration ### Configure Application Scopes 1. Go to "Expose an API" and click on "Add a scope" Azure AD Authentication Configuration 2. Keep the default Application ID and click "Save and continue" Azure AD Authentication Configuration 3. Add "default" as scope name, also give a display name and description Azure AD Authentication Configuration 3. Finally, click "Add scope" Azure AD Authentication Configuration ## Setup Instructions (on Keep) After you configured Azure AD you should have the following: 1. Azure AD Tenant ID 2. 
Azure AD Client ID How to get: Azure AD Authentication Configuration 3. Azure AD Client Secret [See Configure Authentication](#configure-authentication). 4. Azure AD Group ID's for Admins and NOC (read only) [See Configure Groups](#configure-groups). ### Configuration #### Frontend | Environment Variable | Description | Required | Default Value | |--------------------|-------------|:---------:|:-------------:| | AUTH_TYPE | Set to 'AZUREAD' for Azure AD authentication | Yes | - | | KEEP_AZUREAD_CLIENT_ID | Your Azure AD application (client) ID | Yes | - | | KEEP_AZUREAD_CLIENT_SECRET | Your client secret | Yes | - | | KEEP_AZUREAD_TENANT_ID | Your Azure AD tenant ID | Yes | - | | NEXTAUTH_URL | Your Keep application URL | Yes | - | | NEXTAUTH_SECRET | Random string for NextAuth.js | Yes | - | #### Backend | Environment Variable | Description | Required | Default Value | |--------------------|-------------|:---------:|:-------------:| | AUTH_TYPE | Set to 'AZUREAD' for Azure AD authentication | Yes | - | | KEEP_AZUREAD_TENANT_ID | Your Azure AD tenant ID | Yes | - | | KEEP_AZUREAD_CLIENT_ID | Your Azure AD application (client) ID | Yes | - | | KEEP_AZUREAD_ADMIN_GROUP_ID | The group ID of Keep Admins (read write) | Yes | - | | KEEP_AZUREAD_NOC_GROUP_ID | The group ID of Keep NOC (read only) | Yes | - | ## Features and Limitations #### Supported Features - Single Sign-On (SSO) - Role-based access control through Azure AD groups - Multi-factor authentication (when configured in Azure AD) #### Limitations See [Overview](/deployment/authentication/overview) ================================================ FILE: docs/deployment/authentication/db-auth.mdx ================================================ --- title: "DB Authentication" --- For applications requiring user management and authentication, Keep supports basic authentication with username and password. 
### When to Use - **Self-Hosted Deployments:** When you're deploying Keep for individual use or within an organization. - **Enhanced Security:** Provides a simple yet effective layer of security for your Keep instance. ### Setup Instructions To start Keep with DB authentication, set the following environment variables: | Environment Variable | Description | Required | Frontend/Backend | Default Value | |--------------------|:-----------:|:--------:|:----------------:|:-------------:| | AUTH_TYPE | Set to 'DB' for database authentication | Yes | Both | - | | KEEP_JWT_SECRET | Secret for JWT token generation | Yes | Backend | - | | KEEP_DEFAULT_USERNAME | Default admin username | No | Backend | keep | | KEEP_DEFAULT_PASSWORD | Default admin password | No | Backend | keep | | KEEP_FORCE_RESET_DEFAULT_PASSWORD | Override the current admin password | No | Backend | false | ### Example configuration Use the `docker-compose-with-auth.yml` for an easy setup, which includes necessary environment variables for enabling basic authentication. ================================================ FILE: docs/deployment/authentication/keycloak-auth.mdx ================================================ --- title: "Keycloak Authentication" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: ⛔️
Keep supports Keycloak in a "managed" way where Keep auto-provisions all resources (realm, client, etc.). Keep can also work with externally managed Keycloak. To learn how, please contact the team on [Slack](https://slack.keephq.dev). Keep integrates with Keycloak to provide a powerful and flexible authentication system for multi-tenant applications, supporting Single Sign-On (SSO) and SAML. ### When to Use - **On Prem:** When deploying Keep on-premises and requiring a robust authentication system. - **OSS:** If you prefer using open-source software for your authentication needs. - **Enterprise Protocols:** When you need support for enterprise-level protocols like SAML and OpenID Connect. - **Fully Customized:** When you need a highly customizable authentication solution. - **RBAC:** When you require Role-Based Access Control for managing user permissions. - **User and Group Management:** When you need advanced user and group management capabilities. ### Setup Instructions To start Keep with Keycloak authentication, set the following environment variables: #### Frontend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'KEYCLOAK' for Keycloak authentication | Yes | - | | KEYCLOAK_ID | Your Keycloak client ID (e.g. keep) | Yes | - | | KEYCLOAK_ISSUER | Full URL to Your Keycloak issuer URL e.g. 
http://localhost:8181/auth/realms/keep | Yes | - | | KEYCLOAK_SECRET | Your Keycloak client secret | Yes | keep-keycloak-secret | #### Backend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'KEYCLOAK' for Keycloak authentication | Yes | - | | KEYCLOAK_URL | Full URL to your Keycloak server | Yes | http://localhost:8181/auth/ | | KEYCLOAK_REALM | Your Keycloak realm | Yes | keep | | KEYCLOAK_CLIENT_ID | Your Keycloak client ID | Yes | keep | | KEYCLOAK_CLIENT_SECRET | Your Keycloak client secret | Yes | keep-keycloak-secret | | KEYCLOAK_ADMIN_USER | Admin username for Keycloak | Yes | keep_admin | | KEYCLOAK_ADMIN_PASSWORD | Admin password for Keycloak | Yes | keep_admin | | KEYCLOAK_AUDIENCE | Audience for Keycloak | Yes | realm-management | ### Example configuration To get a better understanding of how to use Keep together with Keycloak, you can: - See [Keycloak](https://github.com/keephq/keep/tree/main/keycloak) directory for configuration, realm.json, etc - See Keep + Keycloak [docker-compose example](https://github.com/keephq/keep/blob/main/keycloak/docker-compose.yaml) ================================================ FILE: docs/deployment/authentication/no-auth.mdx ================================================ --- title: "No Authentication" --- Using this configuration in production is not secure and strongly discouraged. Deploying Keep without authentication is the quickest way to get up and running, ideal for local development or internal tools where security is not a concern. ## Setup Instructions Whether you use docker-compose, Kubernetes, OpenShift, or any other deployment method, add the following environment variable: ``` # Frontend AUTH_TYPE=NOAUTH # Backend AUTH_TYPE=NOAUTH ``` ## Implications With `AUTH_TYPE=NOAUTH`: - Keep won't show any login page and will let you consume APIs without authentication.
- Keep will use a JWT with "keep" as the tenant id, but will not validate it. - Any API key provided in the `x-api-key` header will be accepted without validation. This configuration essentially bypasses all authentication checks, making it unsuitable for production environments where security is a concern. ================================================ FILE: docs/deployment/authentication/oauth2-proxy-gitlab.mdx ================================================ --- title: "Example: OAuth2‑Proxy + Keep + GitLab SSO" --- A **step‑by‑step cookbook** for adding single‑sign‑on to [Keep](https://github.com/keephq) with your **self‑hosted GitLab** using [oauth2‑proxy](https://oauth2‑proxy.github.io/) and the NGINX Ingress Controller. > **Conventions used below** > > * ``             – public FQDN where users access Keep (e.g. `keep.example.com`) > * ``           – URL of your GitLab instance (e.g. `gitlab.example.com`) > * ``         – container registry that stores images (omit if you use the public images) > * Kubernetes namespace **`keep`** – feel free to change it everywhere if you prefer another namespace. --- ## 1. Prerequisites | What | Why | | ------------------------------------------- | ----------------------------------------------------- | | Kubernetes cluster & `keep` namespace | Where Keep, oauth2‑proxy and Services live | | **ingress‑nginx** (or compatible) | Provides the `auth_request` feature oauth2‑proxy uses | | GitLab 15 + at `https://` | OpenID‑Connect issuer | | Helm 3.x & offline charts/images (optional) | If your cluster has no Internet egress | --- ## 2. Create the GitLab OAuth application 1. **GitLab ▸ Admin → Applications → New** 2. Name → `keep‑sso` 3. Redirect URI → `https:///oauth2/callback` 4. Scopes → `openid profile email` (+ `read_api` if you plan to gate access by group/project) 5. Save – copy the generated **Application ID** and **Secret**. --- ## 3. 
Kubernetes secrets & config ```bash # 3.1 Generate a 32‑byte cookie secret echo "$(openssl rand -base64 32 | head -c 32 | base64)" > cookie.b64 # 3.2 Store GitLab credentials and cookie secret kubectl -n keep create secret generic oauth2-proxy \ --from-literal=client-id= \ --from-literal=client-secret= \ --from-file=cookie-secret=cookie.b64 # 3.3 Add gitlab credentials and cookie secret using OAUTH2_PROXY ENV variables OAUTH2_PROXY_CLIENT_ID= OAUTH2_PROXY_CLIENT_SECRET= OAUTH2_PROXY_COOKIE_SECRET=cookie.b64 # (optional) store GitLab’s custom CA certificate kubectl -n keep create secret generic gitlab-ca \ --from-file=gitlab-ca.pem ``` ```yaml # 3.4 oauth2_proxy.cfg (ConfigMap) apiVersion: v1 kind: ConfigMap metadata: name: oauth2-proxy namespace: keep data: oauth2_proxy.cfg: | email_domains = ["*"] upstreams = ["file:///dev/null"] # we only use auth‑request mode provider = "gitlab" cookie_name = "keep-dev" #if empty, will use default cookie name: _oauth2_proxy cookie_secure = true ``` --- ## 4. Deploy **oauth2‑proxy** (Helm) ```yaml # values.oauth2-proxy.yaml – minimal baseline image: # replace with public image if desired repository: /oauth2-proxy/oauth2-proxy tag: v7.9.0 config: configFile: |- # content comes from the ConfigMap above extraArgs: oidc-issuer-url: https:// set-xauthrequest: "true" # add X-Auth-Request-*/X-Forwarded-* headers pass-authorization-header: "true" # add Authorization: Bearer # provider-ca-file: /ca/gitlab-ca.pem # enable if you mounted a corporate CA or use ssl-insecure-skip-verify: "true" to disable SSL check. 
extraVolumes: - name: gitlab-ca secret: secretName: gitlab-ca extraVolumeMounts: - name: gitlab-ca mountPath: /ca/gitlab-ca.pem subPath: gitlab-ca.pem readOnly: true service: type: ClusterIP ingress: enabled: false # we only need an internal Service ``` ```bash helm repo add oauth2-proxy https://oauth2-proxy.github.io/manifests helm upgrade --install oauth2-proxy oauth2-proxy/oauth2-proxy \ -n keep -f values.oauth2-proxy.yaml ``` *Lab‑only shortcut*: instead of mounting the CA you can temporarily add `ssl-insecure-skip-verify: "true"` under `extraArgs`. --- ## 5. Patch (or create) Keep’s Ingress resource Add **three** annotations so ingress‑nginx delegates auth to the Service: ```yaml global: ingress: annotations: nginx.ingress.kubernetes.io/auth-url: "http://oauth2-proxy.keep.svc.cluster.local/oauth2/auth" nginx.ingress.kubernetes.io/auth-signin: "https:///oauth2/start?rd=$request_uri" nginx.ingress.kubernetes.io/auth-response-headers: "authorization,x-auth-request-user,x-auth-request-email,x-forwarded-user,x-forwarded-email,x-forwarded-groups" ``` Redeploy Keep (or patch the Ingress manually). --- ## 6. 
Environment variables for Keep ```yaml backend: env: - name: AUTH_TYPE value: OAUTH2PROXY - name: KEEP_OAUTH2_PROXY_USER_HEADER value: x-auth-request-email - name: KEEP_OAUTH2_PROXY_ROLE_HEADER value: x-auth-request-groups - name: KEEP_OAUTH2_PROXY_AUTO_CREATE_USER value: true - name: KEEP_OAUTH2_PROXY_ADMIN_ROLES value: - name: KEEP_OAUTH2_PROXY_NOC_ROLES value: frontend: env: # Public URL the **browser** should use - name: NEXTAUTH_URL value: "https://" # URL the **server‑side** Next.js code can always reach - name: NEXTAUTH_URL_INTERNAL value: "http://keep-frontend.keep.svc.cluster.local:3000" # API URLs - name: API_URL_CLIENT # browser → ingress value: "/v2" - name: API_URL # server → backend Service (no auth‑proxy) value: "http://keep-backend.keep.svc.cluster.local:8080" #Oauth2-Proxy - name: AUTH_TYPE value: OAUTH2PROXY - name: KEEP_OAUTH2_PROXY_USER_HEADER value: x-auth-request-email - name: KEEP_OAUTH2_PROXY_ROLE_HEADER value: x-auth-request-groups ``` Roll out the frontend: ```bash kubectl -n keep rollout restart deploy/keep-frontend ``` --- ## 7. Quick validation ```bash # 7.1 Call auth endpoint without cookie – expect 401 curl -I http://oauth2-proxy.keep.svc.cluster.local/oauth2/auth # 7.2 Copy the keep-dev cookie from your browser session curl -I --cookie "keep-dev=" \ http://oauth2-proxy.keep.svc.cluster.local/oauth2/auth # expect 200 ``` Browser smoke‑test: * `https://` → redirect to GitLab → sign in → return to Keep. * DevTools ▸ Network → `/api/auth/session` returns **200**. --- ## 8. Troubleshooting | Symptom | Common cause & remedy | | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | **TLS error** `x509: certificate signed by unknown authority` | Mount your GitLab CA (`provider-ca-file`) or set `ssl-insecure-skip-verify=true` (dev only). 
| | Ingress logs `auth request unexpected status: 502` | `auth-url` is pointing at the external host – use the internal Service DNS (`http://oauth2-proxy.keep.svc.cluster.local`). | | Browser loops at `/signin?callbackUrl=…` | ① `set-xauthrequest` not enabled, or ② `auth-response-headers` not set, or ③ backend receives calls through oauth2‑proxy (`API_URL` wrong). | | Redirect to `0.0.0.0:3000` or pod name | `NEXTAUTH_URL` missing at **build time**; rebuild UI or override env. | | 401 from `/oauth2/auth` even with cookie | Cookie expired / clocks out of sync. Clear cookie and re‑login. | --- ## 9. Clean‑up ```bash helm -n keep uninstall oauth2-proxy helm -n keep uninstall keep # if you want to remove Keep kubectl -n keep delete secret oauth2-proxy gitlab-ca ``` --- ## Appendix A – Generate a 32‑byte cookie secret ```bash openssl rand -hex 16 | xxd -r -p | base64 ``` ## Appendix B – Sync images to an offline registry (example) ```bash skopeo copy docker://quay.io/oauth2-proxy/oauth2-proxy:v7.9.0 \ docker:///oauth2-proxy/oauth2-proxy:v7.9.0 ``` ================================================ FILE: docs/deployment/authentication/oauth2proxy-auth.mdx ================================================ --- title: "OAuth2Proxy Authentication" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: (experimental)
Delegate authentication to Oauth2Proxy. ### When to Use - **oauth2-proxy user:** Use this authentication method if you want to delegate authentication to an external Oauth2Proxy service. ### Setup Instructions To start Keep with Oauth2Proxy authentication, set the following environment variables: #### Frontend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'OAUTH2PROXY' for OAUTH2PROXY authentication | Yes | - | | KEEP_OAUTH2_PROXY_USER_HEADER | Header for the authenticated user's email | Yes | x-forwarded-email | | KEEP_OAUTH2_PROXY_ROLE_HEADER | Header for the authenticated user's role | Yes | x-forwarded-groups | #### Backend Environment Variables | Environment Variable | Description | Required | Default Value | |--------------------|-----------|:--------:|:-------------:| | AUTH_TYPE | Set to 'OAUTH2PROXY' for OAUTH2PROXY authentication | Yes | - | | KEEP_OAUTH2_PROXY_USER_HEADER | Header for the authenticated user's email | Yes | x-forwarded-email | | KEEP_OAUTH2_PROXY_ROLE_HEADER | Header for the authenticated user's role | Yes | x-forwarded-groups | | KEEP_OAUTH2_PROXY_AUTO_CREATE_USER | Automatically create user if not exists | No | true | | KEEP_OAUTH2_PROXY_ADMIN_ROLES | Role names for admin users | No | admin | | KEEP_OAUTH2_PROXY_NOC_ROLES | Role names for NOC (Network Operations Center) users | No | noc | | KEEP_OAUTH2_PROXY_WEBHOOK_ROLES | Role names for webhook users | No | webhook | ================================================ FILE: docs/deployment/authentication/okta-auth.mdx ================================================ --- title: "Okta Authentication" --- This document provides comprehensive information about the Okta integration in Keep. 
## Overview Keep supports Okta as an authentication provider, enabling: - Single Sign-On (SSO) via Okta - OAuth2/OIDC authentication flow - JWT token verification with JWKS - Role-based access control through token claims ## Environment Variables ### Backend Environment Variables | Variable | Description | Required | |----------|-------------|----------| | `AUTH_TYPE` | Set to `"OKTA"` to enable Okta authentication | Yes | | `OKTA_DOMAIN` | Your Okta domain (e.g., `https://company.okta.com`) | Yes | | `OKTA_ISSUER` | The issuer URL for your Okta authorization server (e.g., `https://company.okta.com/oauth2/default`) | Yes | | `OKTA_CLIENT_ID` | Client ID of your Okta application | Yes | | `OKTA_CLIENT_SECRET` | Client Secret of your Okta application | Yes | | `OKTA_AUDIENCE` | Expected audience claim in the token. Falls back to `OKTA_CLIENT_ID` if not set | No | | `OKTA_JWKS_URL` | Explicit JWKS URL. If not set, derived from `OKTA_ISSUER` | No | | `OKTA_API_TOKEN` | Okta API token for management operations | No | ### Frontend Environment Variables | Variable | Description | Example | |----------|-------------|---------| | `AUTH_TYPE` | Set to `"OKTA"` to enable Okta authentication | `OKTA` | | `OKTA_ISSUER` | The issuer URL for your Okta authorization server | `https://company.okta.com/oauth2/default` | | `OKTA_CLIENT_ID` | Client ID of your Okta application | `0oa1bcdef2ghijklm3n4` | | `OKTA_CLIENT_SECRET` | Client Secret of your Okta application | `abcd1234efgh5678` | ## Okta Configuration ### Creating an Okta Application 1. Sign in to your Okta Admin Console 2. Navigate to **Applications** > **Applications** 3. Click **Create App Integration** 4. Select **OIDC - OpenID Connect** as the sign-in method 5. Select **Web Application** as the application type 6. Click **Next** ### Application Settings 1. **App integration name**: Enter a name for your application (e.g., "Keep") 2. 
**Sign-in redirect URIs**: Add your callback URL: `https://your-keep-domain.com/api/auth/callback/okta` 3. **Sign-out redirect URIs**: Add your sign-out URL: `https://your-keep-domain.com` 4. **Assignments**: Assign the application to the appropriate users or groups 5. Click **Save** 6. Copy the **Client ID** and **Client Secret** from the application settings ### Role Mapping Keep extracts the user role from the JWT token. The role is determined in the following order: 1. `keep_role` claim in the token 2. `role` claim in the token 3. First entry in the `groups` claim 4. Falls back to `user` role To configure role mapping, add a custom claim to your Okta authorization server: 1. Navigate to **Security** > **API** > **Authorization Servers** 2. Select your authorization server (e.g., `default`) 3. Go to the **Claims** tab 4. Add a claim named `keep_role` or `groups` that maps to the user's Keep role ================================================ FILE: docs/deployment/authentication/onelogin-auth.mdx ================================================ --- title: "OneLogin Authentication" --- This document provides comprehensive information about the OneLogin integration in Keep ## Overview Keep supports OneLogin as an authentication provider, enabling: - Single Sign-On (SSO) via OneLogin - OAuth2/OIDC authentication flow - Token refresh capabilities - Role-based access control through custom claims - Session management through NextAuth.js ## Environment Variables ### Backend Environment Variables | Variable | Description | Example | |----------|-------------|---------| | `AUTH_TYPE` | Set to `"ONELOGIN"` to enable OneLogin authentication | `ONELOGIN` | | `ONELOGIN_ISSUER` | The issuer URL for your OneLogin application | `https://company.onelogin.com/oidc/2` | | `ONELOGIN_CLIENT_ID` | Client ID of your OneLogin application | `abc123def456ghi789` | | `ONELOGIN_CLIENT_SECRET` | Client Secret of your OneLogin application | `abcd1234efgh5678ijkl9012` | | 
`ONELOGIN_ADMIN_ROLE` | Role to be mapped to a keep admin role | `KeepAdmin` | | `ONELOGIN_NOC_ROLE` | Role to be mapped to a keep noc role | `KeepNoc` | | `ONELOGIN_WEBHOOK_ROLE` | Role to be mapped to a keep webhook role | `KeepWebhook` | | `ONELOGIN_AUTO_CREATE_USER` | Whether to automatically create users in Keep | `True` | ### Frontend Environment Variables | Variable | Description | Example | |----------|-------------|---------| | `AUTH_TYPE` | Set to `"ONELOGIN"` to enable OneLogin authentication | `ONELOGIN` | | `ONELOGIN_ISSUER` | The issuer URL for your OneLogin application | `https://company.onelogin.com/oidc/2` | | `ONELOGIN_CLIENT_ID` | Client ID of your OneLogin application | `abc123def456ghi789` | | `ONELOGIN_CLIENT_SECRET` | Client Secret of your OneLogin application | `abcd1234efgh5678ijkl9012` | ## OneLogin Configuration ### Creating a OneLogin Application 1. Sign in to your OneLogin Admin Console 2. Navigate to **Applications** 3. Click **Add App** 4. Search for **OpenId Connect (OIDC)** and select it 5. Click **Save** ### Application Settings 1. **Display Name**: Enter a name for your application (e.g., "Keep") 2. **Redirect URIs**: Enter your app's callback URL, e.g., `https://your-keep-domain.com/api/auth/callback/onelogin` 3. **Login URL**: Enter your app's login URL, e.g., `https://your-keep-domain.com/signin` 4. **Role Mapping**: - Go to the Parameters tab - Map the groups to user roles or groups with the default value being semicolon delimited input values 5. Go to the **SSO** tab and configure: - **Application Type**: Web - **Token Endpoint**: Client Secret Post 6. **Access**: - Assign to appropriate roles or users 7. Click **Save** 8.
Copy the client id, client secret and issuer URL from the SSO tab ================================================ FILE: docs/deployment/authentication/overview.mdx ================================================ --- title: "Overview" --- For every authentication-related question or issue, please join our [Slack](https://slack.keephq.dev). Keep supports various authentication providers and architectures to accommodate different deployment strategies and security needs, from development environments to production setups. ### Authentication Providers - [**No Authentication**](/deployment/authentication/no-auth) - Quick setup for testing or internal use cases. - [**DB**](/deployment/authentication/db-auth) - Simple username/password authentication. Works well for small teams or for dev/stage environments. Users and hashed passwords are stored in the DB. - [**Auth0**](/deployment/authentication/auth0-auth) - Utilize Auth0 for scalable, auth0-based authentication. - [**Keycloak**](/deployment/authentication/keycloak-auth) - Utilize Keycloak for enterprise authentication methods such as SSO/SAML/OIDC, advanced RBAC with custom roles, resource-level permissions, and integration with user directories (LDAP). - [**AzureAD**](/deployment/authentication/azuread-auth) - Utilize Azure AD for SSO/SAML/OIDC enterprise authentication. - [**Okta**](/deployment/authentication/okta-auth) - Utilize Okta for SSO/OIDC authentication. - [**OneLogin**](/deployment/authentication/onelogin-auth) - Utilize OneLogin for SSO/OIDC authentication. Choosing the right authentication strategy depends on your specific use case, security requirements, and deployment environment. You can read more about each authentication provider.
### Authentication Features Comparison | Identity Provider | RBAC | SAML/OIDC/SSO | LDAP | Resource-based permission | User Management | Group Management | On Prem | License | |:---:|:----:|:---------:|:----:|:-------------------------:|:----------------:|:-----------------:|:-------:|:-------:| | **No Auth** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | **OSS** | | **DB** | ✅
(Predefined roles) | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | **OSS** | | **Auth0** | ✅
(Predefined roles) | ✅ | 🚧 | 🚧 | ✅ | 🚧 | ❌ | **EE** | | **Keycloak** | ✅
(Custom roles) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | **EE** | | **Oauth2Proxy** | ✅
(Predefined roles) | ✅ | ❌ | ❌ | N/A | N/A | ✅ | **OSS** | | **Azure AD** | ✅
(Predefined roles) | ✅ | ❌ | ❌ | By Azure AD | By Azure AD | ✅ | **EE** | | **Okta** | ✅
(Predefined roles) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | **OSS** | | **OneLogin** | ✅
(Predefined roles) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | **OSS** | ### How To Configure Some authentication providers require additional environment variables. These will be covered in detail on the specific authentication provider pages. The authentication scheme on Keep is controlled with environment variables both on the backend (Keep API) and the frontend (Keep UI). | Identity Provider | Environment Variable | Additional Variables Required | | ------------------------------------- | -------------------------------------------------------------- | ---------------------------- | | **No Auth** | `AUTH_TYPE=NOAUTH`| None | | **DB** | `AUTH_TYPE=DB` | `KEEP_JWT_SECRET` | | **Auth0** | `AUTH_TYPE=AUTH0` | `AUTH0_DOMAIN`, `AUTH0_CLIENT_ID`, `AUTH0_CLIENT_SECRET` | | **Keycloak** | `AUTH_TYPE=KEYCLOAK` | `KEYCLOAK_URL`, `KEYCLOAK_REALM`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_CLIENT_SECRET` | | **Oauth2Proxy** | `AUTH_TYPE=OAUTH2PROXY` | `KEEP_OAUTH2_PROXY_USER_HEADER`, `KEEP_OAUTH2_PROXY_ROLE_HEADER`, `KEEP_OAUTH2_PROXY_AUTO_CREATE_USER` | | **AzureAD** | `AUTH_TYPE=AZUREAD` | See [AzureAD Configuration](/deployment/authentication/azuread-auth) | | **Okta** | `AUTH_TYPE=OKTA` | `OKTA_DOMAIN`, `OKTA_CLIENT_ID`, `OKTA_CLIENT_SECRET` | | **OneLogin** | `AUTH_TYPE=ONELOGIN` | See [OneLogin Configuration](/deployment/authentication/onelogin-auth) | For more details on each authentication strategy, including setup instructions and implications, refer to the respective sections. ================================================ FILE: docs/deployment/configuration.mdx ================================================ --- title: "Configuration" sidebarTitle: "Configuration" --- ## Background Keep is highly configurable through environment variables. This allows you to customize various aspects of both the backend and frontend components without modifying the code.
Environment variables can be set in your deployment environment, such as in your Kubernetes configuration, Docker Compose file, or directly on your host system. ## Backend Environment Variables ### General General configuration variables control the core behavior of the Keep server. These settings determine fundamental aspects such as the server's host, port, and whether certain components like the scheduler and consumer are enabled. | Env var | Purpose | Required | Default Value | Valid options | | :----------------------------------: | :---------------------------------------------------: | :------: | :----------------------------: | :--------------------------: | | **KEEP_HOST** | Specifies the host for the Keep server | No | "0.0.0.0" | Valid hostname or IP address | | **PORT** | Specifies the port on which the backend server runs | No | 8080 | Any valid port number | | **SCHEDULER** | Enables or disables the workflow scheduler | No | "true" | "true" or "false" | | **CONSUMER** | Enables or disables the consumer | No | "true" | "true" or "false" | | **KEEP_VERSION** | Specifies the Keep version | No | "unknown" | Valid version string | | **KEEP_API_URL** | Specifies the Keep API URL | No | Constructed from HOST and PORT | Valid URL | | **KEEP_STORE_RAW_ALERTS** | Enables storing of raw alerts | No | "false" | "true" or "false" | | **TENANT_CONFIGURATION_RELOAD_TIME** | Time in minutes to reload tenant configurations | No | 5 | Positive integer | | **KEEP_LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" | ### Logging and Environment Logging and environment configuration determines how Keep generates and formats log output. These settings are crucial for debugging, monitoring, and understanding the behavior of your Keep instance in different environments. 
| Env var | Purpose | Required | Default Value | Valid options | | :------------------: | :-----------------------------------------------------: | :------: | :--------------: | :---------------------------------------------: | | **LOG_LEVEL** | Sets the logging level for the application | No | "INFO" | "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" | | **ENVIRONMENT** | Specifies the environment the application is running in | No | "production" | "development", "staging", "production" | | **LOG_FORMAT** | Specifies the log format | No | "open_telemetry" | "open_telemetry", "dev_terminal" | | **LOG_AUTH_PAYLOAD** | Enables logging of authentication payload | No | "false" | "true" or "false" | ### Database Database configuration is crucial for Keep's data persistence. Keep supports various database backends through SQLAlchemy, allowing flexibility in choosing and configuring your preferred database system. | Env var | Purpose | Required | Default Value | Valid options | | :----------------------------: | :-----------------------------------------------: | :------: | :-------------------------------: | :--------------------------------: | | **DATABASE_CONNECTION_STRING** | Specifies the database connection URL | Yes | None | Valid SQLAlchemy connection string | | **DATABASE_POOL_SIZE** | Sets the database connection pool size | No | 5 | Positive integer | | **DATABASE_MAX_OVERFLOW** | Sets the maximum overflow for the connection pool | No | 10 | Positive integer | | **DATABASE_ECHO** | Enables SQLAlchemy echo mode for debugging | No | False | Boolean (True/False) | | **DB_CONNECTION_NAME** | Specifies the Cloud SQL connection name | No | "keephq-sandbox:us-central1:keep" | Valid Cloud SQL connection string | | **DB_NAME** | Specifies the Cloud SQL database name | No | "keepdb" | Valid Cloud SQL database name | | **DB_SERVICE_ACCOUNT** | Service account for database impersonation | No | None | Valid service account email | | **DB_IP_TYPE** | Specifies the Cloud SQL 
IP type | No | "public" | "public", "private" or "psc" | | **SKIP_DB_CREATION** | Skips database creation and migrations | No | "false" | "true" or "false" | ### Resource Provisioning Resource provisioning settings control how Keep sets up initial resources. This configuration is particularly important for automating the setup process and ensuring that necessary resources are available when Keep starts. To elaborate on resource provisioning and its configuration, please see [provisioning docs](/deployment/provision/overview). | Env var | Purpose | Required | Default Value | Valid options | | :---------------------: | :---------------------------------------: | :------: | :-----------: | :---------------: | | **PROVISION_RESOURCES** | Enables or disables resource provisioning | No | "true" | "true" or "false" | ### Authentication Authentication configuration determines how Keep verifies user identities and manages access control. These settings are essential for securing your Keep instance and integrating with various authentication providers. For specific authentication type configuration, please see [authentication docs](/deployment/authentication/overview). 
| Env var | Purpose | Required | Default Value | Valid options | | :-----------------------------------: | :---------------------------------------------------------------: | :------: | :-----------: | :------------------------------------------------: | | **AUTH_TYPE** | Specifies the authentication type | No | "NOAUTH" | "AUTH0", "KEYCLOAK", "DB", "NOAUTH", "OAUTH2PROXY", "OKTA", "ONELOGIN" | | **KEEP_JWT_SECRET** | Secret key for JWT token generation and validation (DB auth only) | Yes | None | Any strong secret string | | **KEEP_DEFAULT_USERNAME** | Default username for the admin user (DB auth only) | No | "keep" | Any valid username string | | **KEEP_DEFAULT_PASSWORD** | Default password for the admin user (DB auth only) | No | "keep" | Any strong password string | | **KEEP_FORCE_RESET_DEFAULT_PASSWORD** | Forces reset of default user password | No | "false" | "true" or "false" | | **KEEP_DEFAULT_API_KEYS** | Comma-separated list of default API keys to provision | No | "" | Format: "name:role:secret,name:role:secret" | ### Service Mesh (Internal Alert Ingestion) These settings allow trusted services within the same Kubernetes cluster to POST alerts to Keep without requiring a Keep API key. This is intended for service-to-service communication where network-level authentication (e.g. Istio mTLS with AuthorizationPolicy) ensures only authorized callers can reach Keep's alert ingestion endpoints. 
| Env var | Purpose | Required | Default Value | Valid options | | :------------------------------------------: | :--------------------------------------------------------------------------: | :------: | :-----------: | :-----------------: | | **KEEP_ALLOW_MESH_ALERT_INGESTION** | Allows unauthenticated POST requests to `/alerts/event*` endpoints | No | "false" | "true" or "false" | When `KEEP_ALLOW_MESH_ALERT_INGESTION` is set to `"true"`, requests to `/alerts/event*` that do not carry an API key or bearer token are accepted and authenticated as an internal service with the `webhook` role. Calling services can optionally set the `X-Service-Name` HTTP header to identify themselves in Keep's logs and audit trail: ```bash curl -X POST http://keep-backend:8080/alerts/event \ -H "Content-Type: application/json" \ -H "X-Service-Name: my-service" \ -d '[{"id":"alert-1","name":"Example Alert","severity":"info","status":"firing","source":["my-service"]}]' ``` The authenticated entity will have: - **email**: `service:<X-Service-Name>`, i.e. `service:` followed by the value of the `X-Service-Name` header (defaults to `service:unknown` if the header is not set) - **role**: `webhook` (grants `write:alert` and `write:incident` scopes) This feature bypasses API key authentication for the alert ingestion endpoints. You **must** pair it with network-level access control (such as Istio AuthorizationPolicy) to restrict which services can reach these endpoints. Without network-level enforcement, any client that can reach Keep's backend can POST alerts. ### Secrets Management Secrets Management configuration specifies how Keep handles sensitive information. This is crucial for securely storing and accessing confidential data such as API keys and integrations credentials.
| Env var | Purpose | Required | Default Value | Valid options | | :--------------------------: | :-------------------------------------------------------------------: | :------: | :-----------: | :---------------------------: | | **SECRET_MANAGER_TYPE** | Defines the type of secret manager to use | Yes | "FILE" | "FILE", "GCP", "K8S", "VAULT", "DB" | | **SECRET_MANAGER_DIRECTORY** | Directory for storing secrets when using file-based secret management | No | "/state" | Any valid directory path | ### OpenTelemetry OpenTelemetry configuration enables comprehensive observability for Keep. These settings allow you to integrate Keep with various monitoring and tracing systems, enhancing your ability to debug and optimize performance. | Env var | Purpose | Required | Default Value | Valid options | | :-------------------------------------: | :-----------------------------------------: | :------: | :-----------: | :-----------------------: | | **OTEL_SERVICE_NAME** | OpenTelemetry service name | No | "keep-api" | Valid service name string | | **SERVICE_NAME** | Alternative for OTEL_SERVICE_NAME | No | "keep-api" | Valid service name string | | **OTEL_EXPORTER_OTLP_ENDPOINT** | OpenTelemetry collector endpoint | No | None | Valid URL | | **OTLP_ENDPOINT** | Alternative for OTEL_EXPORTER_OTLP_ENDPOINT | No | None | Valid URL | | **OTEL_EXPORTER_OTLP_TRACES_ENDPOINT** | OpenTelemetry traces endpoint | No | None | Valid URL | | **OTEL_EXPORTER_OTLP_LOGS_ENDPOINT** | OpenTelemetry logs endpoint | No | None | Valid URL | | **OTEL_EXPORTER_OTLP_METRICS_ENDPOINT** | OpenTelemetry metrics endpoint | No | None | Valid URL | | **CLOUD_TRACE_ENABLED** | Enables Google Cloud Trace exporter | No | "false" | "true" or "false" | | **METRIC_OTEL_ENABLED** | Enables OpenTelemetry metrics | No | "" | "true" or "false" | ### WebSocket Server (Pusher/Soketi) WebSocket server configuration controls real-time communication capabilities in Keep. 
These settings are important for enabling features that require instant updates and notifications. | Env var | Purpose | Required | Default Value | Valid options | | :-------------------: | :-------------------------------: | :-------------------: | :-----------: | :--------------------------: | | **PUSHER_DISABLED** | Disables Pusher integration | No | "false" | "true" or "false" | | **PUSHER_HOST** | Hostname of the Pusher server | No | None | Valid hostname or IP address | | **PUSHER_PORT** | Port of the Pusher server | No | None | Any valid port number | | **PUSHER_APP_ID** | Pusher application ID | Yes (if using Pusher) | None | Valid Pusher App ID | | **PUSHER_APP_KEY** | Pusher application key | Yes (if using Pusher) | None | Valid Pusher App Key | | **PUSHER_APP_SECRET** | Pusher application secret | Yes (if using Pusher) | None | Valid Pusher App Secret | | **PUSHER_USE_SSL** | Enables SSL for Pusher connection | No | False | Boolean (True/False) | | **PUSHER_CLUSTER** | Pusher cluster | No | None | Valid Pusher cluster name | ### OpenAI OpenAI configuration is used for integrating with OpenAI services. These settings are important if you're utilizing OpenAI capabilities within Keep for tasks such as natural language processing or AI-assisted operations. | Env var | Purpose | Required | Default Value | Valid options | Backend/Frontend | | :-------------------------: | :------------------------------------------------: | :------: | :-----------------: | :----------------------------------------------------------: | :--------------: | | **OPENAI_API_KEY** | API key for OpenAI services | No | None | Valid OpenAI API key | Both | | **OPENAI_MODEL_NAME** | Model name to use for OpenAI requests | No | "gpt-4o-2024-08-06" | Valid OpenAI model name (e.g., "gpt-4o", "gpt-4o-mini", ...) 
| Both | | **OPEN_AI_ORGANIZATION_ID** | Organization ID for OpenAI services | No | None | Valid OpenAI organization ID | Both | | **OPENAI_BASE_URL** | Base URL for OpenAI API (useful for LiteLLM proxy) | No | None | Valid URL (e.g., "http://localhost:4000") | Both | For the various LLM-based features, these environment variables must also be set for Keep's frontend. ### Posthog Posthog configuration controls Keep's integration with the Posthog analytics platform. These settings are useful for tracking usage patterns and gathering insights about how your Keep instance is being used. | Env var | Purpose | Required | Default Value | Valid options | | :------------------: | :---------------------------: | :------: | :-----------------------------------------------: | :-------------------: | | **POSTHOG_API_KEY** | API key for PostHog analytics | No | "phc_muk9qE3TfZsX3SZ9XxX52kCGJBclrjhkP9JxAQcm1PZ" | Valid PostHog API key | | **POSTHOG_DISABLED** | Disables PostHog integration | No | "false" | "true" or "false" | ### Sentry Sentry configuration controls Keep's integration with Sentry for error monitoring and reporting. These settings are important for maintaining the stability and reliability of your Keep instance. | Env var | Purpose | Required | Default Value | Valid options | | :-----------------: | :-------------------------: | :------: | :-----------: | :---------------: | | **SENTRY_DISABLED** | Disables Sentry integration | No | "false" | "true" or "false" | ### Ngrok Ngrok configuration enables secure tunneling to your Keep instance. These settings are particularly useful for development or when you need to expose your local Keep instance to the internet securely.
| Env var | Purpose | Required | Default Value | Valid options | | :------------------: | :----------------------------: | :------: | :-----------: | :--------------------: | | **USE_NGROK** | Enables ngrok for tunneling | No | "false" | "true" or "false" | | **NGROK_AUTH_TOKEN** | Authentication token for ngrok | No | None | Valid ngrok auth token | | **NGROK_DOMAIN** | Custom domain for ngrok | No | None | Valid domain name | ### Elasticsearch Elasticsearch configuration controls Keep's integration with Elasticsearch for advanced search capabilities. These settings are important if you're using Elasticsearch to enhance Keep's search functionality and performance. | Env var | Purpose | Required | Default Value | Valid options | | :----------------------: | :-----------------------------------------: | :--------------------------: | :-----------: | :---------------------------: | | **ELASTIC_ENABLED** | Enables Elasticsearch integration | No | "false" | "true" or "false" | | **ELASTIC_API_KEY** | API key for Elasticsearch | Yes (if using Elasticsearch) | None | Valid Elasticsearch API key | | **ELASTIC_HOSTS** | Comma-separated list of Elasticsearch hosts | Yes (if using Elasticsearch) | None | Valid Elasticsearch host URLs | | **ELASTIC_USER** | Username for Elasticsearch basic auth | No | None | Valid username | | **ELASTIC_PASSWORD** | Password for Elasticsearch basic auth | No | None | Valid password | | **ELASTIC_INDEX_SUFFIX** | Suffix for Elasticsearch index names | Yes (for single tenant) | None | Any valid string | ### Redis Redis configuration specifies the connection details for Keep's Redis instance. Redis is used for various caching and queueing purposes, making these settings important for optimizing Keep's performance and scalability. 
| Env var | Purpose | Required | Default Value | Valid options | | :----------------: | :-------------------: | :------: | :-----------: | :--------------------------: | | **REDIS** | Redis enabled | No | false | true or false | | **REDIS_HOST** | Redis server hostname | No | "localhost" | Valid hostname or IP address | | **REDIS_PORT** | Redis server port | No | 6379 | Valid port number | | **REDIS_USERNAME** | Redis username | No | None | Valid username string | | **REDIS_PASSWORD** | Redis password | No | None | Valid password string | ### Redis Sentinel Redis sentinel configuration specifies the connection details for Keep's Redis sentinel instance. Redis sentinel is used when you have a redis cluster and it acts as a broker. | Env var | Purpose | Required | Default Value | Valid options | | :---------------------------------: | :----------------------: | :------: | :-----------------: | :-----------------------------------------: | | **REDIS** | Redis enabled | No | false | true or false | | **REDIS_SENTINEL_HOSTS** | Redis sentinel server(s) | No | "localhost:26379" | "host1:port1,host2:port2" (comma-separated) | | **REDIS_SENTINEL_SERVICE_NAME** | Redis sentinel service name | No | "mymaster" | Valid service name string | | **REDIS_USERNAME** | Redis username | No | None | Valid username string | | **REDIS_PASSWORD** | Redis password | No | None | Valid password string | ### ARQ ARQ (Asynchronous Task Queue) configuration controls Keep's background task processing. These settings are crucial for managing how Keep handles long-running or scheduled tasks, ensuring efficient resource utilization and responsiveness. 
| Env var | Purpose | Required | Default Value | Valid options | | :--------------------------: | :-------------------------------------------------: | :------: | :-----------: | :------------------: | | **ARQ_BACKGROUND_FUNCTIONS** | Comma-separated list of background functions to run | No | None | Valid function names | | **ARQ_KEEP_RESULT** | Duration to keep job results (in seconds) | No | 3600 | Positive integer | | **ARQ_EXPIRES** | Default job expiration time (in seconds) | No | 3600 | Positive integer | | **ARQ_EXPIRES_AI** | AI job expiration time (in seconds) | No | 3600000 | Positive integer | ### Rate Limiting Rate limiting configuration controls how many requests can be made to Keep's API endpoints within a specified time period. This helps prevent abuse and ensures system stability. | Env var | Purpose | Required | Default Value | Valid options | | :------------------------: | :-----------------------------------: | :------: | :-----------: | :-----------------------------------------------------------------------------------: | | **KEEP_USE_LIMITER** | Enables or disables rate limiting | No | "false" | "true" or "false" | | **KEEP_LIMIT_CONCURRENCY** | Sets the rate limit for API endpoints | No | "100/minute" | Format: "{number}/{interval}" where interval can be "second", "minute", "hour", "day" | Currently, rate limiting is applied to the following endpoints: - POST `/alerts/event` - Generic event ingestion endpoint - POST `/alerts/{provider_type}` - Provider-specific event ingestion endpoints These endpoints are rate-limited according to the `KEEP_LIMIT_CONCURRENCY` setting when `KEEP_USE_LIMITER` is enabled. ### Maintenance Windows The maintenance window strategy controls how alerts are handled when they match a Maintenance Window rule.
| Env var | Purpose | Required | Default Value | Valid options | | :------------------------------: | :-----------------------------------------: | :---------------------: | :-----------: | :-----------------------------------------------: | | **MAINTENANCE_WINDOW_STRATEGY** | Choose the strategy | No | "default" | "default" or "recover_previous_status" | | **WATCHER_LAPSED_TIME** | Time in seconds to execute the alert review | No | 60 | Valid positive integer | ## Frontend Environment Variables Frontend configuration variables control the behavior and features of Keep's user interface. These settings are crucial for customizing the frontend's appearance, functionality, and integration with the backend services. ### General | Env var | Purpose | Required | Default Value | Valid options | | ---------------------------------- | ------------------------------------------------------------------- | -------- | ------------- | --------------- | | **API_URL** | Specifies the URL of the Keep backend API | Yes | None | Valid URL | | **AUTH_SESSION_TIMEOUT** | Specifies user session timeout in seconds. Default is 30 days. 
| No | 2592000 | Value in seconds| | **KEEP_HIDE_SENSITIVE_FIELDS** | Hides sensitive fields | No | None | "true", "false" | | **HIDE_NAVBAR_CORRELATION** | Hides the correlation page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_WORKFLOWS** | Hides the workflows page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_SERVICE_TOPOLOGY** | Hides the service topology page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_MAPPING** | Hides the mapping page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_EXTRACTION** | Hides the extraction page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_MAINTENANCE_WINDOW** | Hides the maintenance window page from the navigation bar in the UI | No | None | "true" | | **HIDE_NAVBAR_AI_PLUGINS** | Hides the AI plugins page from the navigation bar in the UI | No | None | "true" | | **KEEP_WF_LIST_EXTENDED_INFO** | Use a list instead of a button to show the complete execution list | No | "true" | "true", "false" | ### Authentication Authentication configuration determines how Keep verifies user identities and manages access control. These settings are essential for securing your Keep instance and integrating with various authentication providers.
| Env var | Purpose | Required | Default Value | Valid options | | :-----------------: | :-------------------------------: | :------: | :-----------: | :------------------------------------------------: | | **AUTH_TYPE** | Specifies the authentication type | No | "NOAUTH" | "AUTH0", "KEYCLOAK", "DB", "NOAUTH", "OAUTH2PROXY", "OKTA", "ONELOGIN" | | **NEXTAUTH_URL** | URL for NextAuth authentication | Yes | None | Valid URL | | **NEXTAUTH_SECRET** | Secret key for NextAuth | Yes | None | Strong secret string | ### Posthog | Env var | Purpose | Required | Default Value | Valid options | | :--------------: | :------------------------------------: | :------: | :-----------: | :-------------------: | | **POSTHOG_KEY** | PostHog API key for frontend analytics | No | None | Valid PostHog API key | | **POSTHOG_HOST** | PostHog Host for frontend analytics | No | None | Valid PostHog Host | ### Pusher Pusher configuration is essential for enabling real-time updates and communication in Keep's frontend. These settings allow the frontend to establish a WebSocket connection with the Pusher server, facilitating instant updates and notifications. 
| Env var | Purpose | Required | Default Value | Valid options | | :-----------------: | :---------------------------: | :---------------------: | :-----------: | :--------------------------: | | **PUSHER_DISABLED** | Disables Pusher integration | No | "false" | "true" or "false" | | **PUSHER_HOST** | Hostname of the Pusher server | No | "localhost" | Valid hostname or IP address | | **PUSHER_PORT** | Port of the Pusher server | No | 6001 | Valid port number | | **PUSHER_APP_KEY** | Pusher application key | Yes (if Pusher enabled) | "keepappkey" | Valid Pusher App Key | | **PUSHER_CLUSTER** | Pusher cluster | No | None | Valid Pusher cluster name | ================================================ FILE: docs/deployment/docker.mdx ================================================ --- title: "Docker" sidebarTitle: "Docker" --- ### Spin up Keep with docker-compose latest images The easiest way to start Keep is with docker-compose: ```shell curl https://raw.githubusercontent.com/keephq/keep/main/start.sh | sh ``` ```bash start.sh #!/bin/bash # Keep install script for docker compose echo "Creating state directory." mkdir -p state test -e state echo "Changing directory ownership to non-privileged user." chown -R 999:999 state || echo "Unable to change directory ownership, changing permissions instead." && chmod -R 0777 state which curl &> /dev/null || echo "curl not installed" curl https://raw.githubusercontent.com/keephq/keep/main/docker-compose.yml --output docker-compose.yml curl https://raw.githubusercontent.com/keephq/keep/main/docker-compose.common.yml --output docker-compose.common.yml docker compose up -d ``` The docker-compose.yml contains 3 services: - [keep-backend](https://console.cloud.google.com/artifacts/docker/keephq/us-central1/keep/keep-api?project=keephq) - a fastapi service that serves as the API server.
- [keep-frontend](https://console.cloud.google.com/artifacts/docker/keephq/us-central1/keep/keep-ui?project=keephq) - a nextjs app that serves as Keep UI interface. - [keep-websocket-server](https://docs.soketi.app/getting-started/installation/docker) - Soketi (a pusher compatible websocket server) for real time alerting. ### Reinstall Keep with the option to refresh from scratch `Caution:` These steps reinstall Keep from scratch: all of Keep's data and settings will be erased. Because `docker system prune -a --volumes` is used, other stopped containers and unused images, networks, and volumes on this host are removed as well. Please consider this carefully before using the steps below. These steps are for cases where you need to test many different options, or simply want to reinstall Keep from scratch using docker compose without spending a lot of time, that is, without repeating the earlier steps of installing docker, downloading the installer, and running the commands from the previous instructions. Follow these steps: #### Step 1: Stop, Clear container, network, volume, image. In the directory containing the docker compose file you downloaded (for example, `/root/`), run: ``` docker-compose down docker-compose down --rmi all docker-compose down -v docker system prune -a --volumes ``` #### Step 2: Clear Config db, config file in state folder. ``` rm -rf state/* ``` #### Step 3: Run again ``` docker compose up -d ``` ================================================ FILE: docs/deployment/ecs.mdx ================================================ --- title: "AWS ECS" sidebarTitle: "AWS ECS" --- ## Step 1: Login to AWS Console - Open your web browser and navigate to the AWS Management Console. - Log in using your AWS account credentials. ## Step 2: Navigate to ECS - Click on the "Services" dropdown menu in the top left corner. - Select "ECS" from the list of services. ## Step 3: Create 3 Task Definitions - In the ECS dashboard, navigate to the "Task Definitions" section in the left sidebar. Task Definition - Click on "Create new Task Definition".
![Create new task definition](/images/ecs-task-def-create-new.png) ### Task Definition 1 (Frontend - KeepUI): - Task Definition Family: keep-frontend ![Task Definition Family](/images/ecs-task-def-frontend1.png) - Configure your container definitions as below: - Infrastructure Requirements: - Launch Type: AWS Fargate - OS, Architecture, Network mode: Linux/X86_64 - Task Size: - CPU: 1 vCPU - Memory: 2 GB - Task Role and Task Execution Role are optional. If you plan on using Secrets Manager, for example, create a task execution role that allows access to the secret you created. ![Infrastructure Requirements](/images/ecs-task-def-frontend2.png) - Container Details: - Name: keep-frontend - Image URI: us-central1-docker.pkg.dev/keephq/keep/keep-ui:latest - Ports Mapping: - Container Port: 3000 - Protocol: TCP ![Container Details](/images/ecs-task-def-frontend3.png) - Environment Variables: (This can be static or you can use parameter store or secrets manager) - DATABASE_CONNECTION_STRING - AUTH_TYPE - KEEP_JWT_SECRET - KEEP_DEFAULT_USERNAME - KEEP_DEFAULT_PASSWORD - SECRET_MANAGER_TYPE - SECRET_MANAGER_DIRECTORY - USE_NGROK - KEEP_API_URL (Set the variable below only if you don't want to use websocket) - PUSHER_DISABLED (Set the variables below only if you want to use websocket) - PUSHER_APP_ID - PUSHER_APP_KEY - PUSHER_APP_SECRET - PUSHER_HOST - PUSHER_PORT ![Environment Variables](/images/ecs-task-def-frontend4.png) - Review and create your task definition.
### Task Definition 2 (Backend - keepAPI): - Configure your container definitions as below: - Task Definition Family: keep-backend ![Task Definition Family](/images/ecs-task-def-backend1.png) - Infrastructure Requirements: - Launch Type: AWS Fargate - OS, Architecture, Network mode: Linux/X86_64 - Task Size: - CPU: 1 vCPU - Memory: 2 GB - Task Role and Task Execution Role are optional. If you plan on using Secrets Manager, for example, create a task execution role that allows access to the secret you created. ![Infrastructure Requirements](/images/ecs-task-def-backend2.png) - Container Details: - Name: keep-backend - Image URI: us-central1-docker.pkg.dev/keephq/keep/keep-api:latest - Ports Mapping: - Container Port: 8080 - Protocol: TCP ![Container Details](/images/ecs-task-def-backend3.png) - Environment Variables: (This can be static or you can use parameter store or secrets manager) - DATABASE_CONNECTION_STRING - AUTH_TYPE - KEEP_JWT_SECRET - KEEP_DEFAULT_USERNAME - KEEP_DEFAULT_PASSWORD - SECRET_MANAGER_TYPE - SECRET_MANAGER_DIRECTORY - USE_NGROK - KEEP_API_URL (Set the variable below only if you don't want to use websocket) - PUSHER_DISABLED (Set the variables below only if you want to use websocket) - PUSHER_APP_ID - PUSHER_APP_KEY - PUSHER_APP_SECRET - PUSHER_HOST - PUSHER_PORT ![Environment Variables](/images/ecs-task-def-backend4.png) - Storage: - Volume Name: keep-efs - Configuration Type: Configure at task definition creation - Volume type: EFS - Storage configurations: - File system ID: Select an existing EFS filesystem or create a new one - Root Directory: / ![Volume Configuration](/images/ecs-task-def-backend5.png) - Container mount points: - Container: select the container you just created - Source volume: keep-efs - Container path: /app - Make sure that Readonly is not selected ![Container Mount](/images/ecs-task-def-backend6.png) - Review and create your task definition.
### Task Definition 3 (Websocket): (This step is optional; it is only needed if you want automatic refresh of the alerts feed) - Configure your container definitions as below: - Task Definition Family: keep-websocket ![Task Definition Family](/images/ecs-task-def-websocket1.png) - Infrastructure Requirements: - Launch Type: AWS Fargate - OS, Architecture, Network mode: Linux/X86_64 - Task Size: - CPU: 0.25 vCPU - Memory: 1 GB - Task Role and Task Execution Role are optional. If you plan on using Secrets Manager, for example, create a task execution role that allows access to the secret you created. ![Infrastructure Requirements](/images/ecs-task-def-websocket2.png) - Container Details: - Name: keep-websocket - Image URI: quay.io/soketi/soketi:1.4-16-debian - Ports Mapping: - Container Port: 6001 - Protocol: TCP ![Container Details](/images/ecs-task-def-websocket3.png) - Environment Variables: (This can be static or you can use parameter store or secrets manager) - SOKETI_DEBUG - SOKETI_DEFAULT_APP_ID - SOKETI_DEFAULT_APP_KEY - SOKETI_DEFAULT_APP_SECRET - SOKETI_USER_AUTHENTICATION_TIMEOUT ![Environment Variables](/images/ecs-task-def-websocket4.png) - Review and create your task definition. ## Step 4: Create Keep Service - In the ECS dashboard, navigate to the "Clusters" section in the left sidebar. - Select the cluster you want to deploy your service to. - Click on the "Create" button next to "Services". - Configure your service settings. - Review and create your service. ## Step 5: Monitor Your Service - Once your service is created, monitor its status in the ECS dashboard. - You can view task status, service events, and other metrics to ensure your service is running correctly. ================================================ FILE: docs/deployment/kubernetes/architecture.mdx ================================================ --- title: "Architecture" sidebarTitle: "Architecture" --- ## High Level Architecture Keep's architecture is composed of four main components: 1.
**Keep API** - A FastAPI-based backend server that handles business logic and API endpoints. 2. **Keep Frontend** - A Next.js-based frontend interface for user interaction. 3. **Websocket Server** - A Soketi server for real-time updates without page refreshes. 4. **Database Server** - A database used to store and manage persistent data. Supported databases include SQLite, PostgreSQL, MySQL, and SQL Server. ## Kubernetes Architecture Keep uses a single unified NGINX ingress controller to route traffic to all components (frontend, backend, and websocket). The ingress handles path-based routing: By default: - `/` routed to **Frontend** (configurable via `global.ingress.frontendPrefix`) - `/v2` routed to **Backend** (configurable via `global.ingress.backendPrefix`) - `/websocket` routed to **WebSocket** (configurable via `global.ingress.websocketPrefix`) ### General Components Keep uses kubernetes secret manager to store secrets such as integrations credentials. | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | ServiceAccount | Provides an identity for processes that run in a Pod. 
Used mainly for Keep API to access kubernetes secret manager | Required | [serviceaccount.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/serviceaccount.yaml) | | Role | Defines permissions for the ServiceAccount to manage secrets | Required | [role-secret-manager.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/role-secret-manager.yaml) | | RoleBinding | Associates the Role with the ServiceAccount | Required | [role-binding-secret-manager.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/role-binding-secret-manager.yaml) | | Secret Deletion Job | Cleans up Keep-related secrets when the Helm release is deleted | Required | [delete-secret-job.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/delete-secret-job.yaml) | ### Ingress Component | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | Shared NGINX Ingress | Routes all external traffic via one entry point | Optional | [nginx-ingress.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/nginx-ingress.yaml) | ### Frontend Components | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | Frontend Deployment | Manages the frontend application containers | Required | [frontend.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/frontend.yaml) | | Frontend Service | Exposes the frontend deployment within the cluster | Required | [frontend-service.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/frontend-service.yaml) | | Frontend Route (OpenShift) | Exposes the frontend service to external traffic on OpenShift | Optional | [frontend-route.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/frontend-route.yaml) | | Frontend HorizontalPodAutoscaler | Automatically scales 
the number of frontend pods | Optional | [frontend-hpa.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/frontend-hpa.yaml) | #### Backend Components | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | Backend Deployment | Manages the backend application containers | Required (if backend enabled) | [backend.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/backend.yaml) | | Backend Service | Exposes the backend deployment within the cluster | Required (if backend enabled) | [backend-service.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/backend-service.yaml) | | Backend Route (OpenShift) | Exposes the backend service to external traffic on OpenShift | Optional | [backend-route.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/backend-route.yaml) | | Backend HorizontalPodAutoscaler | Automatically scales the number of backend pods | Optional | [backend-hpa.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/backend-hpa.yaml) | #### Database Components Database components are optional. You can spin up Keep with your own database. | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | Database Deployment | Manages the database containers (e.g. 
MySQL or Postgres) | Optional | [db.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/db.yaml) | | Database Service | Exposes the database deployment within the cluster | Required (if deployment enabled) | [db-service.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/db-service.yaml) | | Database PersistentVolume | Provides persistent storage for the database | Optional | [db-pv.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/db-pv.yaml) | | Database PersistentVolumeClaim | Claims the persistent storage for the database | Optional | [db-pvc.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/db-pvc.yaml) | #### WebSocket Components WebSocket components are optional. You can spin up Keep with your own *Pusher compatible* WebSocket server. | Kubernetes Resource | Purpose | Required/Optional | Source | |:-------------------:|:-------:|:-----------------:|:------:| | WebSocket Deployment | Manages the WebSocket server containers (Soketi) | Optional | [websocket-server.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/websocket-server.yaml) | | WebSocket Service | Exposes the WebSocket deployment within the cluster | Required (if WebSocket enabled) | [websocket-server-service.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/websocket-server-service.yaml) | | WebSocket Route (OpenShift) | Exposes the WebSocket service to external traffic on OpenShift | Optional | [websocket-server-route.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/websocket-server-route.yaml) | | WebSocket HorizontalPodAutoscaler | Automatically scales the number of WebSocket server pods | Optional | [websocket-server-hpa.yaml](https://github.com/keephq/helm-charts/blob/main/charts/keep/templates/websocket-server-hpa.yaml) | These tables provide a comprehensive overview of the Kubernetes resources used in the Keep 
architecture, organized by component type. Each table describes the purpose of each resource, indicates whether it's required or optional, and provides a direct link to the source template in the Keep Helm charts GitHub repository. ### Kubernetes Configuration This section covers only Kubernetes-specific configuration. To learn about Keep-specific configuration, controlled by environment variables, see [Keep Configuration](/deployment/configuration). Each of these components can be customized via the `values.yaml` file in the Helm chart. Below are key configurations that can be adjusted for each component. #### 1. Frontend Configuration ```yaml frontend: enabled: true # Enable or disable the frontend deployment. replicaCount: 1 # Number of frontend replicas. image: repository: us-central1-docker.pkg.dev/keephq/keep/keep-ui pullPolicy: Always # Image pull policy (Always, IfNotPresent). tag: latest serviceAccount: create: true # Create a new service account. name: "" # Service account name (empty for default). podAnnotations: {} # Annotations for frontend pods. podSecurityContext: {} # Security context for the frontend pods. securityContext: {} # Security context for the containers. service: type: ClusterIP # Service type (ClusterIP, NodePort, LoadBalancer). port: 3000 # Port on which the frontend service is exposed. ``` #### 2. Backend Configuration ```yaml backend: enabled: true # Enable or disable the backend deployment. replicaCount: 1 # Number of backend replicas. image: repository: us-central1-docker.pkg.dev/keephq/keep/keep-api pullPolicy: Always # Image pull policy (Always, IfNotPresent). serviceAccount: create: true # Create a new service account. name: "" # Service account name (empty for default). podAnnotations: {} # Annotations for backend pods. podSecurityContext: {} # Security context for backend pods. securityContext: {} # Security context for containers. service: type: ClusterIP # Service type (ClusterIP, NodePort, LoadBalancer). 
port: 8080 # Port on which the backend API is exposed. ``` #### 3. WebSocket Server Configuration Keep uses Soketi as its websocket server. To learn how to configure it, please see [Soketi docs](https://github.com/soketi/charts/tree/master/charts/soketi). #### 4. Database Configuration Keep supports plenty of databases (e.g. postgresql, mysql, sqlite, etc.). It is out of scope to describe here how to deploy all of them to k8s. If you have specific questions - [contact us](https://slack.keephq.dev) and we will be happy to help. ================================================ FILE: docs/deployment/kubernetes/installation.mdx ================================================ --- title: "Installation" sidebarTitle: "Installation" --- The recommended way to install Keep on Kubernetes is via Helm Chart.

Follow these steps to set it up.
# Prerequisites ## Helm CLI See the [Helm documentation](https://helm.sh/docs/intro/install/) for instructions about installing helm. ## Ingress Controller (Optional) You can skip this step if: 1. You already have **ingress-nginx** installed. 2. You don't need to expose Keep to the internet/network. ### Overview An ingress controller is essential for managing external access to services in your Kubernetes cluster. It acts as a smart router and load balancer, allowing you to expose multiple services through a single entry point while handling SSL termination and routing rules. **Keep works best with both** [ingress-nginx](https://github.com/kubernetes/ingress-nginx) **and** [HAProxy Ingress](https://haproxy-ingress.github.io/) **controllers, but you can customize the helm chart for other ingress controllers too.** ### Nginx Ingress Controller #### Check ingress-nginx Installed You can check whether you already have ingress-nginx installed: ```bash # By default, the ingress-nginx will be installed under the ingress-nginx namespace kubectl -n ingress-nginx get pods NAME READY STATUS RESTARTS AGE ingress-nginx-controller-d49697d5f-hjhbj 1/1 Running 0 4h19m # Or check for the ingress class kubectl get ingressclass NAME CONTROLLER PARAMETERS AGE nginx k8s.io/ingress-nginx 4h19m ``` #### Install ingress-nginx To read about more installation options, see [ingress-nginx installation docs](https://kubernetes.github.io/ingress-nginx/deploy/). 
Since ingress-nginx 4.12, you'll need to add ``` --set controller.config.annotations-risk-level=Critical ``` See https://github.com/kubernetes/ingress-nginx/issues/12618#issuecomment-2566084202 ```bash # simplest way to install # we set snippet-annotations to true to allow rewrites # see https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#allow-snippet-annotations helm upgrade --install ingress-nginx ingress-nginx \ --repo https://kubernetes.github.io/ingress-nginx \ --set controller.config.allow-snippet-annotations=true \ --set controller.config.annotations-risk-level=Critical \ --namespace ingress-nginx --create-namespace ``` Verify installation: ```bash kubectl get ingressclass NAME CONTROLLER PARAMETERS AGE nginx k8s.io/ingress-nginx 4h19m ``` Verify if snippet annotations are enabled: ```bash kubectl get configmap -n ingress-nginx ingress-nginx-controller -o yaml | grep allow-snippet-annotations allow-snippet-annotations: "true" ``` ### HAProxy Ingress Controller #### Install ingress-haproxy To read about more installation options, see [haproxy-ingress installation docs](https://haproxy-ingress.github.io/docs/getting-started/). 
```bash # simplest way to install helm upgrade --install haproxy-ingress haproxy-ingress \ --repo https://haproxy-ingress.github.io/charts \ --namespace ingress-haproxy --create-namespace ``` Verify installation: ```bash kubectl get ingressclass NAME CONTROLLER PARAMETERS AGE haproxy haproxy-ingress.github.io/controller 4h19m ``` Verify if controller is running: ```bash kubectl get pods -n ingress-haproxy -l app.kubernetes.io/instance=haproxy-ingress NAME READY STATUS RESTARTS AGE haproxy-ingress-controller-x4n2z 1/1 Running 0 4h19m ``` ## Installation ### With Ingress-NGINX (Recommended) ```bash # Add the Helm repository helm repo add keephq https://keephq.github.io/helm-charts # Install Keep with ingress enabled helm install keep keephq/keep -n keep --create-namespace ``` ### With Ingress-HAProxy (Recommended) ```bash # Add the Helm repository helm repo add keephq https://keephq.github.io/helm-charts # Install Keep with ingress enabled helm install keep keephq/keep -n keep --create-namespace --set global.ingress.className=haproxy ``` ### Without Ingress (Not Recommended) ```bash # Add the Helm repository helm repo add keephq https://keephq.github.io/helm-charts # Install Keep without ingress enabled. # You won't be able to access Keep from the network. helm install keep keephq/keep -n keep --create-namespace \ --set global.ingress.enabled=false ``` ## Accessing Keep ### Ingress If you installed Keep with ingress, you should be able to access Keep. 
```bash kubectl -n keep get ingress NAME CLASS HOSTS ADDRESS PORTS AGE keep-ingress nginx * X.X.X.X 80 4h16m ``` Keep is available at http://X.X.X.X :) ### Without Ingress (Port-Forwarding) Use the following commands to access Keep locally without ingress: ```bash # Forward the UI kubectl port-forward svc/keep-frontend 3000:3000 -n keep & # Forward the Backend kubectl port-forward svc/keep-backend 8080:8080 -n keep & # Forward WebSocket server (optional) kubectl port-forward svc/keep-websocket 6001:6001 -n keep & ``` Keep is available at http://localhost:3000 :) ## Configuring HTTPS ### Prerequisites 1. Domain Name: Example - keep.yourcompany.com 2. TLS Certificate: Private key (tls.key) and certificate (tls.crt) ### Create the TLS Secret Assuming: - `tls.crt` contains the certificate. - `tls.key` contains the private key. ```bash # create the secret with kubectl kubectl create secret tls keep-tls --cert=./tls.crt --key=./tls.key -n keep ``` ### Update Helm Values for TLS ```bash helm upgrade -n keep keep keephq/keep \ --set "global.ingress.hosts[0].host=keep.example.com" \ --set "global.ingress.tls[0].hosts[0]=keep.example.com" \ --set "global.ingress.tls[0].secretName=keep-tls" ``` Alternatively, update your `values.yaml`: ```bash ... global: ingress: hosts: - host: keep.example.com tls: - hosts: - keep.example.com secretName: keep-tls ... ``` ## Uninstallation To remove Keep and clean up: ```bash helm uninstall keep -n keep kubectl delete namespace keep ``` ================================================ FILE: docs/deployment/kubernetes/openshift.mdx ================================================ --- title: "Openshift" sidebarTitle: "Openshift" --- Keep's Helm Chart also supports Openshift installation. 
Simply follow the Kubernetes set-up guide, but make sure to modify the following lines under frontend(/backend).route in the values.yaml file as follows: ``` enabled: true host: path: # should be / for default tls: wildcardPolicy: ``` ================================================ FILE: docs/deployment/kubernetes/overview.mdx ================================================ --- title: "Overview" sidebarTitle: "Overview" --- If you need help deploying Keep on Kubernetes or have any feedback or suggestions, feel free to open a ticket in our [GitHub repo](https://github.com/keephq/keep) or say hello in our [Slack](https://slack.keephq.dev). Keep is designed as a Kubernetes-native application. We maintain an opinionated, batteries-included Helm chart, but you can customize it as needed. ## Next steps - Install Keep on [Kubernetes](/deployment/kubernetes/installation). - Keep's [Helm Chart](https://github.com/keephq/helm-charts). - Keep with [Kubernetes Secret Manager](/deployment/secret-store#kubernetes-secret-manager). - Deep dive into Keep's Kubernetes [Architecture](/deployment/kubernetes/architecture). - Install Keep on [OpenShift](/deployment/kubernetes/openshift). ================================================ FILE: docs/deployment/local-llm/keep-with-litellm.mdx ================================================ --- title: "Running Keep with LiteLLM" --- This guide is for users who want to run Keep with locally hosted LLM models. If you encounter any issues, please talk to us in our [Slack community](https://slack.keephq.dev). ## Overview This guide will help you set up Keep with LiteLLM, a versatile tool that supports over 100 LLM providers. LiteLLM acts as a proxy that adheres to OpenAI standards, allowing seamless integration with Keep. By following this guide, you can easily configure Keep to work with various LLM providers using LiteLLM. 
### Motivation Incorporating LiteLLM with Keep allows organizations to run local models in on-premises and air-gapped environments. This setup is particularly beneficial for leveraging AIOps capabilities while ensuring that sensitive data does not leave the premises. By using LiteLLM as a proxy, you can seamlessly integrate with Keep and access a wide range of LLM providers without compromising data security. This approach is ideal for organizations that prioritize data privacy and need to comply with strict regulatory requirements. ## Prerequisites ### Running LiteLLM locally 1. Ensure you have Python and pip installed on your system. 2. Install LiteLLM by running the following command: ```bash pip install litellm ``` 3. Start LiteLLM with your desired model. For example, to use the HuggingFace model: ```bash litellm --model huggingface/bigcode/starcoder ``` This will start the proxy server on `http://0.0.0.0:4000`. ### Running LiteLLM with Docker To run LiteLLM using Docker, you can use the following command: ```bash docker run -p 4000:4000 litellm/litellm --model huggingface/bigcode/starcoder ``` This command will start the LiteLLM proxy in a Docker container, exposing it on port 4000. ## Configuration | Env var | Purpose | Required | Default Value | Valid options | | :-------------------------: | :-----------------------------------------: | :------: | :-----------: | :---------------------------------------: | | **OPEN_AI_ORGANIZATION_ID** | Organization ID for OpenAI/LiteLLM services | Yes | None | Valid organization ID string | | **OPEN_AI_API_KEY** | API key for OpenAI/LiteLLM services | Yes | None | Valid API key string | | **OPENAI_BASE_URL** | Base URL for the LiteLLM proxy | Yes | None | Valid URL (e.g., "http://localhost:4000") | These environment variables should be set on both Keep **frontend** and **backend**. 
## Additional Resources - [LiteLLM Documentation](https://docs.litellm.ai/) By following these steps, you can leverage the power of multiple LLM providers with Keep, using LiteLLM as a flexible and powerful proxy. ================================================ FILE: docs/deployment/monitoring.mdx ================================================ --- title: "Monitoring" sidebarTitle: "Monitoring" --- # Healthchecks Keep's Backend healthcheck url: ``` {BACKEND_API_URL}/healthcheck ``` Keep's Frontend healthcheck url: ``` {FRONTEND_URL}/api/healthcheck ``` # Prometheus Metrics (TBD) > Please note that /api/metrics are not designed for production instance's health monitoring, but for usage monitoring by a specific tenant. ================================================ FILE: docs/deployment/provision/dashboard.mdx ================================================ --- title: "Dashboard Provisioning" --- Provisioning dashboards in Keep allows you to configure and manage visual representations of your data. This section will guide you through the steps required to set up and provision dashboards. ### Dashboard Provisioning Overview Dashboards in Keep are configured using JSON strings that define the layout, data sources, and visual components. These configurations can be managed through environment variables or configuration files. 
### Environment Variables To provision dashboards, you need to set the following environment variable: | Environment Variable | Purpose | | -------------------- | ----------------------------------------------- | | `KEEP_DASHBOARDS` | JSON string containing dashboard configurations | ### Example Configuration Here is an example of how to set the `KEEP_DASHBOARDS` environment variable (dumped from the database): ```json [ { "dashboard_name": "My Dashboard", "dashboard_config": { "layout": [ { "i": "w-1728223503577", "x": 0, "y": 0, "w": 3, "h": 3, "minW": 2, "minH": 2, "static": false } ], "widget_data": [ { "i": "w-1728223503577", "x": 0, "y": 0, "w": 3, "h": 3, "minW": 2, "minH": 2, "static": false, "thresholds": [ { "value": 0, "color": "#22c55e" }, { "value": 20, "color": "#ef4444" } ], "preset": { "id": "11111111-1111-1111-1111-111111111111", "name": "feed", "options": [ { "label": "CEL", "value": "(!deleted && !dismissed)" }, { "label": "SQL", "value": { "sql": "(deleted=false AND dismissed=false)", "params": {} } } ], "created_by": null, "is_private": false, "is_noisy": false, "should_do_noise_now": false, "alerts_count": 98, "static": true, "tags": [] }, "name": "Test" } ] } } ] ``` Please read more at https://github.com/react-grid-layout/react-grid-layout for more information on the layout configuration options. ================================================ FILE: docs/deployment/provision/overview.mdx ================================================ --- title: "Overview" --- Keep supports various deployment and provisioning strategies to accommodate different environments and use cases, from development setups to production deployments. ### Provisioning Options Keep offers three main provisioning options: 1. [**Provider Provisioning**](/deployment/provision/provider) - Set up and manage data providers with their deduplication rules for Keep. 2. [**Workflow Provisioning**](/deployment/provision/workflow) - Configure and manage workflows within Keep. 3. 
[**Dashboard Provisioning**](/deployment/provision/dashboard) - Configure and manage dashboards within Keep. Choosing the right provisioning strategy depends on your specific use case, deployment environment, and scalability requirements. You can read more about each provisioning option in their respective sections. ### How To Configure Provisioning Some provisioning options require additional environment variables. These will be covered in detail on the specific provisioning pages. Provisioning in Keep is controlled through environment variables and configuration files. The main environment variables for provisioning are: | Provisioning Type | Environment Variable | Purpose | | ---------------------- | ------------------------------ | ------------------------------------------------------------------------- | | **Provider** | `KEEP_PROVIDERS` | JSON string containing provider configurations with deduplication rules | | **Workflow** | `KEEP_WORKFLOW` | One workflow to provision right from the env variable. | | **Workflows** | `KEEP_WORKFLOWS_DIRECTORY` | Directory path containing workflow configuration files | | **Dashboard** | `KEEP_DASHBOARDS` | JSON string containing dashboard configurations | Hint: use the script to get 1-liner from the workflow file for KEEP_WORKFLOW: ``` Use `cat workflow_file.yaml | awk '{printf "%s\\n", $0}' | tr -d '\n'; echo` to get the workflow in 1-string format. ``` For more details on each provisioning strategy, including setup instructions and implications, refer to the respective sections. ================================================ FILE: docs/deployment/provision/provider.mdx ================================================ --- title: "Providers Provisioning" --- For any questions or issues related to provider provisioning, please join our [Slack](https://slack.keephq.dev) community. Provider provisioning in Keep allows you to set up and manage data providers dynamically. 
This feature enables you to configure various data sources that Keep can interact with, such as monitoring systems, databases, or other services. ### Configuring Providers To provision providers and deduplication rules for them, we can configure via the environment variable. This can be done in two ways: 1. Using `KEEP_PROVIDERS` environment variable which either contains a JSON string or a path to a JSON file that contains the providers configurations. 2. Using `KEEP_PROVIDERS_DIRECTORY` environment variable which contains a path to a directory that contains the providers configurations (configured via YAML files). This is the recommended approach. Keep does not allow to use both `KEEP_PROVIDERS` and `KEEP_PROVIDERS_DIRECTORY` environment variables at the same time. Keep can automatically install webhooks for providers that support them. This behavior depends on the configuration and the provisioning method used. Please note: Deduplication rules are not mandatory for provider distribution. 
### Providers provisioning using KEEP_PROVIDERS Providers provisioning JSON example: ```json { "keepVictoriaMetrics": { "type": "victoriametrics", "authentication": { "VMAlertHost": "http://localhost", "VMAlertPort": 1234 }, "install_webhook": true, "deduplication_rules": { "deduplication rule name example 1": { "description": "deduplication rule name example 1", "fingerprint_fields": ["fingerprint", "source", "service"], "full_deduplication": true, "ignore_fields": ["name", "lastReceived"] }, "deduplication rule name example 2": { "description": "deduplication rule name example 2", "fingerprint_fields": ["fingerprint", "source", "service"], "full_deduplication": false } } }, "keepClickhouse1": { "type": "clickhouse", "authentication": { "host": "http://localhost", "port": 1234, "username": "keep", "password": "keep", "database": "keep-db" } } } ``` Spin up Keep with this `KEEP_PROVIDERS` value: ```json # ENV KEEP_PROVIDERS={"keepVictoriaMetrics":{"type":"victoriametrics","authentication":{"VMAlertHost":"http://localhost","VMAlertPort": 1234},"install_webhook":true},"keepClickhouse1":{"type":"clickhouse","authentication":{"host":"http://localhost","port":"4321","username":"keep","password":"1234","database":"keepdb"}}} ``` By default, when provisioning using `KEEP_PROVIDERS`, webhooks are automatically installed for providers that support them unless the `install_webhook` flag is set to `false`. ### Providers provisioning using KEEP_PROVIDERS_DIRECTORY Specify the path to the directory containing the providers configurations: ```bash # ENV KEEP_PROVIDERS_DIRECTORY=/path/to/providers ``` The directory should contain YAML files with the providers configurations. 
Example of a provider configuration YAML file: ```yaml name: keepVictoriaMetrics type: victoriametrics authentication: VMAlertHost: http://localhost VMAlertPort: 1234 install_webhook: false deduplication_rules: deduplication_rule_name_example_1: description: deduplication rule name example 1 fingerprint_fields: - fingerprint - source - service full_deduplication: true ignore_fields: - name - lastReceived ``` The `install_webhook` field controls whether Keep sets up webhooks automatically for that provider. By default, when provisioning using `KEEP_PROVIDERS_DIRECTORY`, webhook installation is disabled unless explicitly set to `true`. ### Supported Providers Keep supports a wide range of provider types. Each provider type has its own specific configuration requirements. To see the full list of supported providers and their detailed configuration options, please refer to our comprehensive provider documentation. ### Update Provisioned Providers #### Using KEEP_PROVIDERS Provider configurations can be updated dynamically by changing the `KEEP_PROVIDERS` environment variable. On every restart, Keep reads this environment variable and determines which providers need to be added or removed. This process allows for flexible management of data sources without requiring manual intervention. By simply updating the `KEEP_PROVIDERS` variable and restarting the application, you can efficiently add new providers, remove existing ones, or modify their configurations. The high-level provisioning mechanism: 1. Keep reads the `KEEP_PROVIDERS` value. 2. Keep checks if there are any provisioned providers that are no longer in the `KEEP_PROVIDERS` value, and deletes them. 3. Keep installs all providers from the `KEEP_PROVIDERS` value. #### Using KEEP_PROVIDERS_DIRECTORY Provider configurations can be updated dynamically by changing the YAML files in the `KEEP_PROVIDERS_DIRECTORY` directory. 
On every restart, Keep reads the YAML files in the `KEEP_PROVIDERS_DIRECTORY` directory and determines which providers need to be added or removed. The high-level provisioning mechanism: 1. Keep reads the YAML files in the `KEEP_PROVIDERS_DIRECTORY` directory. 2. Keep checks if there are any provisioned providers that are no longer in the YAML files, and deletes them. 3. Keep installs all providers from the YAML files. ================================================ FILE: docs/deployment/provision/workflow.mdx ================================================ --- title: "Workflow Provisioning" --- For any questions or issues related to workflow provisioning, please join our [Slack](https://slack.keephq.dev) community. Workflow provisioning in Keep allows you to set up and manage workflows dynamically. This feature enables you to configure various automated processes and tasks within your Keep deployment. ### Configuring Workflows To provision workflows, follow these steps: 1. Set the `KEEP_WORKFLOWS_DIRECTORY` environment variable to the path of your workflow configuration directory. 2. Create workflow configuration files in the specified directory. Example directory structure: ``` /path/to/workflows/ ├── workflow1.yaml ├── workflow2.yaml └── workflow3.yaml ``` ### Update Provisioned Workflows On every restart, Keep reads the `KEEP_WORKFLOWS_DIRECTORY` environment variable and determines which workflows need to be added, removed, or updated. This process allows for flexible management of workflows without requiring manual intervention. By simply updating the workflow files in the `KEEP_WORKFLOWS_DIRECTORY` and restarting the application, you can efficiently add new workflows, remove existing ones, or modify their configurations. The high-level provisioning mechanism: 1. Keep reads the `KEEP_WORKFLOWS_DIRECTORY` value. 2. Keep lists all workflow files under the `KEEP_WORKFLOWS_DIRECTORY` directory. 3. 
Keep compares the current workflow files with the previously provisioned workflows: - New workflow files are provisioned. - Missing workflow files are deprovisioned. - Updated workflow files are re-provisioned with the new configuration. 4. Keep updates its internal state to reflect the current set of provisioned workflows. ================================================ FILE: docs/deployment/secret-store.mdx ================================================ --- title: "Secret Store" sidebarTitle: "Secret Store" --- ## Overview Secret Manager selection is crucial for securing your application. Different modes can be set up depending on the deployment type. Our system supports four primary secret manager types. ## Secret Manager Factory The `SecretManagerFactory` is a utility class used to create instances of different types of secret managers. It leverages the Factory design pattern to abstract the creation logic based on the type of secret manager required. The factory supports creating instances of File, GCP, Kubernetes, and Vault Secret Managers. The `SECRET_MANAGER_TYPE` environment variable plays a crucial role in the SecretManagerFactory for determining the default type of secret manager to be instantiated when no specific type is provided in the method call. **Functionality**: **Default Secret Manager**: If the `SECRET_MANAGER_TYPE` environment variable is set, its value dictates the default type of secret manager that the factory will create. The value of this variable should correspond to one of the types defined in SecretManagerTypes enum (`FILE`, `AWS`, `GCP`, `K8S`, `VAULT`, `DB`). **Example Configuration**: Setting `SECRET_MANAGER_TYPE=GCP` in the environment will make the factory create instances of GcpSecretManager by default. If `SECRET_MANAGER_TYPE` is not set or is set to `FILE`, the factory defaults to creating instances of FileSecretManager. 
This environment variable provides flexibility and ease of configuration, allowing different secret managers to be used in different environments or scenarios without code changes. ## File Secret Manager The `FileSecretManager` is a concrete implementation of the BaseSecretManager for managing secrets stored in the file system. It uses a specified directory (defaulting to ./) to read, write, and delete secret files. Configuration: Set the environment variable `SECRET_MANAGER_DIRECTORY` to specify the directory where secrets are stored. If not set, defaults to the current directory (./). Usage: - Secrets are stored as files in the specified directory. - Reading a secret involves fetching content from a file. - Writing a secret creates or updates a file with the given content. - Deleting a secret removes the corresponding file. ## AWS Secret Manager The `AwsSecretManager` integrates with Amazon Web Services' Secrets Manager service for secure secret management. It provides a robust solution for storing and managing secrets in AWS environments. Configuration: Required environment variables: - `AWS_REGION`: The AWS region where your secrets are stored - For local development: - `AWS_ACCESS_KEY_ID`: Your AWS access key - `AWS_SECRET_ACCESS_KEY`: Your AWS secret access key Optional: - `AWS_KMS_KEY_ID`: The KMS key ID to use for encrypting secrets - `AWS_SECRET_MANAGER_TAGS`: Comma-separated list of tags to add to the secret in AWS Secrets Manager, e.g. 
`key=value,key2=value2` - `AWS_SECRET_ROTATION_ENABLED`: Set to `true` to enable automatic rotation of secrets (default: `false`) - `AWS_SECRET_ROTATION_DAYS`: Number of days between automatic rotations (default: `30`) - `AWS_SECRET_ROTATION_LAMBDA_ARN`: ARN of the Lambda function to use for secret rotation, required if rotation is enabled Usage: - Manages secrets using AWS Secrets Manager service - Supports creating, updating, reading, and deleting secrets - Can automatically configure secret rotation policies when creating new secrets ### AWS Secret Rotation Secret rotation is a security best practice that automatically updates secrets at regular intervals. When enabled, Keep will configure newly created secrets with a rotation schedule. To use secret rotation: 1. Create a Lambda function for rotating your secrets (AWS provides blueprints for common rotation scenarios) 2. Set `AWS_SECRET_ROTATION_ENABLED=true` in your environment 3. Set `AWS_SECRET_ROTATION_LAMBDA_ARN` to the ARN of your rotation Lambda function 4. Optionally set `AWS_SECRET_ROTATION_DAYS` to customize the rotation interval Example Lambda ARN format: `arn:aws:lambda:region:account-id:function:function-name` Note: Different secret types (database credentials, API keys, etc.) require different rotation logic. Make sure your Lambda function is appropriate for the type of secrets you're storing. ## Kubernetes Secret Manager ### Overview The `KubernetesSecretManager` interfaces with Kubernetes' native secrets system. It manages secrets within a specified Kubernetes namespace and is designed to operate within a Kubernetes cluster. ### Configuration - `SECRET_MANAGER_TYPE=k8s` - `K8S_NAMESPACE=keep` - environment variable to specify the Kubernetes namespace. Defaults to `.metadata.namespace` if not set. Assumes Kubernetes configurations (like service account tokens) are properly set up when running within a cluster. 
- `K8S_VERIFY_SSL_CERT=true` - environment variable to specify whether to verify the SSL certificate of the Kubernetes API. Defaults to `true`. Usage: - Secrets are stored as Kubernetes Secret objects. - Provides functionalities to create, retrieve, and delete Kubernetes secrets. - Handles base64 encoding and decoding as required by Kubernetes. ### Environment Variables From Secrets The Kubernetes Secret Manager integration allows Keep to fetch environment variables from Kubernetes Secrets. For sensitive environment variables, such as `DATABASE_CONNECTION_STRING`, it is recommended to store them as a secret: #### Creating Database Connection Secret ```bash # Create the base64 encoded string without newline CONNECTION_STRING_B64=$(echo -n "mysql+pymysql://user:password@host:3306/dbname" | base64) # Create the Kubernetes secret kubectl create secret generic keep-db-secret \ --namespace=keep \ --from-literal=connection_string=$(echo -n "mysql+pymysql://user:password@host:3306/dbname" | base64) # Or using a YAML file: cat <If you are using Keep and have performance issues, we will be more than happy to help you. Just join our [slack](https://slack.keephq.dev) and shoot a message on the **#help** channel. ## Overview Spec and stress testing are crucial to ensuring the robust performance and scalability of Keep. This documentation outlines the key areas of focus for testing Keep under different load conditions, considering both the simplicity of setup for smaller environments and the scalability mechanisms for larger deployments. Keep was initially designed to be user-friendly for setups handling fewer than 10,000 alerts. However, as alert volumes increase, users can leverage advanced features such as Elasticsearch for document storage and Redis + ARQ for queue-based alert ingestion. While these advanced configurations are not fully documented here, they are supported and can be discussed further in our Slack community. 
## How To Reproduce To reproduce the stress testing scenarios mentioned above, please refer to the [STRESS.md](https://github.com/keephq/keep/blob/main/STRESS.md) file in Keep's repository. This document provides step-by-step instructions on how to set up, run, and measure the performance of Keep under different load conditions. ## Performance Testing ### Factors Affecting Specifications The primary parameters that affect the specification requirements for Keep are: 1. **Alerts Volume**: The rate at which alerts are ingested into the system. 2. **Total Alerts**: The cumulative number of alerts stored in the system. 3. **Number of Workflows**: How many automations run as a result of an alert. ### Main Components: - **Keep Backend** - API and business logic. A container that serves FastAPI on top of gunicorn. - **Keep Frontend** - Web app. A container that serves the React app. - **Database** - Stores the alerts and any other operational data. - **Elasticsearch** (disabled by default) - Stores alerts as documents for better search performance. - **Redis** (disabled by default) - Used, together with ARQ, as an alerts queue. ### Testing Scenarios: 1. **Low Volume (< 10,000 total alerts, hundreds of alerts per day)**: - **Setup**: Use a standard relational database (e.g., MySQL, PostgreSQL) with default configurations. - **Expectations**: Keep should handle queries and alert ingestion with minimal resource usage. 2. **Medium Volume (10,000 - 100,000 total alerts, thousands of alerts per day)**: - **Setup**: Scale the database to larger instances or clusters. Apply database tuning best practices (e.g., increasing `innodb_buffer_pool_size`). - **Expectations**: CPU and RAM usage should increase proportionally but remain within acceptable limits. 3. **High Volume (100,000 - 1,000,000 total alerts, more than five thousand alerts per day)**: - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents.
- **Expectations**: The system should maintain performance levels despite the large alert volume, with increased resource usage managed through scaling strategies. 4. **Very High Volume (> 1,000,000 total alerts, tens of thousands of alerts per day)**: - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents. - **Setup #2**: Deploy Keep with Redis and with ARQ to use Redis as a queue. ## Recommended Specifications by Alert Volume | **Number of Alerts** | **Keep Backend** | **Keep Database** | **Redis** | **Elasticsearch** | |------------------------|------------------------------------------------|-------------------------------------------------|------------------------------------------------|------------------------------------------------| | **< 10,000** | 1 vCPUs, 2GB RAM | 2 vCPUs, 8GB RAM | Not required | Not required | | **10,000 - 100,000** | 4 vCPUs, 8GB RAM | 8 vCPUs, 32GB RAM, optimized indexing | Not required | Not required | | **100,000 - 500,000** | 8 vCPUs, 16GB RAM | 8 vCPUs, 32GB RAM, advanced indexing | 4 vCPUs, 8GB RAM | 8 vCPUs, 32GB RAM, 2-3 nodes | | **> 500,000** | 8 vCPUs, 16GB RAM | 8 vCPUs, 32GB RAM, advanced indexing, sharding| 4 vCPUs, 8GB RAM | 8 vCPUs, 32GB RAM, 2-3 nodes | ## Performance by Operation Type, Load, and Specification | **Operation Type** | **Load** | **Specification** | **Execution Time** | |-----------------------|----------------------------|------------------------------|-----------------------------------| | Digest Alert | 100 alerts per minute | 4 vCPUs, 8GB RAM | ~0.5 seconds | | Digest Alert | 500 alerts per minute | 8 vCPUs, 16GB RAM | ~1 second | | Digest Alert | 1,000 alerts per minute | 16 vCPUs, 32GB RAM | ~1.5 seconds | | Run Workflow | 10 workflows per minute | 4 vCPUs, 8GB RAM | ~1 second | | Run Workflow | 50 workflows per minute | 8 vCPUs, 16GB RAM | ~2 seconds | | Run Workflow | 100 workflows per minute | 16 vCPUs, 32GB RAM | ~3 seconds | | Ingest via Queue | 100 alerts per minute | 4 
vCPUs, 8GB RAM, Redis | ~0.3 seconds | | Ingest via Queue | 500 alerts per minute | 8 vCPUs, 16GB RAM, Redis | ~0.8 seconds | | Ingest via Queue | 1,000 alerts per minute | 16 vCPUs, 32GB RAM, Redis | ~1.2 seconds | ### Table Explanation: - **Operation Type**: The specific operation being tested (e.g., digesting alerts, running workflows). - **Load**: The number of operations per minute being processed (e.g., number of alerts per minute). - **Specification**: The CPU, RAM, and additional services used for the operation. - **Execution Time**: Approximate time taken to complete the operation under the given load and specification. ## Fine Tuning As any deployment has its own characteristics, such as the balance between volume vs. total count of alerts or volume vs. number of workflows, Keep can be fine-tuned with the following parameters: 1. **Number of Workers**: Adjust the number of Gunicorn workers to handle API requests more efficiently. You can also start additional API servers to distribute the load. 2. **Distinguish Between API Server Workers and Digesting Alerts Workers**: Separate the workers dedicated to handling API requests from those responsible for digesting alerts, ensuring that each set of tasks is optimized according to its specific needs. 3. **Add More RAM to the Database**: Increasing the RAM allocated to your database can help manage larger datasets and improve query performance, particularly when dealing with high volumes of alerts. 4. **Optimize Database Configuration**: Keep was mainly tested on MySQL and PostgreSQL. Different databases may have different fine-tuning mechanisms. 5. **Horizontal Scaling**: Consider deploying additional instances of the API and database services to distribute the load more effectively. ## FAQ ### 1. How do I estimate the spec I need for Keep? To estimate the specifications required for Keep, consider both the number of alerts per minute and the total number of alerts you expect to handle.
Refer to the **Recommended Specifications by Alert Volume** table above to match your expected load with the appropriate resources. ### 2. How do I know if I need Elasticsearch? Elasticsearch is typically needed when you are dealing with more than 50,000 total alerts or if you require advanced search and query capabilities that are not efficiently handled by a traditional relational database. If your system’s performance degrades significantly as alert volume increases, it may be time to consider Elasticsearch. ### 3. How do I know if I need Redis? Redis is recommended when your alert ingestion rate exceeds 1,000 alerts per minute or when you notice that the API is becoming a bottleneck due to high ingestion rates. Redis, combined with ARQ (Asynchronous Redis Queue), can help manage and distribute the load more effectively. ### 4. What should I do if Keep's performance is still inadequate? If you have scaled according to the recommendations and are still facing performance issues, consider: - **Optimizing your database configuration**: Indexing, sharding, and query optimization can make a significant difference. - **Horizontal scaling**: Distribute the load across multiple instances of the API and database services. - **Reach out to our Slack community**: For personalized support, reach out to us on Slack, and we’ll help you troubleshoot and optimize your Keep deployment. For any additional questions or tailored advice, feel free to join our Slack community where our team and other users are available to assist you. ================================================ FILE: docs/development/external-url.mdx ================================================ --- title: "Keep with an external URL" sidebarTitle: "Keep with an external URL" --- ## Introduction Several features in Keep necessitate an external URL that is accessible from the internet. This is particularly crucial for functionalities like Webhook Integration when installing providers. 
Keep uses its API URL to establish itself as a webhook connector during this process. When an alert is triggered, the corresponding Provider attempts to activate the webhook, delivering the alert payload. Consequently, the webhook must be accessible over the internet for this process to work effectively. ## Utilizing NGROK for External Accessibility Keep supports the use of NGROK to create an accessible external URL. When Keep is started with the environment variable `USE_NGROK=true`, it automatically initiates an NGROK tunnel and uses this URL for webhook installations. While `USE_NGROK` is convenient for development or testing, it's important to note that each restart of Keep results in a new NGROK URL. This change in the URL means that providers configured with the old URL will no longer be able to communicate with Keep. For production environments, it's advisable to either: - Expose Keep with a permanent, internet-accessible URL. - Set up a static NGROK tunnel. Subsequently, configure Keep to use this stable URL by setting the `KEEP_API_URL` environment variable. ================================================ FILE: docs/development/getting-started.mdx ================================================ --- title: "Getting started" sidebarTitle: "Getting started" --- ### Docker-compose dev images You can use `docker-compose.dev.yml` to start Keep in development mode. First, clone the Keep repo: ``` git clone https://github.com/keephq/keep.git && cd keep ``` Next, run ``` docker compose -f docker-compose.dev.yml up ``` ### Install Keep CLI First, clone the Keep repository: ```shell git clone https://github.com/keephq/keep.git && cd keep ``` Install Keep CLI ```shell poetry install ``` To access the Keep CLI, activate the environment and run it from your shell.
```shell poetry shell ``` From now on, Keep should be installed locally and accessible from your CLI. Test it by executing: ``` keep version ``` ## Enable Auto Completion **Keep's CLI supports shell auto-completion, which can make your life a whole lot easier 😌** If you're using zsh ```shell title=~/.zshrc eval "$(_KEEP_COMPLETE=zsh_source keep)" ``` If you're using bash ```bash title=~/.bashrc eval "$(_KEEP_COMPLETE=bash_source keep)" ``` > Using eval means that the command is invoked and evaluated every time a shell is started, which can delay shell responsiveness. To speed it up, write the generated script to a file, then source that. ### Testing Run unit tests: ```bash poetry run coverage run --branch -m pytest --ignore=tests/e2e_tests/ ``` Run E2E tests (run Keep locally before): ```bash poetry run playwright install; poetry run coverage run --branch -m pytest -s tests/e2e_tests/ ``` ### Migrations Migrations are automatically executed on server startup. To create a migration: ```bash alembic -c keep/alembic.ini revision --autogenerate -m "Your message" ``` Hint: make sure your models are imported at `./api/models/db/migrations/env.py` for the autogenerator to pick them up.
## VS Code (or Cursor) Run Keep from your VS Code (or Cursor) after cloning the repo by adding these configurations to your `.vscode/launch.json`: ```json { "version": "0.2.0", "configurations": [ { "name": "Keep Backend", "type": "debugpy", "request": "launch", "program": "keep/cli/cli.py", "console": "integratedTerminal", "justMyCode": false, "python": "venv/bin/python", "args": ["--json", "api","--multi-tenant"], "env": { "PYDEVD_DISABLE_FILE_VALIDATION": "1", "PYTHONPATH": "${workspaceFolder}/", "PUSHER_APP_ID": "1", "SECRET_MANAGER_DIRECTORY": "./state/", "PUSHER_HOST": "localhost", "PUSHER_PORT": "6001", "PUSHER_APP_KEY": "keepappkey", "PUSHER_APP_SECRET": "keepappsecret", "LOG_FORMAT": "dev_terminal" } }, { "name": "Keep Simulate Alerts", "type": "debugpy", "request": "launch", "program": "scripts/simulate_alerts.py", "console": "integratedTerminal", "justMyCode": false, "python": "venv/bin/python", "env": { "PYDEVD_DISABLE_FILE_VALIDATION": "1", "PYTHONPATH": "${workspaceFolder}/", "KEEP_API_URL": "http://localhost:8080", "KEEP_API_KEY": "some-api-key" } }, { "name": "Keep Frontend", "type": "node-terminal", "request": "launch", "command": "npm run dev", "cwd": "${workspaceFolder}/keep-ui" } ] } ``` Install dependencies: ``` python3.11 -m venv venv; source venv/bin/activate; pip install poetry; poetry install; cd keep-ui && npm i && cd ..; ``` Set frontend envs: ``` cp keep-ui/.env.local.example keep-ui/.env.local; echo "\n\n\n\nNEXTAUTH_SECRET="$(openssl rand -hex 32) >> keep-ui/.env.local; ``` Launch Pusher ([soketi](https://soketi.app/)) container in parallel: ```bash docker run -d -p 6001:6001 -p 9601:9601 -e SOKETI_USER_AUTHENTICATION_TIMEOUT=3000 -e SOKETI_DEFAULT_APP_KEY=keepappkey -e SOKETI_DEFAULT_APP_SECRET=keepappsecret -e SOKETI_DEFAULT_APP_ID=1 quay.io/soketi/soketi:1.4-16-debian ``` ## VS Code (or Cursor) + Docker For this guide to work, the [VS Code Docker](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-docker)
extension is required. In air-gapped environments, you might consider building the container on an internet-connected computer, exporting the image using `docker save`, transferring it, importing it with `docker load` in the air-gapped environment, and then using the run configuration. In cases where you want to develop Keep but are unable to run it directly on your local laptop (e.g., with Windows), or if you lack access to all of its dependencies (e.g., in air-gapped environments), you can still accomplish this using VS Code (or Cursor) and Docker. To achieve this, follow these steps: 1. Clone Keep and open it with VS Code (or Cursor) 2. Create a tasks.json file to build and run the Keep API and Keep UI containers. 3. Create a launch.json configuration to start the containers and attach a debugger to them. 4. Profit. ### Clone Keep and open it with VS Code (or Cursor) ``` git clone https://github.com/keephq/keep.git && cd keep code . ``` ### Create tasks.json #### including building the containers ``` { "version": "2.0.0", "tasks": [ // The API and UI containers need to be in the same docker network { "label": "docker-create-network", "type": "shell", "command": "docker network create keep-network || true", "problemMatcher": [] }, // Build the api container { "label": "docker-build-api-dev", "type": "docker-build", "dockerBuild": { "context": "${workspaceFolder}", "dockerfile": "${workspaceFolder}/docker/Dockerfile.dev.api", "tag": "keep-api-dev:latest" } }, // Run the api container { "label": "docker-run-api-dev", "type": "docker-run", "dependsOn": [ "docker-build-api-dev", "docker-create-network" ], "python": { "args": [ "api" ], "file": "./keep/cli/cli.py" }, "dockerRun": { "network": "keep-network", "image": "keep-api-dev:latest", "containerName": "keep-api", "ports": [ { "containerPort": 8080, "hostPort": 8080 } ], "env": { "DEBUG": "1", "SECRET_MANAGER_TYPE": "FILE", "USE_NGROK": "false", "AUTH_TYPE": "DB" }, "volumes": [ { "containerPath": "/app", "localPath": 
"${workspaceFolder}" } ] } }, // Build the UI container { "label": "docker-build-ui", "type": "docker-build", "dockerBuild": { "context": "${workspaceFolder}", "dockerfile": "${workspaceFolder}/docker/Dockerfile.dev.ui", "tag": "keep-ui-dev:latest" } }, // Run the UI container { "type": "docker-run", "label": "docker-run-ui", "dependsOn": [ "docker-build-ui", "docker-create-network" ], "dockerRun": { "network": "keep-network", "image": "keep-ui-dev:latest", "containerName": "keep-ui", "env": { // Uncomment for full debug output // "DEBUG": "*", "NODE_ENV": "development", "API_URL": "http://keep-api:8080", "AUTH_TYPE": "DB", }, "volumes": [ { "containerPath": "/app", "localPath": "${workspaceFolder}/keep-ui" } ], "ports": [ { "containerPort": 9229, "hostPort": 9229 }, { "containerPort": 3000, "hostPort": 3000 } ], "command": "npm run dev", }, "node": { "package": "${workspaceFolder}/keep-ui/package.json", "enableDebugging": true } } ] } ``` #### without building the containers To start Keep without building the containers, you'll need to have the `keep-api-dev` and `keep-ui-dev` images loaded into Docker.
``` { "version": "2.0.0", "tasks": [ // The API and the UI need to be in the same docker network { "label": "docker-create-network", "type": "shell", "command": "docker network create keep-network || true", "problemMatcher": [] }, // Run the API container { "label": "docker-run-api-dev", "type": "docker-run", "dependsOn": [ "docker-create-network" ], "python": { "args": [ "api" ], "file": "./keep/cli/cli.py" }, "dockerRun": { "network": "keep-network", "image": "keep-api-dev:latest", "containerName": "keep-api", "ports": [ { "containerPort": 8080, "hostPort": 8080 } ], "env": { "DEBUG": "1", "SECRET_MANAGER_TYPE": "FILE", "USE_NGROK": "false", "AUTH_TYPE": "DB" }, "volumes": [ { "containerPath": "/app", "localPath": "${workspaceFolder}" } ] } }, // Run the UI container { "type": "docker-run", "label": "docker-run-ui", "dependsOn": [ "docker-create-network" ], "dockerRun": { "network": "keep-network", "image": "keep-ui-dev:latest", "containerName": "keep-ui", "env": { // Uncomment for full debug output // "DEBUG": "*", "NODE_ENV": "development", "API_URL": "http://keep-api:8080", "AUTH_TYPE": "DB" }, "volumes": [ { "containerPath": "/app", "localPath": "${workspaceFolder}/keep-ui" } ], "ports": [ { "containerPort": 9229, "hostPort": 9229 }, { "containerPort": 3000, "hostPort": 3000 } ], "command": "npm run dev", }, "node": { "package": "${workspaceFolder}/keep-ui/package.json", "enableDebugging": true } } ] } ``` ### Create launch.json ``` { "name": "Docker: Keep API", "type": "docker", "request": "launch", "preLaunchTask": "docker-run-api-dev", "removeContainerAfterDebug": true, "containerName": "keep-api", "python": { "pathMappings": [ { "localRoot": "${workspaceFolder}", "remoteRoot": "/app" } ], "module": "keep.cli.cli" } }, { "name": "Docker: Keep UI", "type": "docker", "request": "launch", "removeContainerAfterDebug": true, "preLaunchTask": "docker-run-ui", "containerName": "keep-ui", "platform": "node", "node": { "package": "${workspaceFolder}/keep-ui/package.json", 
"localRoot": "${workspaceFolder}/keep-ui" } }, ``` ================================================ FILE: docs/images/datadog_raw_alerts.txt ================================================ {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:05:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-unique-id&from_ts=1733928722000&to_ts=1733929922000&event_id=7879702138782271851&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929322000&to_ts=1733929622000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733929712000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-unique-id}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733929712000", "scopes": "service:keep-api-feature-unique-id", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879702138782271851", "tags": "monitor,service:keep-api-feature-unique-id", "id": "7879702138782271851", "monitor_id": "160076582"} {"body": 
"%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:05:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-historical-rules-poc&from_ts=1733928722000&to_ts=1733929922000&event_id=7879702138713295486&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929322000&to_ts=1733929622000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733929712000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-historical-rules-poc}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733929712000", "scopes": "service:keep-api-feature-historical-rules-poc", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879702138713295486", "tags": "monitor,service:keep-api-feature-historical-rules-poc", "id": "7879702138713295486", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep 
\n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:05:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-grafana-legacy&from_ts=1733928842000&to_ts=1733930042000&event_id=7879704162663906513&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929442000&to_ts=1733929742000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733929833000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-grafana-legacy}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733929833000", "scopes": "service:keep-api-feature-grafana-legacy", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879704162663906513", "tags": "monitor,service:keep-api-feature-grafana-legacy", "id": "7879704162663906513", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 
\n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:05:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-fix-2804-unlink-alert&from_ts=1733928902000&to_ts=1733930102000&event_id=7879705155994930207&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929502000&to_ts=1733929802000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733929892000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-fix-2804-unlink-alert}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733929892000", "scopes": "service:keep-api-fix-2804-unlink-alert", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879705155994930207", "tags": "monitor,service:keep-api-fix-2804-unlink-alert", "id": "7879705155994930207", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: 
**[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:14:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-matvey-kuk-workflows-fix&from_ts=1733929142000&to_ts=1733930342000&event_id=7879709198622396010&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929742000&to_ts=1733930042000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930133000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-matvey-kuk-workflows-fix}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930133000", "scopes": "service:keep-api-matvey-kuk-workflows-fix", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879709198622396010", "tags": "monitor,service:keep-api-matvey-kuk-workflows-fix", "id": "7879709198622396010", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last 
triggered at Wed Dec 11 2024 15:14:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-bugfix-yaml&from_ts=1733929142000&to_ts=1733930342000&event_id=7879709199720965710&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929742000&to_ts=1733930042000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930133000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-bugfix-yaml}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930133000", "scopes": "service:keep-api-bugfix-yaml", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879709199720965710", "tags": "monitor,service:keep-api-bugfix-yaml", "id": "7879709199720965710", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`Aborted connection`](https://app.datadoghq.com/logs/analytics?query=Aborted+connection&agg_m=count&agg_t=count&agg_q=database_id&index=%2A)** by **database_id**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:14:04 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077064?group=database_id%3Akeephq-sandbox%3Akeep&from_ts=1733929144000&to_ts=1733930344000&event_id=7879709234193994156&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077064/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=Aborted+connection&from_ts=1733929744000&to_ts=1733930044000&live=false&agg_m=count&agg_t=count&agg_q=database_id&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930135000", "event_type": "log_alert", "title": "[Triggered on {database_id:keephq-sandbox:keep}] Somethine weird in DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"Aborted connection\").index(\"*\").rollup(\"count\").by(\"database_id\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930135000", "scopes": "database_id:keephq-sandbox:keep", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879709234193994156", "tags": "database_id:keephq-sandbox:keep,monitor", "id": "7879709234193994156", "monitor_id": "160077064"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:15:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-grafana-legacy&from_ts=1733929202000&to_ts=1733930402000&event_id=7879710212645433433&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929802000&to_ts=1733930102000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930194000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-grafana-legacy}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930194000", "scopes": "service:keep-api-feature-grafana-legacy", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879710212645433433", "tags": "monitor,service:keep-api-feature-grafana-legacy", "id": "7879710212645433433", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:17:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929322000&to_ts=1733930522000&event_id=7879712214248911237&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929922000&to_ts=1733930222000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930313000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-improvedocs}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930313000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879712214248911237", "tags": "monitor,service:keep-api-feature-improvedocs", "id": "7879712214248911237", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-ci-2766-simple-faster-ee&from_ts=1733929382000&to_ts=1733930582000&event_id=7879713295639221277&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733929982000&to_ts=1733930282000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930377000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-ci-2766-simple-faster-ee}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930377000", "scopes": "service:keep-api-ci-2766-simple-faster-ee", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879713295639221277", "tags": "monitor,service:keep-api-ci-2766-simple-faster-ee", "id": "7879713295639221277", "monitor_id": "160076582"} {"body": "%%%\nhttps://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&from_ts=1733929988000&to_ts=1733930288000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&event=AwAAAZO2TBElyb4KmQAAABhBWk8yVEJRZkFBQWozamRiYkp3THZBQUEAAAAkMDE5M2I2NGMtMjI3My00YzM0LThhOGUtNGM0MzllMDliNTkyAAAA0g \n\n @webhook-keep-datadog-webhook-integration-keep @webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 @webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375 @webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2\n\nMore than **0.0** log events matched in the last **5m** against the monitored query: **[`@http.status_code:(401 OR 403)`](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:08 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/134462228?group=service%3Akeep-api&from_ts=1733929388000&to_ts=1733930588000&event_id=7879713311244615328&link_source=monitor_notif)] 
\u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/134462228/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&from_ts=1733929988000&to_ts=1733930288000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930378000", "event_type": "log_alert", "title": "[P2] [Warn] Unauthorized access to API keep-api", "severity": "P2", "alert_type": "warning", "alert_query": "logs(\"@http.status_code:(401 OR 403)\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 5", "alert_transition": "Warn", "date": "1733930378000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879713311244615328", "tags": "environment:production,monitor,service:keep-api", "id": "7879713311244615328", "monitor_id": "134462228"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929421000&to_ts=1733930621000&event_id=7879713879533759147&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930021000&to_ts=1733930321000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930412000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-improvedocs}] 
OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930412000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879713879533759147", "tags": "monitor,service:keep-api-feature-improvedocs", "id": "7879713879533759147", "monitor_id": "160077341"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-grafana-legacy&from_ts=1733929421000&to_ts=1733930621000&event_id=7879713877056374717&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930021000&to_ts=1733930321000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930412000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-grafana-legacy}] OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930412000", "scopes": "service:keep-api-feature-grafana-legacy", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879713877056374717", "tags": 
"monitor,service:keep-api-feature-grafana-legacy", "id": "7879713877056374717", "monitor_id": "160077341"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-grafana-legacy&from_ts=1733929541000&to_ts=1733930741000&event_id=7879715880809613521&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930141000&to_ts=1733930441000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930532000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-grafana-legacy}] OperationalError DB", "severity": "", "alert_type": "success", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930532000", "scopes": "service:keep-api-feature-grafana-legacy", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879715880809613521", "tags": "monitor,service:keep-api-feature-grafana-legacy", "id": "7879715880809613521", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 
\n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:15:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-grafana-legacy&from_ts=1733929562000&to_ts=1733930762000&event_id=7879716233501717500&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930162000&to_ts=1733930462000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930553000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-grafana-legacy}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930553000", "scopes": "service:keep-api-feature-grafana-legacy", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879716233501717500", "tags": "monitor,service:keep-api-feature-grafana-legacy", "id": "7879716233501717500", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:41 UTC.\n\n- - -\n\n[[Monitor 
Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929541000&to_ts=1733930741000&event_id=7879715879428238181&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930141000&to_ts=1733930441000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930531000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-improvedocs}] OperationalError DB", "severity": "", "alert_type": "success", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930531000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879715879428238181", "tags": "monitor,service:keep-api-feature-improvedocs", "id": "7879715879428238181", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:05:02 UTC.\n\n- - -\n\n[[Monitor 
Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api&from_ts=1733929502000&to_ts=1733930702000&event_id=7879715220610804188&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930102000&to_ts=1733930402000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930492000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930492000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879715220610804188", "tags": "monitor,service:keep-api", "id": "7879715220610804188", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:21:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api&from_ts=1733929562000&to_ts=1733930762000&event_id=7879716234457967220&link_source=monitor_notif)] \u00b7 [[Edit 
Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930162000&to_ts=1733930462000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930553000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930553000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879716234457967220", "tags": "monitor,service:keep-api", "id": "7879716234457967220", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:22:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary&from_ts=1733929622000&to_ts=1733930822000&event_id=7879717268154896752&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930222000&to_ts=1733930522000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930614000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930614000", "scopes": "service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879717268154896752", "tags": "monitor,service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary", "id": "7879717268154896752", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-ci-2766-simple-faster-ee&from_ts=1733929622000&to_ts=1733930822000&event_id=7879717259407496838&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930222000&to_ts=1733930522000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930614000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-ci-2766-simple-faster-ee}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930614000", "scopes": "service:keep-api-ci-2766-simple-faster-ee", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879717259407496838", "tags": "monitor,service:keep-api-ci-2766-simple-faster-ee", "id": "7879717259407496838", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:22:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-matvey-kuk-workflows-fix&from_ts=1733929622000&to_ts=1733930822000&event_id=7879717254454313398&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930222000&to_ts=1733930522000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930613000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-matvey-kuk-workflows-fix}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930613000", "scopes": "service:keep-api-matvey-kuk-workflows-fix", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879717254454313398", "tags": "monitor,service:keep-api-matvey-kuk-workflows-fix", "id": "7879717254454313398", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:17:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929682000&to_ts=1733930882000&event_id=7879718246849246186&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930282000&to_ts=1733930582000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930673000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-improvedocs}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930673000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879718246849246186", "tags": "monitor,service:keep-api-feature-improvedocs", "id": "7879718246849246186", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:23:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-unique-id&from_ts=1733929682000&to_ts=1733930882000&event_id=7879718271367299818&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930282000&to_ts=1733930582000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930674000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-unique-id}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930674000", "scopes": "service:keep-api-feature-unique-id", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879718271367299818", "tags": "monitor,service:keep-api-feature-unique-id", "id": "7879718271367299818", "monitor_id": "160076582"} {"body": "%%%\n \n\n @webhook-keep-datadog-webhook-integration-keep @webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 @webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375 @webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2\n\nLess than **0.0** log events matched in the last **5m** against the monitored query: **[`@http.status_code:(401 OR 403)`](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:08 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/134462228?group=service%3Akeep-api&from_ts=1733929688000&to_ts=1733930888000&event_id=7879718351573282995&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/134462228/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&from_ts=1733930288000&to_ts=1733930588000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930679000", "event_type": "log_alert", "title": "[P2] [Recovered] Unauthorized access to API ", "severity": "P2", "alert_type": "success", "alert_query": "logs(\"@http.status_code:(401 OR 403)\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 5", "alert_transition": "Recovered", "date": "1733930679000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879718351573282995", "tags": "environment:production,monitor,service:keep-api", "id": "7879718351573282995", "monitor_id": "134462228"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable&from_ts=1733929742000&to_ts=1733930942000&event_id=7879719249416290957&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930342000&to_ts=1733930642000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930732000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930732000", "scopes": "service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879719249416290957", "tags": "monitor,service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable", "id": "7879719249416290957", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-historical-rules-poc&from_ts=1733929742000&to_ts=1733930942000&event_id=7879719250013996135&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930342000&to_ts=1733930642000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930732000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-historical-rules-poc}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930732000", "scopes": "service:keep-api-feature-historical-rules-poc", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879719250013996135", "tags": "monitor,service:keep-api-feature-historical-rules-poc", "id": "7879719250013996135", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-fix-2804-unlink-alert&from_ts=1733929742000&to_ts=1733930942000&event_id=7879719251943375976&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related 
Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930342000&to_ts=1733930642000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930732000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-fix-2804-unlink-alert}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930732000", "scopes": "service:keep-api-fix-2804-unlink-alert", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879719251943375976", "tags": "monitor,service:keep-api-fix-2804-unlink-alert", "id": "7879719251943375976", "monitor_id": "160076582"} {"body": "%%%\nhttps://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&from_ts=1733930348000&to_ts=1733930648000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&event=AwAAAZO2UazNewhzMQAAABhBWk8yVWFfSUFBQUtSTVFERHFvenF3QUEAAAAkMDE5M2I2NTEtYzYzNi00MDYyLThhMzAtYTMyZTEyNzY3ZWM2AABZ8g \n\n @webhook-keep-datadog-webhook-integration-keep @webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 @webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375 @webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2\n\nMore than **5** log events matched in the last **5m** against the monitored query: **[`@http.status_code:(401 OR 403)`](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:08 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/134462228?group=service%3Akeep-api&from_ts=1733929748000&to_ts=1733930948000&event_id=7879719351243698466&link_source=monitor_notif)] \u00b7 
[[Edit Monitor](https://app.datadoghq.com/monitors/134462228/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=%40http.status_code%3A%28401+OR+403%29&from_ts=1733930348000&to_ts=1733930648000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930738000", "event_type": "log_alert", "title": "[P2] [Triggered] Unauthorized access to API keep-api", "severity": "P2", "alert_type": "error", "alert_query": "logs(\"@http.status_code:(401 OR 403)\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 5", "alert_transition": "Triggered", "date": "1733930738000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879719351243698466", "tags": "environment:production,monitor,service:keep-api", "id": "7879719351243698466", "monitor_id": "134462228"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-unique-id&from_ts=1733929781000&to_ts=1733930981000&event_id=7879719917965975808&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930381000&to_ts=1733930681000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930772000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-unique-id}] 
OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930772000", "scopes": "service:keep-api-feature-unique-id", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879719917965975808", "tags": "monitor,service:keep-api-feature-unique-id", "id": "7879719917965975808", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:25:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-ci-2766-simple-faster-ee&from_ts=1733929802000&to_ts=1733931002000&event_id=7879720264592758877&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930402000&to_ts=1733930702000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930793000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-ci-2766-simple-faster-ee}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", 
"alert_transition": "Triggered", "date": "1733930793000", "scopes": "service:keep-api-ci-2766-simple-faster-ee", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879720264592758877", "tags": "monitor,service:keep-api-ci-2766-simple-faster-ee", "id": "7879720264592758877", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:25:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-ci-2766-simple-faster-ee&from_ts=1733929841000&to_ts=1733931041000&event_id=7879720914500490069&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930441000&to_ts=1733930741000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930832000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-ci-2766-simple-faster-ee}] OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930832000", "scopes": "service:keep-api-ci-2766-simple-faster-ee", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879720914500490069", "tags": "monitor,service:keep-api-ci-2766-simple-faster-ee", "id": "7879720914500490069", "monitor_id": "160077341"} {"body": 
"%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:25:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable&from_ts=1733929841000&to_ts=1733931041000&event_id=7879720915197393266&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930441000&to_ts=1733930741000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930832000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable}] OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930832000", "scopes": "service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879720915197393266", "tags": "monitor,service:keep-api-fix-2780-bug-incidents-nonetype-object-is-not-iterable", "id": "7879720915197393266", "monitor_id": "160077341"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by 
**service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:25:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-fix-2804-unlink-alert&from_ts=1733929841000&to_ts=1733931041000&event_id=7879720931357701015&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930441000&to_ts=1733930741000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930833000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-fix-2804-unlink-alert}] OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930833000", "scopes": "service:keep-api-fix-2804-unlink-alert", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879720931357701015", "tags": "monitor,service:keep-api-fix-2804-unlink-alert", "id": "7879720931357701015", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:22:02 UTC.\n\n- - -\n\n[[Monitor 
Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary&from_ts=1733929862000&to_ts=1733931062000&event_id=7879721270410105860&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930462000&to_ts=1733930762000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930853000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930853000", "scopes": "service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879721270410105860", "tags": "monitor,service:keep-api-fix-2732-bug-duplicate-entry-for-key-lastalertprimary", "id": "7879721270410105860", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:14:02 UTC.\n\n- - -\n\n[[Monitor 
Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-bugfix-yaml&from_ts=1733929862000&to_ts=1733931062000&event_id=7879721272152616299&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930462000&to_ts=1733930762000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930853000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-bugfix-yaml}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930853000", "scopes": "service:keep-api-bugfix-yaml", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879721272152616299", "tags": "monitor,service:keep-api-bugfix-yaml", "id": "7879721272152616299", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:21:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api&from_ts=1733929862000&to_ts=1733931062000&event_id=7879721270911102314&link_source=monitor_notif)] \u00b7 [[Edit 
Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930462000&to_ts=1733930762000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930853000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930853000", "scopes": "service:keep-api", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879721270911102314", "tags": "monitor,service:keep-api", "id": "7879721270911102314", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:18:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-matvey-kuk-workflows-fix&from_ts=1733929901000&to_ts=1733931101000&event_id=7879721950125022979&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930501000&to_ts=1733930801000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930893000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-matvey-kuk-workflows-fix}] OperationalError DB", "severity": "", "alert_type": "success", 
"alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930893000", "scopes": "service:keep-api-matvey-kuk-workflows-fix", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879721950125022979", "tags": "monitor,service:keep-api-matvey-kuk-workflows-fix", "id": "7879721950125022979", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:22:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-matvey-kuk-workflows-fix&from_ts=1733929922000&to_ts=1733931122000&event_id=7879722272469287876&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930522000&to_ts=1733930822000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930912000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-matvey-kuk-workflows-fix}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930912000", 
"scopes": "service:keep-api-matvey-kuk-workflows-fix", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879722272469287876", "tags": "monitor,service:keep-api-matvey-kuk-workflows-fix", "id": "7879722272469287876", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:27:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929922000&to_ts=1733931122000&event_id=7879722274337387922&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930522000&to_ts=1733930822000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930913000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-improvedocs}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930913000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879722274337387922", "tags": 
"monitor,service:keep-api-feature-improvedocs", "id": "7879722274337387922", "monitor_id": "160076582"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nMore than **0** log events matched in the last **5m** against the monitored query: **[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:27:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-bugfix-yaml-width&from_ts=1733929922000&to_ts=1733931122000&event_id=7879722275107215736&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930522000&to_ts=1733930822000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930913000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-bugfix-yaml-width}] Error monitor", "severity": "", "alert_type": "error", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930913000", "scopes": "service:keep-api-bugfix-yaml-width", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879722275107215736", "tags": "monitor,service:keep-api-bugfix-yaml-width", "id": "7879722275107215736", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nMore than **0** log events 
matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:27:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-improvedocs&from_ts=1733929961000&to_ts=1733931161000&event_id=7879722937552678433&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930561000&to_ts=1733930861000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930952000", "event_type": "log_alert", "title": "[Triggered on {service:keep-api-feature-improvedocs}] OperationalError DB", "severity": "", "alert_type": "error", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Triggered", "date": "1733930952000", "scopes": "service:keep-api-feature-improvedocs", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879722937552678433", "tags": "monitor,service:keep-api-feature-improvedocs", "id": "7879722937552678433", "monitor_id": "160077341"} {"body": "%%%\ntrace_id: \ntags: \nattributes: \n\n@webhook-keep-datadog-webhook-integration-keep \n@webhook-keep-datadog-webhook-integration-78645c69-61e9-4921-8e90-b1ae382280e5 \n@webhook-keep-datadog-webhook-integration-9ffb1c58-bd2b-4b2e-ad76-575caf43f5d2 \n@webhook-keep-datadog-webhook-integration-2f82730d-4cb5-466d-81b1-1aecb316f375\n\nLess than **0** log events matched in the last **5m** against the monitored query: 
**[`status:error`](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:23:02 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160076582?group=service%3Akeep-api-feature-unique-id&from_ts=1733929982000&to_ts=1733931182000&event_id=7879723300958553556&link_source=monitor_notif)] \u00b7 [[Edit Monitor](https://app.datadoghq.com/monitors/160076582/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=status%3Aerror&from_ts=1733930582000&to_ts=1733930882000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733930974000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-unique-id}] Error monitor", "severity": "", "alert_type": "success", "alert_query": "logs(\"status:error\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733930974000", "scopes": "service:keep-api-feature-unique-id", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879723300958553556", "tags": "monitor,service:keep-api-feature-unique-id", "id": "7879723300958553556", "monitor_id": "160076582"} {"body": "%%%\n@webhook-keep-datadog-webhook-integration-keep\n\nLess than **0** log events matched in the last **5m** against the monitored query: **[`err.OperationalError`](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&agg_m=count&agg_t=count&agg_q=service&index=%2A)** by **service**\n\nThe monitor was last triggered at Wed Dec 11 2024 15:24:41 UTC.\n\n- - -\n\n[[Monitor Status](https://app.datadoghq.com/monitors/160077341?group=service%3Akeep-api-feature-unique-id&from_ts=1733930021000&to_ts=1733931221000&event_id=7879723934280491883&link_source=monitor_notif)] \u00b7 [[Edit 
Monitor](https://app.datadoghq.com/monitors/160077341/edit?link_source=monitor_notif)] \u00b7 [[Related Logs](https://app.datadoghq.com/logs/analytics?query=err.OperationalError&from_ts=1733930621000&to_ts=1733930921000&live=false&agg_m=count&agg_t=count&agg_q=service&index=%2A&link_source=monitor_notif)]\n%%%", "last_updated": "1733931012000", "event_type": "log_alert", "title": "[Recovered on {service:keep-api-feature-unique-id}] OperationalError DB", "severity": "", "alert_type": "success", "alert_query": "logs(\"err.OperationalError\").index(\"*\").rollup(\"count\").by(\"service\").last(\"5m\") > 0", "alert_transition": "Recovered", "date": "1733931012000", "scopes": "service:keep-api-feature-unique-id", "org": {"id": "831563", "name": "DPN | KeepHQ"}, "url": "https://app.datadoghq.com/event/event?id=7879723934280491883", "tags": "monitor,service:keep-api-feature-unique-id", "id": "7879723934280491883", "monitor_id": "160077341"} sqlite> ================================================ FILE: docs/incidents/facets.mdx ================================================ Faceted search is a powerful mechanism for enhancing search functionality, allowing users to filter and refine search results dynamically using multiple dimensions or "facets." These facets are predefined categories or attributes of the data. In Keep, the Incidents page supports faceted search by incident attributes. ### Predefined Incident Facets These are predefined Incident facets that can be used to filter incidents: - **Status**: Filter by Incident status - **Severity**: Filter by Incident severity - **Assignee**: Filter by Incident assignee - **Source**: Filter by alert source - **Service**: Filter by the service the Incident relates to ### Custom Facets Creation Keep also supports custom facets creation. Here is how to do this: 1. Click the "Add facet" button in the filtering panel. 2. Enter the Facet name. This is the name that will be displayed in the filter panel. 3. 
Enter the Facet property path the facet will filter by. 4. Click "Create". ### Supported Properties to create Facets for Incidents support facets on direct Incident fields and also on data from the Alerts linked to the Incident. Here is a list of properties you can create facets for: - **name**: Incident name - **summary**: Incident summary - **creation_time**: Incident creation time - **start_time**: Incident start time - **end_time**: Incident end time - **last_seen_time**: Incident last seen time - **is_predicted**: Whether the Incident is predicted - **is_candidate**: Whether the Incident is a candidate - **alerts_count**: Number of alerts associated with the Incident - **merged_at**: When the Incident was merged - **merged_by**: Who merged the Incident - **hasLinkedIncident**: Whether the Incident has a past incident linked - **alert.\***: Refers to alert properties in the Incident. Examples: alert.labels.monitor, alert.monitor, etc. ================================================ FILE: docs/incidents/overview.mdx ================================================ --- title: "Overview" --- Keep's incident management system provides a comprehensive solution for handling, tracking, and resolving operational incidents. This system helps teams effectively manage incidents from detection through resolution, ensuring minimal downtime and efficient collaboration. ### (1) Incident Severity Displays the severity of the incident, helping teams prioritize and focus on the most critical issues. ### (2) Incident Name The unique name or identifier of the incident for easy reference and tracking. ### (3) Incident Summary (+ AI Summary) A brief overview of the incident, optionally enhanced with AI-generated summaries to provide deeper insights. ### (4) Link Similar Incidents Connects related incidents for better visibility into recurring or interconnected issues. ### (5) Involved Services Lists the services affected by the incident, allowing teams to understand the scope of the impact.
### (6) Affected Environments Specifies the environments (e.g., production, staging) impacted by the incident. ### (7) Run Workflow Quickly initiate workflows to address the incident, such as creating tickets, notifying teams, or executing remediation steps. ### (8) Edit Incident Allows modification of incident details, such as severity, name, or involved services, to keep information up-to-date. ### (9) Incident Status Indicates the current status of the incident (e.g., open, resolved, acknowledged). ### (10) Incident Last Seen At Records the most recent timestamp when the incident was observed, providing context for its activity. ### (11) Incident Started At Indicates when the incident was first detected, helping establish timelines for resolution. ### (12) Incident Assignee Displays the individual or team responsible for resolving the incident, promoting accountability. ### (13) Incident Group By Value Groups incidents based on a specific attribute, such as service, environment, or severity, for better organization. ### (14) Incident Related Alerts Lists all alerts linked to the incident, offering a complete view of its underlying causes. ### (15) Incident Activity Tracks all activities and updates related to the incident, enabling detailed audits and reviews. ### (16) Incident Timeline Provides a chronological view of the incident's lifecycle, including updates, actions, and status changes. ### (17) Incident Topology Visualizes the relationships between affected components, services, and infrastructure in a topology map. ### (18) Incident Workflows Lists workflows associated with the incident, showing actions taken or available options for resolution. ### (19) Incident Chat with AI (Incident Copilot) Engage with AI-powered chat for guidance, insights, or recommended actions related to the incident. ### (20) Incident Alert List Displays a detailed list of alerts contributing to the incident, with metadata for each alert. 
### (21) Incident Alert Link Provides quick access to the original monitoring tool for a specific alert. ### (22) Incident Alert Status Shows the current status of each alert, such as acknowledged, resolved, or firing. ### (23) Incident Correlation Type Indicates how the incident was correlated: manually, via AI, or by rule-based logic. ### (24) Incident Alert Unlink Enables unlinking specific alerts from the incident if they are found to be unrelated. --- ================================================ FILE: docs/mint.json ================================================ { "$schema": "https://mintlify.com/schema.json", "name": "Keep", "logo": { "light": "/logo/light.png", "dark": "/logo/dark.png" }, "favicon": "/favicon.svg", "colors": { "primary": "#FA9E34", "light": "#FA9E34", "dark": "#FF9F36" }, "topbarCtaButton": { "type": "github", "url": "https://github.com/keephq/keep" }, "topbarLinks": [ { "name": "Platform", "url": "https://platform.keephq.dev/" } ], "analytics": { "posthog": { "apiKey": "phc_mYqciA4RO5g48K6KnmZtftn5xQa5625Aao7vsVC0gJ9" } }, "anchors": [], "navigation": [ { "group": "Overview", "pages": [ "overview/introduction", "overview/playground", "overview/usecases", { "group": "Key Concepts", "pages": [ "overview/glossary", "overview/cel", "overview/fingerprints", "overview/alertseverityandstatus", "overview/howdoeskeepgetmyalerts", "overview/comparisons" ] }, "overview/support", "overview/faq" ] }, { "group": "AIOps", "pages": [ { "group": "AI", "pages": [ "overview/ai-incident-assistant", "overview/ai-workflow-assistant", "overview/ai-semi-automatic-correlation", "overview/ai-in-workflows", "overview/ai-correlation" ] }, { "group": "Non-AI Correlation", "pages": [ "overview/correlation-rules", "overview/correlation-topology" ] }, "overview/deduplication", "overview/enrichment/extraction", "overview/enrichment/mapping", "overview/maintenance-windows", "overview/servicetopology", "overview/workflow-automation" ] }, { "group": "Alerts", "pages": [ 
"alerts/overview", "alerts/table", "alerts/actionmenu", "alerts/sidebar", "alerts/presets", "alerts/sound" ] }, { "group": "Incidents", "pages": ["incidents/overview", "incidents/facets"] }, { "group": "Workflow Automation", "pages": [ "workflows/overview", { "group": "Syntax", "pages": [ "workflows/syntax/triggers", "workflows/syntax/permissions", "workflows/syntax/steps-and-actions", "workflows/syntax/conditions", "workflows/syntax/functions", "workflows/syntax/context", "workflows/syntax/providers", "workflows/syntax/foreach", "workflows/syntax/enrichment" ] }, { "group": "Examples", "pages": [ "workflows/examples/autosupress", "workflows/examples/buisnesshours", "workflows/examples/create-servicenow-tickets", "workflows/examples/highsev", "workflows/examples/update-servicenow-tickets" ] } ] }, { "group": "Alert Evaluation Engine", "pages": [ "alertevaluation/overview", { "group": "Examples", "pages": [ "alertevaluation/examples/victoriametricssingle", "alertevaluation/examples/victoriametricsmulti" ] } ] }, { "group": "Providers", "pages": [ "providers/overview", "providers/linked-providers", "providers/provider-methods", { "group": "Supported Providers", "pages": [ "providers/documentation/airflow-provider", "providers/documentation/aks-provider", "providers/documentation/amazonsqs-provider", "providers/documentation/anthropic-provider", "providers/documentation/appdynamics-provider", "providers/documentation/asana-provider", "providers/documentation/s3-provider", "providers/documentation/argocd-provider", "providers/documentation/auth0-provider", "providers/documentation/axiom-provider", "providers/documentation/azuremonitoring-provider", "providers/documentation/bash-provider", "providers/documentation/bigquery-provider", "providers/documentation/centreon-provider", "providers/documentation/checkmk-provider", "providers/documentation/checkly-provider", "providers/documentation/cilium-provider", "providers/documentation/clickhouse-provider", 
"providers/documentation/cloudwatch-provider", "providers/documentation/console-provider", "providers/documentation/coralogix-provider", "providers/documentation/dash0-provider", "providers/documentation/databend-provider", "providers/documentation/datadog-provider", "providers/documentation/deepseek-provider", "providers/documentation/discord-provider", "providers/documentation/dynatrace-provider", "providers/documentation/eks-provider", "providers/documentation/elastic-provider", "providers/documentation/flashduty-provider", "providers/documentation/fluxcd-provider", "providers/documentation/gcpmonitoring-provider", "providers/documentation/gemini-provider", "providers/documentation/github-provider", "providers/documentation/github_workflows_provider", "providers/documentation/gitlab-provider", "providers/documentation/gitlabpipelines-provider", "providers/documentation/gke-provider", "providers/documentation/google_chat-provider", "providers/documentation/grafana-provider", "providers/documentation/grafana_incident-provider", "providers/documentation/grafana_loki-provider", "providers/documentation/grafana_oncall-provider", "providers/documentation/graylog-provider", "providers/documentation/grok-provider", "providers/documentation/http-provider", "providers/documentation/icinga2-provider", "providers/documentation/ilert-provider", "providers/documentation/incidentio-provider", "providers/documentation/incidentmanager-provider", "providers/documentation/jira-on-prem-provider", "providers/documentation/jira-provider", "providers/documentation/kafka-provider", "providers/documentation/keep-provider", "providers/documentation/kibana-provider", "providers/documentation/kubernetes-provider", "providers/documentation/libre_nms-provider", "providers/documentation/linear_provider", "providers/documentation/linearb-provider", "providers/documentation/litellm-provider", "providers/documentation/llamacpp-provider", "providers/documentation/mailgun-provider", 
"providers/documentation/mattermost-provider", "providers/documentation/microsoft-planner-provider", "providers/documentation/mock-provider", "providers/documentation/monday-provider", "providers/documentation/mongodb-provider", "providers/documentation/mysql-provider", "providers/documentation/netbox-provider", "providers/documentation/netdata-provider", "providers/documentation/new-relic-provider", "providers/documentation/ntfy-provider", "providers/documentation/ollama-provider", "providers/documentation/openai-provider", "providers/documentation/openobserve-provider", "providers/documentation/opensearchserverless-provider", "providers/documentation/openshift-provider", "providers/documentation/opsgenie-provider", "providers/documentation/pagerduty-provider", "providers/documentation/pagertree-provider", "providers/documentation/parseable-provider", "providers/documentation/pingdom-provider", "providers/documentation/posthog-provider", "providers/documentation/planner-provider", "providers/documentation/postgresql-provider", "providers/documentation/prometheus-provider", "providers/documentation/pushover-provider", "providers/documentation/python-provider", "providers/documentation/quickchart-provider", "providers/documentation/redmine-provider", "providers/documentation/resend-provider", "providers/documentation/rollbar-provider", "providers/documentation/sendgrid-provider", "providers/documentation/sentry-provider", "providers/documentation/service-now-provider", "providers/documentation/signalfx-provider", "providers/documentation/signl4-provider", "providers/documentation/site24x7-provider", "providers/documentation/slack-provider", "providers/documentation/smtp-provider", "providers/documentation/snowflake-provider", "providers/documentation/splunk-provider", "providers/documentation/squadcast-provider", "providers/documentation/ssh-provider", "providers/documentation/statuscake-provider", "providers/documentation/sumologic-provider", 
"providers/documentation/teams-provider", "providers/documentation/telegram-provider", "providers/documentation/template", "providers/documentation/thousandeyes-provider", "providers/documentation/trello-provider", "providers/documentation/twilio-provider", "providers/documentation/uptimekuma-provider", "providers/documentation/victorialogs-provider", "providers/documentation/victoriametrics-provider", "providers/documentation/vllm-provider", "providers/documentation/wazuh-provider", "providers/documentation/webhook-provider", "providers/documentation/websocket-provider", "providers/documentation/youtrack-provider", "providers/documentation/zabbix-provider", "providers/documentation/zenduty-provider", "providers/documentation/zoom-provider", "providers/documentation/zoom_chat-provider" ] }, "providers/adding-a-new-provider" ] }, { "group": "Deployment", "pages": [ "deployment/configuration", "deployment/monitoring", { "group": "Authentication", "pages": [ "deployment/authentication/overview", "deployment/authentication/no-auth", "deployment/authentication/db-auth", "deployment/authentication/auth0-auth", "deployment/authentication/azuread-auth", "deployment/authentication/keycloak-auth", "deployment/authentication/oauth2proxy-auth", "deployment/authentication/oauth2-proxy-gitlab", "deployment/authentication/okta-auth", "deployment/authentication/onelogin-auth" ] }, { "group": "Provision", "pages": [ "deployment/provision/overview", "deployment/provision/provider", "deployment/provision/workflow", "deployment/provision/dashboard" ] }, "deployment/secret-store", { "group": "Deploy On", "pages": [ "deployment/docker", { "group": "Kubernetes", "pages": [ "deployment/kubernetes/overview", "deployment/kubernetes/installation", "deployment/kubernetes/architecture", "deployment/kubernetes/openshift" ] }, "deployment/openshift", "deployment/ecs" ] }, { "group": "Local LLM", "pages": ["deployment/local-llm/keep-with-litellm"] }, "deployment/stress-testing" ] }, { "group": 
"Development", "pages": ["development/getting-started", "development/external-url"] }, { "group": "Keep CLI", "pages": [ "cli/overview", "cli/installation", "cli/github-actions", { "group": "Commands", "pages": [ { "group": "keep alert", "pages": [ "cli/commands/cli-alert", "cli/commands/alert-enrich", "cli/commands/alert-get", "cli/commands/alert-list" ] }, { "group": "keep provider", "pages": [ "cli/commands/cli-provider", "cli/commands/provider-connect", "cli/commands/provider-delete", "cli/commands/provider-list" ] }, { "group": "keep workflow", "pages": [ "cli/commands/cli-workflow", "cli/commands/workflow-apply", "cli/commands/workflow-list", "cli/commands/workflow-run", "cli/commands/workflow-runs", { "group": "keep workflow runs", "pages": ["cli/commands/runs-logs", "cli/commands/runs-list"] } ] }, { "group": "keep mappings", "pages": [ "cli/commands/mappings-list", "cli/commands/mappings-create", "cli/commands/mappings-delete" ] }, { "group": "keep extractions", "pages": [ "cli/commands/extraction-create", "cli/commands/extraction-delete", "cli/commands/extractions-list" ] }, "cli/commands/cli", "cli/commands/cli-api", "cli/commands/cli-config-new", "cli/commands/cli-config-show", "cli/commands/cli-run", "cli/commands/cli-config", "cli/commands/cli-version", "cli/commands/cli-whoami" ] } ] } ], "footerSocials": { "github": "https://github.com/keephq/keep" } } ================================================ FILE: docs/openapi.json ================================================ {"openapi": "3.0.2", "info": {"title": "Keep API", "description": "Rest API powering https://platform.keephq.dev and friends \ud83c\udfc4\u200d\u2640\ufe0f", "version": "0.24.5"}, "paths": {"/": {"get": {"summary": "Root", "description": "App description and version.", "operationId": "root__get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}}}, "/providers": {"get": {"tags": ["providers"], "summary": "Get Providers", 
"operationId": "get_providers_providers_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/export": {"get": {"tags": ["providers"], "summary": "Get Installed Providers", "description": "export all installed providers", "operationId": "get_installed_providers_providers_export_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/{provider_id}/configured-alerts": {"get": {"tags": ["providers"], "summary": "Get Alerts Configuration", "description": "Get alerts configuration from a provider", "operationId": "get_alerts_configuration_providers__provider_type___provider_id__configured_alerts_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {}, "type": "array", "title": "Response Get Alerts Configuration Providers Provider Type Provider Id Configured Alerts Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/{provider_id}/logs": {"get": {"tags": ["providers"], "summary": "Get Logs", "description": "Get logs from a provider", "operationId": "get_logs_providers__provider_type___provider_id__logs_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, 
{"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 5}, "name": "limit", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {}, "type": "array", "title": "Response Get Logs Providers Provider Type Provider Id Logs Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/schema": {"get": {"tags": ["providers"], "summary": "Get Alerts Schema", "description": "Get the provider's API schema used to push alerts configuration", "operationId": "get_alerts_schema_providers__provider_type__schema_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Get Alerts Schema Providers Provider Type Schema Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}}}, "/providers/{provider_type}/{provider_id}/alerts/count": {"get": {"tags": ["providers"], "summary": "Get Alert Count", "description": "Get number of alerts a specific provider has received (in a specific time period or ever)", "operationId": "get_alert_count_providers__provider_type___provider_id__alerts_count_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}, {"required": true, "schema": 
{"type": "boolean", "title": "Ever"}, "name": "ever", "in": "query"}, {"required": false, "schema": {"type": "string", "format": "date-time", "title": "Start Time"}, "name": "start_time", "in": "query"}, {"required": false, "schema": {"type": "string", "format": "date-time", "title": "End Time"}, "name": "end_time", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/{provider_id}/alerts": {"post": {"tags": ["providers"], "summary": "Add Alert", "description": "Push new alerts to the provider", "operationId": "add_alert_providers__provider_type___provider_id__alerts_post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}, {"required": false, "schema": {"type": "string", "title": "Alert Id"}, "name": "alert_id", "in": "query"}], "requestBody": {"content": {"application/json": {"schema": {"type": "object", "title": "Alert"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/test": {"post": {"tags": ["providers"], "summary": "Test Provider", "description": "Test a provider's alert retrieval", "operationId": "test_provider_providers_test_post", "requestBody": {"content": {"application/json": {"schema": {"type": "object", "title": "Provider 
Info"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/{provider_id}": {"delete": {"tags": ["providers"], "summary": "Delete Provider", "operationId": "delete_provider_providers__provider_type___provider_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_id}/scopes": {"post": {"tags": ["providers"], "summary": "Validate Provider Scopes", "description": "Validate provider scopes", "operationId": "validate_provider_scopes_providers__provider_id__scopes_post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"anyOf": [{"type": "boolean"}, {"type": "string"}]}, "type": "object", "title": "Response Validate Provider Scopes Providers Provider Id Scopes Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, 
"/providers/{provider_id}": {"put": {"tags": ["providers"], "summary": "Update Provider", "description": "Update provider", "operationId": "update_provider_providers__provider_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/install": {"post": {"tags": ["providers"], "summary": "Install Provider", "operationId": "install_provider_providers_install_post", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/install/oauth2/{provider_type}": {"post": {"tags": ["providers"], "summary": "Install Provider Oauth2", "operationId": "install_provider_oauth2_providers_install_oauth2__provider_type__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"type": "object", "title": "Provider Info"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_id}/invoke/{method}": {"post": {"tags": ["providers"], "summary": "Invoke Provider Method", "description": "Invoke provider special method", "operationId": 
"invoke_provider_method_providers__provider_id__invoke__method__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Method"}, "name": "method", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"type": "object", "title": "Method Params"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/install/webhook/{provider_type}/{provider_id}": {"post": {"tags": ["providers"], "summary": "Install Provider Webhook", "operationId": "install_provider_webhook_providers_install_webhook__provider_type___provider_id__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/providers/{provider_type}/webhook": {"get": {"tags": ["providers"], "summary": "Get Webhook Settings", "operationId": "get_webhook_settings_providers__provider_type__webhook_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": 
"#/components/schemas/ProviderWebhookSettings"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/actions": {"get": {"tags": ["actions"], "summary": "Get Actions", "description": "Get all actions", "operationId": "get_actions_actions_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["actions"], "summary": "Create Actions", "description": "Create new actions by uploading a file", "operationId": "create_actions_actions_post", "requestBody": {"content": {"multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_create_actions_actions_post"}}}}, "responses": {"201": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/actions/{action_id}": {"put": {"tags": ["actions"], "summary": "Put Action", "description": "Update an action", "operationId": "put_action_actions__action_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Action Id"}, "name": "action_id", "in": "path"}], "requestBody": {"content": {"multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_put_action_actions__action_id__put"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["actions"], "summary": "Delete Action", "description": "Delete an action", "operationId": "delete_action_actions__action_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Action Id"}, "name": "action_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/healthcheck": {"get": {"tags": ["healthcheck"], "summary": "Healthcheck", "description": "simple healthcheck endpoint", "operationId": "healthcheck_healthcheck_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Healthcheck Healthcheck Get"}}}}}}}, "/alerts": {"get": {"tags": ["alerts"], "summary": "Get All Alerts", "description": "Get last alerts occurrence", "operationId": "get_all_alerts_alerts_get", "parameters": [{"required": false, "schema": {"type": "integer", "title": "Limit", "default": 1000}, "name": "limit", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array", "title": "Response Get All Alerts Alerts Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["alerts"], "summary": "Delete Alert", "description": "Delete alert by fingerprint and last received time", "operationId": "delete_alert_alerts_delete", "requestBody": {"content": {"application/json": {"schema": {"$ref": 
"#/components/schemas/DeleteRequestBody"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Delete Alert Alerts Delete"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/{fingerprint}/history": {"get": {"tags": ["alerts"], "summary": "Get Alert History", "description": "Get alert history", "operationId": "get_alert_history_alerts__fingerprint__history_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array", "title": "Response Get Alert History Alerts Fingerprint History Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/{fingerprint}/assign/{last_received}": {"post": {"tags": ["alerts"], "summary": "Assign Alert", "description": "Assign alert to user", "operationId": "assign_alert_alerts__fingerprint__assign__last_received__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Last Received"}, "name": "last_received", "in": "path"}, {"required": false, "schema": {"type": "boolean", "title": "Unassign", "default": false}, "name": "unassign", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": 
{"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Assign Alert Alerts Fingerprint Assign Last Received Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/event": {"post": {"tags": ["alerts"], "summary": "Receive Generic Event", "description": "Receive a generic alert event", "operationId": "receive_generic_event_alerts_event_post", "parameters": [{"required": false, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "query"}], "requestBody": {"content": {"application/json": {"schema": {"anyOf": [{"$ref": "#/components/schemas/AlertDto"}, {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array"}, {"type": "object"}], "title": "Event"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"anyOf": [{"$ref": "#/components/schemas/AlertDto"}, {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array"}], "title": "Response Receive Generic Event Alerts Event Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/event/netdata": {"get": {"tags": ["alerts"], "summary": "Webhook Challenge", "description": "Helper function to complete Netdata webhook challenge", "operationId": "webhook_challenge_alerts_event_netdata_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}}}, "/alerts/event/{provider_type}": {"post": {"tags": ["alerts"], "summary": "Receive Event", "description": "Receive an alert event from a provider", "operationId": 
"receive_event_alerts_event__provider_type__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": false, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "query"}], "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Receive Event Alerts Event Provider Type Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/{fingerprint}": {"get": {"tags": ["alerts"], "summary": "Get Alert", "description": "Get alert by fingerprint", "operationId": "get_alert_alerts__fingerprint__get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/AlertDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/enrich": {"post": {"tags": ["alerts"], "summary": "Enrich Alert", "description": "Enrich an alert", "operationId": "enrich_alert_alerts_enrich_post", "parameters": [{"description": "Dispose on new alert", "required": false, "schema": {"type": "boolean", "title": "Dispose On New Alert", "description": "Dispose on new alert", "default": false}, "name": "dispose_on_new_alert", "in": "query"}], "requestBody": {"content": {"application/json": 
{"schema": {"$ref": "#/components/schemas/EnrichAlertRequestBody"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Enrich Alert Alerts Enrich Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/unenrich": {"post": {"tags": ["alerts"], "summary": "Unenrich Alert", "description": "Un-Enrich an alert", "operationId": "unenrich_alert_alerts_unenrich_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/UnEnrichAlertRequestBody"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Unenrich Alert Alerts Unenrich Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/search": {"post": {"tags": ["alerts"], "summary": "Search Alerts", "description": "Search alerts", "operationId": "search_alerts_alerts_search_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/SearchAlertsRequest"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array", "title": "Response Search Alerts Alerts Search Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, 
{"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/audit": {"post": {"tags": ["alerts"], "summary": "Get Multiple Fingerprint Alert Audit", "description": "Get alert timeline audit trail for multiple fingerprints", "operationId": "get_multiple_fingerprint_alert_audit_alerts_audit_post", "requestBody": {"content": {"application/json": {"schema": {"items": {"type": "string"}, "type": "array", "title": "Fingerprints"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertAuditDto"}, "type": "array", "title": "Response Get Multiple Fingerprint Alert Audit Alerts Audit Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/{fingerprint}/audit": {"get": {"tags": ["alerts"], "summary": "Get Alert Audit", "description": "Get alert timeline audit trail", "operationId": "get_alert_audit_alerts__fingerprint__audit_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Fingerprint"}, "name": "fingerprint", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertAuditDto"}, "type": "array", "title": "Response Get Alert Audit Alerts Fingerprint Audit Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/alerts/quality/metrics": {"get": {"tags": ["alerts"], "summary": "Get Alert Quality", "description": "Get alert quality", "operationId": "get_alert_quality_alerts_quality_metrics_get", "parameters": [{"required": false, "schema": {"items": {"type": 
"string"}, "type": "array", "title": "Fields", "default": []}, "name": "fields", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Time Stamp"}, "name": "time_stamp", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents": {"get": {"tags": ["incidents"], "summary": "Get All Incidents", "description": "Get last incidents", "operationId": "get_all_incidents_incidents_get", "parameters": [{"required": false, "schema": {"type": "boolean", "title": "Confirmed", "default": true}, "name": "confirmed", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 25}, "name": "limit", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Offset", "default": 0}, "name": "offset", "in": "query"}, {"required": false, "schema": {"allOf": [{"$ref": "#/components/schemas/IncidentSorting"}], "default": "creation_time"}, "name": "sorting", "in": "query"}, {"required": false, "schema": {"items": {"$ref": "#/components/schemas/IncidentStatus"}, "type": "array"}, "name": "status", "in": "query"}, {"required": false, "schema": {"items": {"$ref": "#/components/schemas/IncidentSeverity"}, "type": "array"}, "name": "severity", "in": "query"}, {"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Assignees"}, "name": "assignees", "in": "query"}, {"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Sources"}, "name": "sources", "in": "query"}, {"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Affected Services"}, "name": "affected_services", "in": "query"}], "responses": {"200": {"description": 
"Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentsPaginatedResultsDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["incidents"], "summary": "Create Incident", "description": "Create new incident", "operationId": "create_incident_incidents_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDtoIn"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/meta": {"get": {"tags": ["incidents"], "summary": "Get Incidents Meta", "description": "Get incidents' metadata for filtering", "operationId": "get_incidents_meta_incidents_meta_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentListFilterParamsDto"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}": {"get": {"tags": ["incidents"], "summary": "Get Incident", "description": "Get incident by id", "operationId": "get_incident_incidents__incident_id__get", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDto"}}}}, "422": {"description": "Validation Error", "content": 
{"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "put": {"tags": ["incidents"], "summary": "Update Incident", "description": "Update incident by id", "operationId": "update_incident_incidents__incident_id__put", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}, {"description": "Whether the incident update request was generated by AI", "required": false, "schema": {"type": "boolean", "title": "Generatedbyai", "description": "Whether the incident update request was generated by AI", "default": false}, "name": "generatedByAi", "in": "query"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDtoIn"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["incidents"], "summary": "Delete Incident", "description": "Delete incident by incident id", "operationId": "delete_incident_incidents__incident_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/merge": {"post": {"tags": ["incidents"], "summary": "Merge 
Incidents", "description": "Merge incidents", "operationId": "merge_incidents_incidents_merge_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/MergeIncidentsRequestDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/MergeIncidentsResponseDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/alerts": {"get": {"tags": ["incidents"], "summary": "Get Incident Alerts", "description": "Get incident alerts by incident id", "operationId": "get_incident_alerts_incidents__incident_id__alerts_get", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 25}, "name": "limit", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Offset", "default": 0}, "name": "offset", "in": "query"}, {"required": false, "schema": {"type": "boolean", "title": "Include Unlinked", "default": false}, "name": "include_unlinked", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/AlertWithIncidentLinkMetadataPaginatedResultsDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["incidents"], "summary": "Add Alerts To Incident", "description": "Add alerts to incident", "operationId": "add_alerts_to_incident_incidents__incident_id__alerts_post", 
"parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}, {"required": false, "schema": {"type": "boolean", "title": "Is Created By Ai", "default": false}, "name": "is_created_by_ai", "in": "query"}], "requestBody": {"content": {"application/json": {"schema": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Alert Ids"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array", "title": "Response Add Alerts To Incident Incidents Incident Id Alerts Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["incidents"], "summary": "Delete Alerts From Incident", "description": "Delete alerts from incident", "operationId": "delete_alerts_from_incident_incidents__incident_id__alerts_delete", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Alert Ids"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/AlertDto"}, "type": "array", "title": "Response Delete Alerts From Incident Incidents Incident Id Alerts Delete"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/future_incidents": {"get": 
{"tags": ["incidents"], "summary": "Get Future Incidents For An Incident", "description": "Get same incidents linked to this one", "operationId": "get_future_incidents_for_an_incident_incidents__incident_id__future_incidents_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Incident Id"}, "name": "incident_id", "in": "path"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 25}, "name": "limit", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Offset", "default": 0}, "name": "offset", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentsPaginatedResultsDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/workflows": {"get": {"tags": ["incidents"], "summary": "Get Incident Workflows", "description": "Get incident workflows by incident id", "operationId": "get_incident_workflows_incidents__incident_id__workflows_get", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 25}, "name": "limit", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Offset", "default": 0}, "name": "offset", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowExecutionsPaginatedResultsDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/incidents/event/{provider_type}": {"post": {"tags": ["incidents"], "summary": "Receive Event", "description": "Receive an alert event from a provider", "operationId": "receive_event_incidents_event__provider_type__post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Provider Type"}, "name": "provider_type", "in": "path"}, {"required": false, "schema": {"type": "string", "title": "Provider Id"}, "name": "provider_id", "in": "query"}], "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Response Receive Event Incidents Event Provider Type Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/status": {"post": {"tags": ["incidents"], "summary": "Change Incident Status", "description": "Change incident status", "operationId": "change_incident_status_incidents__incident_id__status_post", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentStatusChangeDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/comment": {"post": {"tags": ["incidents"], "summary": "Add Comment", "description": "Add incident audit activity", 
"operationId": "add_comment_incidents__incident_id__comment_post", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentStatusChangeDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/AlertAudit"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/ai/suggest": {"post": {"tags": ["incidents"], "summary": "Create With Ai", "description": "Create incident with AI", "operationId": "create_with_ai_incidents_ai_suggest_post", "requestBody": {"content": {"application/json": {"schema": {"items": {"type": "string"}, "type": "array", "title": "Alerts Fingerprints"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentsClusteringSuggestion"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/ai/{suggestion_id}/commit": {"post": {"tags": ["incidents"], "summary": "Commit With Ai", "description": "Commit incidents with AI and user feedback", "operationId": "commit_with_ai_incidents_ai__suggestion_id__commit_post", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Suggestion Id"}, "name": "suggestion_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/IncidentCommit"}, "type": "array", 
"title": "Incidents With Feedback"}}}, "required": true}, "responses": {"202": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/IncidentDto"}, "type": "array", "title": "Response Commit With Ai Incidents Ai Suggestion Id Commit Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/incidents/{incident_id}/confirm": {"post": {"tags": ["incidents"], "summary": "Confirm Incident", "description": "Confirm predicted incident by id", "operationId": "confirm_incident_incidents__incident_id__confirm_post", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Incident Id"}, "name": "incident_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/IncidentDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/webhook": {"get": {"tags": ["settings"], "summary": "Webhook Settings", "description": "Get details about the webhook endpoint (e.g. 
the API url and an API key)", "operationId": "webhook_settings_settings_webhook_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WebhookSettings"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/smtp": {"get": {"tags": ["settings"], "summary": "Get Smtp Settings", "description": "Get SMTP settings", "operationId": "get_smtp_settings_settings_smtp_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["settings"], "summary": "Update Smtp Settings", "description": "Install or update SMTP settings", "operationId": "update_smtp_settings_settings_smtp_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/SMTPSettings"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["settings"], "summary": "Delete Smtp Settings", "description": "Delete SMTP settings", "operationId": "delete_smtp_settings_settings_smtp_delete", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/smtp/test": {"post": {"tags": ["settings"], "summary": "Test Smtp Settings", "description": "Test SMTP settings", "operationId": "test_smtp_settings_settings_smtp_test_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/SMTPSettings"}}}, "required": true}, 
"responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/apikey": {"put": {"tags": ["settings"], "summary": "Update Api Key", "description": "Update API key secret", "operationId": "update_api_key_settings_apikey_put", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["settings"], "summary": "Create Key", "description": "Create API key", "operationId": "create_key_settings_apikey_post", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/apikeys": {"get": {"tags": ["settings"], "summary": "Get Keys", "description": "Get API keys", "operationId": "get_keys_settings_apikeys_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/settings/apikey/{keyId}": {"delete": {"tags": ["settings"], "summary": "Delete Api Key", "description": "Delete API key", "operationId": "delete_api_key_settings_apikey__keyId__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Keyid"}, "name": "keyId", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/settings/sso": {"get": {"tags": ["settings"], "summary": "Get Sso Settings", "operationId": "get_sso_settings_settings_sso_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Workflows", "description": "Get workflows", "operationId": "get_workflows_workflows_get", "parameters": [{"required": false, "schema": {"type": "boolean", "title": "Is V2", "default": false}, "name": "is_v2", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"anyOf": [{"items": {"$ref": "#/components/schemas/WorkflowDTO"}, "type": "array"}, {"items": {"type": "object"}, "type": "array"}], "title": "Response Get Workflows Workflows Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["workflows", "alerts"], "summary": "Create Workflow", "description": "Create or update a workflow", "operationId": "create_workflow_workflows_post", "requestBody": {"content": {"multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_create_workflow_workflows_post"}}}, "required": true}, "responses": {"201": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowCreateOrUpdateDTO"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/export": {"get": {"tags": ["workflows", "alerts"], "summary": "Export Workflows", "description": "export 
all workflow Yamls", "operationId": "export_workflows_workflows_export_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"type": "string"}, "type": "array", "title": "Response Export Workflows Workflows Export Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/{workflow_id}/run": {"post": {"tags": ["workflows", "alerts"], "summary": "Run Workflow", "description": "Run a workflow", "operationId": "run_workflow_workflows__workflow_id__run_post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, "name": "workflow_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"type": "object", "title": "Body"}}}}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Run Workflow Workflows Workflow Id Run Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/test": {"post": {"tags": ["workflows", "alerts"], "summary": "Run Workflow From Definition", "description": "Test run a workflow from a definition", "operationId": "run_workflow_from_definition_workflows_test_post", "requestBody": {"content": {"multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_run_workflow_from_definition_workflows_test_post"}}}}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Run Workflow From Definition Workflows Test Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/workflows/json": {"post": {"tags": ["workflows", "alerts"], "summary": "Create Workflow From Body", "description": "Create or update a workflow", "operationId": "create_workflow_from_body_workflows_json_post", "responses": {"201": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowCreateOrUpdateDTO"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/random-templates": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Random Workflow Templates", "description": "Get random workflow templates", "operationId": "get_random_workflow_templates_workflows_random_templates_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"type": "object"}, "type": "array", "title": "Response Get Random Workflow Templates Workflows Random Templates Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/{workflow_id}": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Workflow By Id", "description": "Get workflow by ID", "operationId": "get_workflow_by_id_workflows__workflow_id__get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, "name": "workflow_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "put": {"tags": ["workflows", "alerts"], "summary": "Update Workflow By Id", "description": "Update a workflow", "operationId": "update_workflow_by_id_workflows__workflow_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, 
"name": "workflow_id", "in": "path"}], "responses": {"201": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowCreateOrUpdateDTO"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["workflows", "alerts"], "summary": "Delete Workflow By Id", "description": "Delete workflow", "operationId": "delete_workflow_by_id_workflows__workflow_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, "name": "workflow_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/{workflow_id}/raw": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Raw Workflow By Id", "description": "Get workflow executions by ID", "operationId": "get_raw_workflow_by_id_workflows__workflow_id__raw_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, "name": "workflow_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "string", "title": "Response Get Raw Workflow By Id Workflows Workflow Id Raw Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/executions": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Workflow Executions By Alert Fingerprint", 
"description": "Get workflow executions by alert fingerprint", "operationId": "get_workflow_executions_by_alert_fingerprint_workflows_executions_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/WorkflowToAlertExecutionDTO"}, "type": "array", "title": "Response Get Workflow Executions By Alert Fingerprint Workflows Executions Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/workflows/{workflow_id}/runs": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Workflow By Id", "description": "Get workflow executions by ID", "operationId": "get_workflow_by_id_workflows__workflow_id__runs_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Id"}, "name": "workflow_id", "in": "path"}, {"required": false, "schema": {"type": "integer", "title": "Tab", "default": 1}, "name": "tab", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Limit", "default": 25}, "name": "limit", "in": "query"}, {"required": false, "schema": {"type": "integer", "title": "Offset", "default": 0}, "name": "offset", "in": "query"}, {"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Status"}, "name": "status", "in": "query"}, {"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Trigger"}, "name": "trigger", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Execution Id"}, "name": "execution_id", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowExecutionsPaginatedResultsDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": 
[]}]}}, "/workflows/{workflow_id}/runs/{workflow_execution_id}": {"get": {"tags": ["workflows", "alerts"], "summary": "Get Workflow Execution Status", "description": "Get a workflow execution status", "operationId": "get_workflow_execution_status_workflows__workflow_id__runs__workflow_execution_id__get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Workflow Execution Id"}, "name": "workflow_execution_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/WorkflowExecutionDTO"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/whoami": {"get": {"tags": ["whoami"], "summary": "Get Tenant Id", "description": "Get tenant id", "operationId": "get_tenant_id_whoami_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Get Tenant Id Whoami Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/pusher/auth": {"post": {"tags": ["pusher"], "summary": "Pusher Authentication", "description": "Authenticate a user to a private channel\n\nArgs:\n request (Request): The request object\n tenant_id (str, optional): The tenant ID. Defaults to Depends(verify_bearer_token).\n pusher_client (Pusher, optional): Pusher client. 
Defaults to Depends(get_pusher_client).\n\nRaises:\n HTTPException: 403 if the user is not allowed to access the channel.\n\nReturns:\n dict: The authentication response.", "operationId": "pusher_authentication_pusher_auth_post", "requestBody": {"content": {"application/x-www-form-urlencoded": {"schema": {"$ref": "#/components/schemas/Body_pusher_authentication_pusher_auth_post"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Pusher Authentication Pusher Auth Post"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/status": {"get": {"tags": ["status"], "summary": "Status", "description": "simple status endpoint", "operationId": "status_status_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"type": "object", "title": "Response Status Status Get"}}}}}}}, "/rules": {"get": {"tags": ["rules"], "summary": "Get Rules", "description": "Get Rules", "operationId": "get_rules_rules_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["rules"], "summary": "Create Rule", "description": "Create Rule", "operationId": "create_rule_rules_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/RuleCreateDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/rules/{rule_id}": {"put": {"tags": ["rules"], "summary": "Update Rule", "description": "Update Rule", "operationId": "update_rule_rules__rule_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["rules"], "summary": "Delete Rule", "description": "Delete Rule", "operationId": "delete_rule_rules__rule_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/preset": {"get": {"tags": ["preset"], "summary": "Get Presets", "description": "Get all presets for tenant", "operationId": "get_presets_preset_get", "parameters": [{"required": false, "schema": {"type": "string", "title": "Time Stamp"}, "name": "time_stamp", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/PresetDto"}, "type": "array", "title": "Response Get Presets Preset Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["preset"], 
"summary": "Create Preset", "description": "Create a preset for tenant", "operationId": "create_preset_preset_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreateOrUpdatePresetDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/PresetDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/preset/{uuid}": {"put": {"tags": ["preset"], "summary": "Update Preset", "description": "Update a preset for tenant", "operationId": "update_preset_preset__uuid__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Uuid"}, "name": "uuid", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreateOrUpdatePresetDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/PresetDto"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["preset"], "summary": "Delete Preset", "description": "Delete a preset for tenant", "operationId": "delete_preset_preset__uuid__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Uuid"}, "name": "uuid", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/preset/{preset_name}/alerts": {"get": {"tags": ["preset"], "summary": "Get Preset Alerts", "description": "Get the alerts of a preset", "operationId": "get_preset_alerts_preset__preset_name__alerts_get", "parameters": [{"required": true, "schema": {"type": "string", "title": "Preset Name"}, "name": "preset_name", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {}, "type": "array", "title": "Response Get Preset Alerts Preset Preset Name Alerts Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/preset/{preset_id}/tab": {"post": {"tags": ["preset"], "summary": "Create Preset Tab", "description": "Create a tab for a preset", "operationId": "create_preset_tab_preset__preset_id__tab_post", "parameters": [{"required": true, "schema": {"type": "string", "title": "Preset Id"}, "name": "preset_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreatePresetTab"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/preset/{preset_id}/tab/{tab_id}": {"delete": {"tags": ["preset"], "summary": "Delete Tab", "description": "Delete a tab from a preset", "operationId": "delete_tab_preset__preset_id__tab__tab_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Preset Id"}, "name": "preset_id", "in": "path"}, {"required": true, "schema": {"type": "string", "title": "Tab Id"}, "name": "tab_id", 
"in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/mapping": {"get": {"tags": ["enrichment", "mapping"], "summary": "Get Rules", "description": "Get all mapping rules", "operationId": "get_rules_mapping_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/MappingRuleDtoOut"}, "type": "array", "title": "Response Get Rules Mapping Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["enrichment", "mapping"], "summary": "Create Rule", "description": "Create a new mapping rule", "operationId": "create_rule_mapping_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/MappingRuleDtoIn"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/MappingRule"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/mapping/{rule_id}": {"put": {"tags": ["enrichment", "mapping"], "summary": "Update Rule", "description": "Update an existing rule", "operationId": "update_rule_mapping__rule_id__put", "parameters": [{"required": true, "schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/MappingRuleDtoIn"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": 
{"application/json": {"schema": {"$ref": "#/components/schemas/MappingRuleDtoOut"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["enrichment", "mapping"], "summary": "Delete Rule", "description": "Delete a mapping rule", "operationId": "delete_rule_mapping__rule_id__delete", "parameters": [{"required": true, "schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/groups": {"get": {"tags": ["auth", "groups"], "summary": "Get Groups", "description": "Get all groups", "operationId": "get_groups_auth_groups_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/Group"}, "type": "array", "title": "Response Get Groups Auth Groups Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["auth", "groups"], "summary": "Create Group", "description": "Create a group", "operationId": "create_group_auth_groups_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreateOrUpdateGroupRequest"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, 
{"OAuth2PasswordBearer": []}]}}, "/auth/groups/{group_name}": {"put": {"tags": ["auth", "groups"], "summary": "Update Group", "description": "Update a group", "operationId": "update_group_auth_groups__group_name__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Group Name"}, "name": "group_name", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreateOrUpdateGroupRequest"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["auth", "groups"], "summary": "Delete Group", "description": "Delete a group", "operationId": "delete_group_auth_groups__group_name__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Group Name"}, "name": "group_name", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/permissions": {"get": {"tags": ["auth", "permissions"], "summary": "Get Permissions", "description": "Get resources permissions", "operationId": "get_permissions_auth_permissions_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/ResourcePermission"}, "type": "array", "title": "Response Get Permissions Auth Permissions Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["auth", 
"permissions"], "summary": "Create Permissions", "description": "Create permissions for resources", "operationId": "create_permissions_auth_permissions_post", "requestBody": {"content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/ResourcePermission"}, "type": "array", "title": "Resource Permissions", "description": "List of resource permissions"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/permissions/scopes": {"get": {"tags": ["auth", "permissions"], "summary": "Get Scopes", "description": "Get all resources types", "operationId": "get_scopes_auth_permissions_scopes_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"type": "string"}, "type": "array", "title": "Response Get Scopes Auth Permissions Scopes Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/roles": {"get": {"tags": ["auth", "roles"], "summary": "Get Roles", "description": "Get roles", "operationId": "get_roles_auth_roles_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/Role"}, "type": "array", "title": "Response Get Roles Auth Roles Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["auth", "roles"], "summary": "Create Role", "description": "Create role", "operationId": "create_role_auth_roles_post", "requestBody": {"content": {"application/json": {"schema": {"allOf": [{"$ref": "#/components/schemas/CreateOrUpdateRole"}], "title": "Role", "description": "Role"}}}, "required": true}, 
"responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/roles/{role_id}": {"put": {"tags": ["auth", "roles"], "summary": "Update Role", "description": "Update role", "operationId": "update_role_auth_roles__role_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Role Id"}, "name": "role_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"allOf": [{"$ref": "#/components/schemas/CreateOrUpdateRole"}], "title": "Role", "description": "Role"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["auth", "roles"], "summary": "Delete Role", "description": "Delete role", "operationId": "delete_role_auth_roles__role_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Role Id"}, "name": "role_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/users": {"get": {"tags": ["auth", "users"], "summary": "Get Users", "description": "Get all users", "operationId": "get_users_auth_users_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": 
{"schema": {"items": {"$ref": "#/components/schemas/User"}, "type": "array", "title": "Response Get Users Auth Users Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["auth", "users"], "summary": "Create User", "description": "Create a user", "operationId": "create_user_auth_users_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/CreateUserRequest"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/auth/users/{user_email}": {"put": {"tags": ["auth", "users"], "summary": "Update User", "description": "Update a user", "operationId": "update_user_auth_users__user_email__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "User Email"}, "name": "user_email", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/UpdateUserRequest"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["auth", "users"], "summary": "Delete User", "description": "Delete a user", "operationId": "delete_user_auth_users__user_email__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "User Email"}, "name": "user_email", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": 
{"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/metrics": {"get": {"tags": ["metrics"], "summary": "Get Metrics", "description": "This endpoint is used by Prometheus to scrape such metrics from the application:\n- alerts_total {incident_name, incident_id} - The total number of alerts per incident.\n- open_incidents_total - The total number of open incidents.\n- workflows_executions_total {status} - The total number of workflow executions.\n\nPlease note that those metrics are per-tenant and are not designed to be used for the monitoring of the application itself.\n\nExample prometheus configuration:\n```\nscrape_configs:\n- job_name: \"scrape_keep\"\n scrape_interval: 5m # It's important to scrape not too often to avoid rate limiting.\n static_configs:\n - targets: [\"https://api.keephq.dev\"] # Or your own domain.\n authorization:\n type: Bearer\n credentials: \"{Your API Key}\"\n\n # Optional, you can add labels to exported incidents. \n # Label values will be equal to the last incident's alert payload value matching the label.\n # Attention! 
Don't add \"flaky\" labels which could change from alert to alert within the same incident.\n # Good labels: ['labels.department', 'labels.team'], bad labels: ['labels.severity', 'labels.pod_id']\n # Check Keep -> Feed -> \"extraPayload\" column, it will help in writing labels.\n\n params:\n labels: ['labels.service', 'labels.queue']\n # Will result in: \"labels_service\" and \"labels_queue\".\n```", "operationId": "get_metrics_metrics_get", "parameters": [{"required": false, "schema": {"items": {"type": "string"}, "type": "array", "title": "Labels"}, "name": "labels", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/extraction": {"get": {"tags": ["enrichment", "extraction"], "summary": "Get Extraction Rules", "description": "Get all extraction rules", "operationId": "get_extraction_rules_extraction_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/ExtractionRuleDtoOut"}, "type": "array", "title": "Response Get Extraction Rules Extraction Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["enrichment", "extraction"], "summary": "Create Extraction Rule", "description": "Create a new extraction rule", "operationId": "create_extraction_rule_extraction_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/ExtractionRuleDtoBase"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ExtractionRuleDtoOut"}}}}, "422": {"description": "Validation Error", "content":
{"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/extraction/{rule_id}": {"put": {"tags": ["enrichment", "extraction"], "summary": "Update Extraction Rule", "description": "Update an existing extraction rule", "operationId": "update_extraction_rule_extraction__rule_id__put", "parameters": [{"required": true, "schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/ExtractionRuleDtoBase"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ExtractionRuleDtoOut"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["enrichment", "extraction"], "summary": "Delete Extraction Rule", "description": "Delete an extraction rule", "operationId": "delete_extraction_rule_extraction__rule_id__delete", "parameters": [{"required": true, "schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/dashboard": {"get": {"tags": ["dashboard"], "summary": "Read Dashboards", "operationId": "read_dashboards_dashboard_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/DashboardResponseDTO"}, "type": 
"array", "title": "Response Read Dashboards Dashboard Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["dashboard"], "summary": "Create Dashboard", "operationId": "create_dashboard_dashboard_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/DashboardCreateDTO"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/DashboardResponseDTO"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/dashboard/{dashboard_id}": {"put": {"tags": ["dashboard"], "summary": "Update Dashboard", "operationId": "update_dashboard_dashboard__dashboard_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Dashboard Id"}, "name": "dashboard_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/DashboardUpdateDTO"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/DashboardResponseDTO"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["dashboard"], "summary": "Delete Dashboard", "operationId": "delete_dashboard_dashboard__dashboard_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Dashboard Id"}, "name": "dashboard_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", 
"content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/dashboard/metric-widgets": {"get": {"tags": ["dashboard"], "summary": "Get Metric Widgets", "operationId": "get_metric_widgets_dashboard_metric_widgets_get", "parameters": [{"required": false, "schema": {"type": "boolean", "title": "Mttr", "default": true}, "name": "mttr", "in": "query"}, {"required": false, "schema": {"type": "boolean", "title": "Apd", "default": true}, "name": "apd", "in": "query"}, {"required": false, "schema": {"type": "boolean", "title": "Ipd", "default": true}, "name": "ipd", "in": "query"}, {"required": false, "schema": {"type": "boolean", "title": "Wpd", "default": true}, "name": "wpd", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Time Stamp"}, "name": "time_stamp", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/tags": {"get": {"tags": ["tags"], "summary": "Get Tags", "description": "get tags", "operationId": "get_tags_tags_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"type": "object"}, "type": "array", "title": "Response Get Tags Tags Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/maintenance": {"get": {"tags": ["maintenance"], "summary": "Get Maintenance Rules", "description": "Get all maintenance rules", "operationId": "get_maintenance_rules_maintenance_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": 
"#/components/schemas/MaintenanceRuleRead"}, "type": "array", "title": "Response Get Maintenance Rules Maintenance Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["maintenance"], "summary": "Create Maintenance Rule", "description": "Create a new maintenance rule", "operationId": "create_maintenance_rule_maintenance_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/MaintenanceRuleCreate"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/MaintenanceRuleRead"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/maintenance/{rule_id}": {"put": {"tags": ["maintenance"], "summary": "Update Maintenance Rule", "description": "Update an existing maintenance rule", "operationId": "update_maintenance_rule_maintenance__rule_id__put", "parameters": [{"required": true, "schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/MaintenanceRuleCreate"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/MaintenanceRuleRead"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["maintenance"], "summary": "Delete Maintenance Rule", "description": "Delete a maintenance rule", "operationId": "delete_maintenance_rule_maintenance__rule_id__delete", "parameters": [{"required": true, 
"schema": {"type": "integer", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/topology": {"get": {"tags": ["topology"], "summary": "Get Topology Data", "description": "Get all topology data", "operationId": "get_topology_data_topology_get", "parameters": [{"required": false, "schema": {"type": "string", "title": "Provider Ids"}, "name": "provider_ids", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Services"}, "name": "services", "in": "query"}, {"required": false, "schema": {"type": "string", "title": "Environment"}, "name": "environment", "in": "query"}, {"required": false, "schema": {"type": "boolean", "title": "Include Empty Deps", "default": true}, "name": "include_empty_deps", "in": "query"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/TopologyServiceDtoOut"}, "type": "array", "title": "Response Get Topology Data Topology Get"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/topology/applications": {"get": {"tags": ["topology"], "summary": "Get Applications", "description": "Get all applications", "operationId": "get_applications_topology_applications_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"items": {"$ref": "#/components/schemas/TopologyApplicationDtoOut"}, "type": "array", "title": "Response Get Applications Topology Applications Get"}}}}}, 
"security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["topology"], "summary": "Create Application", "description": "Create a new application", "operationId": "create_application_topology_applications_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/TopologyApplicationDtoIn"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/TopologyApplicationDtoOut"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/topology/applications/{application_id}": {"put": {"tags": ["topology"], "summary": "Update Application", "description": "Update an application", "operationId": "update_application_topology_applications__application_id__put", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Application Id"}, "name": "application_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/TopologyApplicationDtoIn"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/TopologyApplicationDtoOut"}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["topology"], "summary": "Delete Application", "description": "Delete an application", "operationId": "delete_application_topology_applications__application_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "format": "uuid", "title": "Application Id"}, "name": 
"application_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/deduplications": {"get": {"tags": ["deduplications"], "summary": "Get Deduplications", "description": "Get Deduplications", "operationId": "get_deduplications_deduplications_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "post": {"tags": ["deduplications"], "summary": "Create Deduplication Rule", "description": "Create Deduplication Rule", "operationId": "create_deduplication_rule_deduplications_post", "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/DeduplicationRuleRequestDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, "/deduplications/fields": {"get": {"tags": ["deduplications"], "summary": "Get Deduplication Fields", "description": "Get Optional Fields For Deduplications", "operationId": "get_deduplication_fields_deduplications_fields_get", "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"additionalProperties": {"items": {"type": "string"}, "type": "array"}, "type": "object", "title": "Response Get Deduplication Fields Deduplications Fields Get"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}, 
"/deduplications/{rule_id}": {"put": {"tags": ["deduplications"], "summary": "Update Deduplication Rule", "description": "Update Deduplication Rule", "operationId": "update_deduplication_rule_deduplications__rule_id__put", "parameters": [{"required": true, "schema": {"type": "string", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "requestBody": {"content": {"application/json": {"schema": {"$ref": "#/components/schemas/DeduplicationRuleRequestDto"}}}, "required": true}, "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}, "delete": {"tags": ["deduplications"], "summary": "Delete Deduplication Rule", "description": "Delete Deduplication Rule", "operationId": "delete_deduplication_rule_deduplications__rule_id__delete", "parameters": [{"required": true, "schema": {"type": "string", "title": "Rule Id"}, "name": "rule_id", "in": "path"}], "responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {}}}}, "422": {"description": "Validation Error", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}}}, "security": [{"API Key": []}, {"HTTPBasic": []}, {"OAuth2PasswordBearer": []}]}}}, "components": {"schemas": {"AlertActionType": {"enum": ["alert was triggered", "alert acknowledged", "alert automatically resolved", "alert automatically resolved by API", "alert manually resolved", "alert status manually changed", "alert status changed by API", "alert status undone", "alert enriched by workflow", "alert enriched by mapping rule", "alert was deduplicated", "alert was assigned with ticket", "alert was unassigned from ticket", "alert ticket was updated", "alert enrichments disposed", "alert deleted", 
"alert enriched", "alert un-enriched", "a comment was added to the alert", "a comment was removed from the alert", "Alert is in maintenance window", "A comment was added to the incident"], "title": "AlertActionType", "description": "An enumeration."}, "AlertAudit": {"properties": {"id": {"type": "string", "format": "uuid", "title": "Id"}, "fingerprint": {"type": "string", "title": "Fingerprint"}, "tenant_id": {"type": "string", "title": "Tenant Id"}, "timestamp": {"type": "string", "format": "date-time", "title": "Timestamp"}, "user_id": {"type": "string", "title": "User Id"}, "action": {"type": "string", "title": "Action"}, "description": {"type": "string", "title": "Description"}}, "type": "object", "required": ["fingerprint", "tenant_id", "user_id", "action", "description"], "title": "AlertAudit"}, "AlertAuditDto": {"properties": {"id": {"type": "string", "title": "Id"}, "timestamp": {"type": "string", "format": "date-time", "title": "Timestamp"}, "fingerprint": {"type": "string", "title": "Fingerprint"}, "action": {"$ref": "#/components/schemas/AlertActionType"}, "user_id": {"type": "string", "title": "User Id"}, "description": {"type": "string", "title": "Description"}}, "type": "object", "required": ["id", "timestamp", "fingerprint", "action", "user_id", "description"], "title": "AlertAuditDto"}, "AlertDto": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "status": {"$ref": "#/components/schemas/AlertStatus"}, "severity": {"$ref": "#/components/schemas/AlertSeverity"}, "lastReceived": {"type": "string", "title": "Lastreceived"}, "firingStartTime": {"type": "string", "title": "Firingstarttime"}, "environment": {"type": "string", "title": "Environment", "default": "undefined"}, "isFullDuplicate": {"type": "boolean", "title": "Isfullduplicate", "default": false}, "isPartialDuplicate": {"type": "boolean", "title": "Ispartialduplicate", "default": false}, "duplicateReason": {"type": "string", "title": 
"Duplicatereason"}, "service": {"type": "string", "title": "Service"}, "source": {"items": {"type": "string"}, "type": "array", "title": "Source", "default": []}, "apiKeyRef": {"type": "string", "title": "Apikeyref"}, "message": {"type": "string", "title": "Message"}, "description": {"type": "string", "title": "Description"}, "pushed": {"type": "boolean", "title": "Pushed", "default": false}, "event_id": {"type": "string", "title": "Event Id"}, "url": {"type": "string", "maxLength": 65536, "minLength": 1, "format": "uri", "title": "Url"}, "labels": {"type": "object", "title": "Labels", "default": {}}, "fingerprint": {"type": "string", "title": "Fingerprint"}, "deleted": {"type": "boolean", "title": "Deleted", "default": false}, "dismissUntil": {"type": "string", "title": "Dismissuntil"}, "dismissed": {"type": "boolean", "title": "Dismissed", "default": false}, "assignee": {"type": "string", "title": "Assignee"}, "providerId": {"type": "string", "title": "Providerid"}, "providerType": {"type": "string", "title": "Providertype"}, "note": {"type": "string", "title": "Note"}, "startedAt": {"type": "string", "title": "Startedat"}, "isNoisy": {"type": "boolean", "title": "Isnoisy", "default": false}, "enriched_fields": {"items": {}, "type": "array", "title": "Enriched Fields", "default": []}, "incident": {"type": "string", "title": "Incident"}}, "type": "object", "required": ["name", "status", "severity", "lastReceived"], "title": "AlertDto", "example": {"id": "1234", "name": "Pod 'api-service-production' lacks memory", "status": "firing", "lastReceived": "2021-01-01T00:00:00.000Z", "environment": "production", "service": "backend", "source": ["prometheus"], "message": "The pod 'api-service-production' lacks memory causing high error rate", "description": "Due to the lack of memory, the pod 'api-service-production' is experiencing high error rate", "severity": "critical", "pushed": true, "url": "https://www.keephq.dev?alertId=1234", "labels": {"pod": 
"api-service-production", "region": "us-east-1", "cpu": "88", "memory": "100Mi"}, "ticket_url": "https://www.keephq.dev?enrichedTicketId=456", "fingerprint": "1234"}}, "AlertSeverity": {"enum": ["critical", "high", "warning", "info", "low"], "title": "AlertSeverity", "description": "An enumeration."}, "AlertStatus": {"enum": ["firing", "resolved", "acknowledged", "suppressed", "pending"], "title": "AlertStatus", "description": "An enumeration."}, "AlertWithIncidentLinkMetadataDto": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "status": {"$ref": "#/components/schemas/AlertStatus"}, "severity": {"$ref": "#/components/schemas/AlertSeverity"}, "lastReceived": {"type": "string", "title": "Lastreceived"}, "firingStartTime": {"type": "string", "title": "Firingstarttime"}, "environment": {"type": "string", "title": "Environment", "default": "undefined"}, "isFullDuplicate": {"type": "boolean", "title": "Isfullduplicate", "default": false}, "isPartialDuplicate": {"type": "boolean", "title": "Ispartialduplicate", "default": false}, "duplicateReason": {"type": "string", "title": "Duplicatereason"}, "service": {"type": "string", "title": "Service"}, "source": {"items": {"type": "string"}, "type": "array", "title": "Source", "default": []}, "apiKeyRef": {"type": "string", "title": "Apikeyref"}, "message": {"type": "string", "title": "Message"}, "description": {"type": "string", "title": "Description"}, "pushed": {"type": "boolean", "title": "Pushed", "default": false}, "event_id": {"type": "string", "title": "Event Id"}, "url": {"type": "string", "maxLength": 65536, "minLength": 1, "format": "uri", "title": "Url"}, "labels": {"type": "object", "title": "Labels", "default": {}}, "fingerprint": {"type": "string", "title": "Fingerprint"}, "deleted": {"type": "boolean", "title": "Deleted", "default": false}, "dismissUntil": {"type": "string", "title": "Dismissuntil"}, "dismissed": {"type": "boolean", "title": "Dismissed", 
"default": false}, "assignee": {"type": "string", "title": "Assignee"}, "providerId": {"type": "string", "title": "Providerid"}, "providerType": {"type": "string", "title": "Providertype"}, "note": {"type": "string", "title": "Note"}, "startedAt": {"type": "string", "title": "Startedat"}, "isNoisy": {"type": "boolean", "title": "Isnoisy", "default": false}, "enriched_fields": {"items": {}, "type": "array", "title": "Enriched Fields", "default": []}, "incident": {"type": "string", "title": "Incident"}, "is_created_by_ai": {"type": "boolean", "title": "Is Created By Ai", "default": false}}, "type": "object", "required": ["name", "status", "severity", "lastReceived"], "title": "AlertWithIncidentLinkMetadataDto", "example": {"id": "1234", "name": "Pod 'api-service-production' lacks memory", "status": "firing", "lastReceived": "2021-01-01T00:00:00.000Z", "environment": "production", "service": "backend", "source": ["prometheus"], "message": "The pod 'api-service-production' lacks memory causing high error rate", "description": "Due to the lack of memory, the pod 'api-service-production' is experiencing high error rate", "severity": "critical", "pushed": true, "url": "https://www.keephq.dev?alertId=1234", "labels": {"pod": "api-service-production", "region": "us-east-1", "cpu": "88", "memory": "100Mi"}, "ticket_url": "https://www.keephq.dev?enrichedTicketId=456", "fingerprint": "1234"}}, "AlertWithIncidentLinkMetadataPaginatedResultsDto": {"properties": {"limit": {"type": "integer", "title": "Limit", "default": 25}, "offset": {"type": "integer", "title": "Offset", "default": 0}, "count": {"type": "integer", "title": "Count"}, "items": {"items": {"$ref": "#/components/schemas/AlertWithIncidentLinkMetadataDto"}, "type": "array", "title": "Items"}}, "type": "object", "required": ["count", "items"], "title": "AlertWithIncidentLinkMetadataPaginatedResultsDto"}, "Body_create_actions_actions_post": {"properties": {"file": {"type": "string", "format": "binary", "title": 
"File"}}, "type": "object", "title": "Body_create_actions_actions_post"}, "Body_create_workflow_workflows_post": {"properties": {"file": {"type": "string", "format": "binary", "title": "File"}}, "type": "object", "required": ["file"], "title": "Body_create_workflow_workflows_post"}, "Body_pusher_authentication_pusher_auth_post": {"properties": {"channel_name": {"title": "Channel Name"}, "socket_id": {"title": "Socket Id"}}, "type": "object", "required": ["channel_name", "socket_id"], "title": "Body_pusher_authentication_pusher_auth_post"}, "Body_put_action_actions__action_id__put": {"properties": {"file": {"type": "string", "format": "binary", "title": "File"}}, "type": "object", "required": ["file"], "title": "Body_put_action_actions__action_id__put"}, "Body_run_workflow_from_definition_workflows_test_post": {"properties": {"file": {"type": "string", "format": "binary", "title": "File"}}, "type": "object", "title": "Body_run_workflow_from_definition_workflows_test_post"}, "CreateOrUpdateGroupRequest": {"properties": {"name": {"type": "string", "title": "Name"}, "roles": {"items": {"type": "string"}, "type": "array", "title": "Roles"}, "members": {"items": {"type": "string"}, "type": "array", "title": "Members"}}, "type": "object", "required": ["name", "roles", "members"], "title": "CreateOrUpdateGroupRequest"}, "CreateOrUpdatePresetDto": {"properties": {"name": {"type": "string", "title": "Name"}, "options": {"items": {"$ref": "#/components/schemas/PresetOption"}, "type": "array", "title": "Options"}, "is_private": {"type": "boolean", "title": "Is Private", "default": false}, "is_noisy": {"type": "boolean", "title": "Is Noisy", "default": false}, "tags": {"items": {"$ref": "#/components/schemas/TagDto"}, "type": "array", "title": "Tags", "default": []}}, "type": "object", "required": ["options"], "title": "CreateOrUpdatePresetDto"}, "CreateOrUpdateRole": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": 
"Description"}, "scopes": {"items": {"type": "string"}, "type": "array", "uniqueItems": true, "title": "Scopes"}}, "type": "object", "title": "CreateOrUpdateRole"}, "CreatePresetTab": {"properties": {"name": {"type": "string", "title": "Name"}, "filter": {"type": "string", "title": "Filter"}}, "type": "object", "required": ["name", "filter"], "title": "CreatePresetTab"}, "CreateUserRequest": {"properties": {"username": {"type": "string", "title": "Username"}, "name": {"type": "string", "title": "Name"}, "password": {"type": "string", "title": "Password"}, "role": {"type": "string", "title": "Role"}, "groups": {"items": {"type": "string"}, "type": "array", "title": "Groups"}}, "type": "object", "required": ["username"], "title": "CreateUserRequest"}, "DashboardCreateDTO": {"properties": {"dashboard_name": {"type": "string", "title": "Dashboard Name"}, "dashboard_config": {"type": "object", "title": "Dashboard Config"}}, "type": "object", "required": ["dashboard_name", "dashboard_config"], "title": "DashboardCreateDTO"}, "DashboardResponseDTO": {"properties": {"id": {"type": "string", "title": "Id"}, "dashboard_name": {"type": "string", "title": "Dashboard Name"}, "dashboard_config": {"type": "object", "title": "Dashboard Config"}, "created_at": {"type": "string", "format": "date-time", "title": "Created At"}, "updated_at": {"type": "string", "format": "date-time", "title": "Updated At"}}, "type": "object", "required": ["id", "dashboard_name", "dashboard_config", "created_at", "updated_at"], "title": "DashboardResponseDTO"}, "DashboardUpdateDTO": {"properties": {"dashboard_config": {"type": "object", "title": "Dashboard Config"}, "dashboard_name": {"type": "string", "title": "Dashboard Name"}}, "type": "object", "title": "DashboardUpdateDTO"}, "DeduplicationRuleRequestDto": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "provider_type": {"type": "string", "title": "Provider Type"}, 
"provider_id": {"type": "string", "title": "Provider Id"}, "fingerprint_fields": {"items": {"type": "string"}, "type": "array", "title": "Fingerprint Fields"}, "full_deduplication": {"type": "boolean", "title": "Full Deduplication", "default": false}, "ignore_fields": {"items": {"type": "string"}, "type": "array", "title": "Ignore Fields"}}, "type": "object", "required": ["name", "provider_type", "fingerprint_fields"], "title": "DeduplicationRuleRequestDto"}, "DeleteRequestBody": {"properties": {"fingerprint": {"type": "string", "title": "Fingerprint"}, "lastReceived": {"type": "string", "title": "Lastreceived"}, "restore": {"type": "boolean", "title": "Restore", "default": false}}, "type": "object", "required": ["fingerprint", "lastReceived"], "title": "DeleteRequestBody"}, "EnrichAlertRequestBody": {"properties": {"enrichments": {"additionalProperties": {"type": "string"}, "type": "object", "title": "Enrichments"}, "fingerprint": {"type": "string", "title": "Fingerprint"}}, "type": "object", "required": ["enrichments", "fingerprint"], "title": "EnrichAlertRequestBody"}, "ExtractionRuleDtoBase": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "priority": {"type": "integer", "title": "Priority", "default": 0}, "attribute": {"type": "string", "title": "Attribute"}, "condition": {"type": "string", "title": "Condition"}, "disabled": {"type": "boolean", "title": "Disabled", "default": false}, "regex": {"type": "string", "title": "Regex"}, "pre": {"type": "boolean", "title": "Pre", "default": false}}, "type": "object", "required": ["name", "regex"], "title": "ExtractionRuleDtoBase"}, "ExtractionRuleDtoOut": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "priority": {"type": "integer", "title": "Priority", "default": 0}, "attribute": {"type": "string", "title": "Attribute"}, "condition": {"type": "string", "title": "Condition"}, 
"disabled": {"type": "boolean", "title": "Disabled", "default": false}, "regex": {"type": "string", "title": "Regex"}, "pre": {"type": "boolean", "title": "Pre", "default": false}, "id": {"type": "integer", "title": "Id"}, "created_by": {"type": "string", "title": "Created By"}, "created_at": {"type": "string", "format": "date-time", "title": "Created At"}, "updated_by": {"type": "string", "title": "Updated By"}, "updated_at": {"type": "string", "format": "date-time", "title": "Updated At"}}, "type": "object", "required": ["name", "regex", "id", "created_at"], "title": "ExtractionRuleDtoOut"}, "Group": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "roles": {"items": {"type": "string"}, "type": "array", "title": "Roles", "default": []}, "members": {"items": {"type": "string"}, "type": "array", "title": "Members", "default": []}, "memberCount": {"type": "integer", "title": "Membercount", "default": 0}}, "type": "object", "required": ["id", "name"], "title": "Group"}, "HTTPValidationError": {"properties": {"detail": {"items": {"$ref": "#/components/schemas/ValidationError"}, "type": "array", "title": "Detail"}}, "type": "object", "title": "HTTPValidationError"}, "IncidentCommit": {"properties": {"accepted": {"type": "boolean", "title": "Accepted"}, "original_suggestion": {"type": "object", "title": "Original Suggestion"}, "changes": {"type": "object", "title": "Changes"}, "incident": {"$ref": "#/components/schemas/IncidentDto"}}, "type": "object", "required": ["accepted", "original_suggestion", "incident"], "title": "IncidentCommit"}, "IncidentDto": {"properties": {"user_generated_name": {"type": "string", "title": "User Generated Name"}, "assignee": {"type": "string", "title": "Assignee"}, "user_summary": {"type": "string", "title": "User Summary"}, "same_incident_in_the_past_id": {"type": "string", "format": "uuid", "title": "Same Incident In The Past Id"}, "id": {"type": "string", "format": "uuid", "title": 
"Id"}, "start_time": {"type": "string", "format": "date-time", "title": "Start Time"}, "last_seen_time": {"type": "string", "format": "date-time", "title": "Last Seen Time"}, "end_time": {"type": "string", "format": "date-time", "title": "End Time"}, "creation_time": {"type": "string", "format": "date-time", "title": "Creation Time"}, "alerts_count": {"type": "integer", "title": "Alerts Count"}, "alert_sources": {"items": {"type": "string"}, "type": "array", "title": "Alert Sources"}, "severity": {"$ref": "#/components/schemas/IncidentSeverity"}, "status": {"allOf": [{"$ref": "#/components/schemas/IncidentStatus"}], "default": "firing"}, "services": {"items": {"type": "string"}, "type": "array", "title": "Services"}, "is_predicted": {"type": "boolean", "title": "Is Predicted"}, "is_confirmed": {"type": "boolean", "title": "Is Confirmed"}, "generated_summary": {"type": "string", "title": "Generated Summary"}, "ai_generated_name": {"type": "string", "title": "Ai Generated Name"}, "rule_fingerprint": {"type": "string", "title": "Rule Fingerprint"}, "fingerprint": {"type": "string", "title": "Fingerprint"}, "merged_into_incident_id": {"type": "string", "format": "uuid", "title": "Merged Into Incident Id"}, "merged_by": {"type": "string", "title": "Merged By"}, "merged_at": {"type": "string", "format": "date-time", "title": "Merged At"}}, "type": "object", "required": ["id", "alerts_count", "alert_sources", "severity", "services", "is_predicted", "is_confirmed"], "title": "IncidentDto", "example": {"id": "c2509cb3-6168-4347-b83b-a41da9df2d5b", "name": "Incident name", "user_summary": "Keep: Incident description", "status": "firing"}}, "IncidentDtoIn": {"properties": {"user_generated_name": {"type": "string", "title": "User Generated Name"}, "assignee": {"type": "string", "title": "Assignee"}, "user_summary": {"type": "string", "title": "User Summary"}, "same_incident_in_the_past_id": {"type": "string", "format": "uuid", "title": "Same Incident In The Past Id"}}, "type": 
"object", "title": "IncidentDtoIn", "example": {"id": "c2509cb3-6168-4347-b83b-a41da9df2d5b", "name": "Incident name", "user_summary": "Keep: Incident description", "status": "firing"}}, "IncidentListFilterParamsDto": {"properties": {"statuses": {"items": {"$ref": "#/components/schemas/IncidentStatus"}, "type": "array", "default": ["firing", "resolved", "acknowledged", "merged"]}, "severities": {"items": {"$ref": "#/components/schemas/IncidentSeverity"}, "type": "array", "default": ["critical", "high", "warning", "info", "low"]}, "assignees": {"items": {"type": "string"}, "type": "array", "title": "Assignees"}, "services": {"items": {"type": "string"}, "type": "array", "title": "Services"}, "sources": {"items": {"type": "string"}, "type": "array", "title": "Sources"}}, "type": "object", "required": ["assignees", "services", "sources"], "title": "IncidentListFilterParamsDto"}, "IncidentSeverity": {"enum": ["critical", "high", "warning", "info", "low"], "title": "IncidentSeverity", "description": "An enumeration."}, "IncidentSorting": {"enum": ["creation_time", "start_time", "last_seen_time", "severity", "status", "alerts_count", "-creation_time", "-start_time", "-last_seen_time", "-severity", "-status", "-alerts_count"], "title": "IncidentSorting", "description": "An enumeration."}, "IncidentStatus": {"enum": ["firing", "resolved", "acknowledged", "merged"], "title": "IncidentStatus", "description": "An enumeration."}, "IncidentStatusChangeDto": {"properties": {"status": {"$ref": "#/components/schemas/IncidentStatus"}, "comment": {"type": "string", "title": "Comment"}}, "type": "object", "required": ["status"], "title": "IncidentStatusChangeDto"}, "IncidentsClusteringSuggestion": {"properties": {"incident_suggestion": {"items": {"$ref": "#/components/schemas/IncidentDto"}, "type": "array", "title": "Incident Suggestion"}, "suggestion_id": {"type": "string", "title": "Suggestion Id"}}, "type": "object", "required": ["incident_suggestion", "suggestion_id"], "title": 
"IncidentsClusteringSuggestion"}, "IncidentsPaginatedResultsDto": {"properties": {"limit": {"type": "integer", "title": "Limit", "default": 25}, "offset": {"type": "integer", "title": "Offset", "default": 0}, "count": {"type": "integer", "title": "Count"}, "items": {"items": {"$ref": "#/components/schemas/IncidentDto"}, "type": "array", "title": "Items"}}, "type": "object", "required": ["count", "items"], "title": "IncidentsPaginatedResultsDto"}, "MaintenanceRuleCreate": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "cel_query": {"type": "string", "title": "Cel Query"}, "start_time": {"type": "string", "format": "date-time", "title": "Start Time"}, "duration_seconds": {"type": "integer", "title": "Duration Seconds"}, "suppress": {"type": "boolean", "title": "Suppress", "default": false}, "enabled": {"type": "boolean", "title": "Enabled", "default": true}}, "type": "object", "required": ["name", "cel_query", "start_time"], "title": "MaintenanceRuleCreate"}, "MaintenanceRuleRead": {"properties": {"id": {"type": "integer", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "created_by": {"type": "string", "title": "Created By"}, "cel_query": {"type": "string", "title": "Cel Query"}, "start_time": {"type": "string", "format": "date-time", "title": "Start Time"}, "end_time": {"type": "string", "format": "date-time", "title": "End Time"}, "duration_seconds": {"type": "integer", "title": "Duration Seconds"}, "updated_at": {"type": "string", "format": "date-time", "title": "Updated At"}, "suppress": {"type": "boolean", "title": "Suppress", "default": false}, "enabled": {"type": "boolean", "title": "Enabled", "default": true}}, "type": "object", "required": ["id", "name", "created_by", "cel_query", "start_time", "end_time"], "title": "MaintenanceRuleRead"}, "MappingRule": {"properties": {"id": {"type": "integer", "title": "Id"}, 
"tenant_id": {"type": "string", "title": "Tenant Id"}, "priority": {"type": "integer", "title": "Priority", "default": 0}, "name": {"type": "string", "maxLength": 255, "title": "Name"}, "description": {"type": "string", "maxLength": 2048, "title": "Description"}, "file_name": {"type": "string", "maxLength": 255, "title": "File Name"}, "created_by": {"type": "string", "maxLength": 255, "title": "Created By"}, "created_at": {"type": "string", "format": "date-time", "title": "Created At"}, "disabled": {"type": "boolean", "title": "Disabled", "default": false}, "override": {"type": "boolean", "title": "Override", "default": true}, "condition": {"type": "string", "maxLength": 2000, "title": "Condition"}, "type": {"type": "string", "maxLength": 255, "title": "Type"}, "matchers": {"items": {"type": "string"}, "type": "array", "title": "Matchers"}, "rows": {"items": {"type": "object"}, "type": "array", "title": "Rows"}, "updated_by": {"type": "string", "maxLength": 255, "title": "Updated By"}, "last_updated_at": {"type": "string", "format": "date-time", "title": "Last Updated At"}}, "type": "object", "required": ["tenant_id", "name", "type", "matchers"], "title": "MappingRule"}, "MappingRuleDtoIn": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "file_name": {"type": "string", "title": "File Name"}, "priority": {"type": "integer", "title": "Priority", "default": 0}, "matchers": {"items": {"type": "string"}, "type": "array", "title": "Matchers"}, "type": {"type": "string", "enum": ["csv", "topology"], "title": "Type", "default": "csv"}, "rows": {"items": {"type": "object"}, "type": "array", "title": "Rows"}}, "type": "object", "required": ["name", "matchers"], "title": "MappingRuleDtoIn"}, "MappingRuleDtoOut": {"properties": {"name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "file_name": {"type": "string", "title": "File Name"}, "priority": {"type": 
"integer", "title": "Priority", "default": 0}, "matchers": {"items": {"type": "string"}, "type": "array", "title": "Matchers"}, "type": {"type": "string", "enum": ["csv", "topology"], "title": "Type", "default": "csv"}, "id": {"type": "integer", "title": "Id"}, "created_by": {"type": "string", "title": "Created By"}, "created_at": {"type": "string", "format": "date-time", "title": "Created At"}, "attributes": {"items": {"type": "string"}, "type": "array", "title": "Attributes", "default": []}, "updated_by": {"type": "string", "title": "Updated By"}, "last_updated_at": {"type": "string", "format": "date-time", "title": "Last Updated At"}}, "type": "object", "required": ["name", "matchers", "id", "created_at"], "title": "MappingRuleDtoOut"}, "MergeIncidentsRequestDto": {"properties": {"source_incident_ids": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Source Incident Ids"}, "destination_incident_id": {"type": "string", "format": "uuid", "title": "Destination Incident Id"}}, "type": "object", "required": ["source_incident_ids", "destination_incident_id"], "title": "MergeIncidentsRequestDto"}, "MergeIncidentsResponseDto": {"properties": {"merged_incident_ids": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Merged Incident Ids"}, "skipped_incident_ids": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Skipped Incident Ids"}, "failed_incident_ids": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Failed Incident Ids"}, "destination_incident_id": {"type": "string", "format": "uuid", "title": "Destination Incident Id"}, "message": {"type": "string", "title": "Message"}}, "type": "object", "required": ["merged_incident_ids", "skipped_incident_ids", "failed_incident_ids", "destination_incident_id", "message"], "title": "MergeIncidentsResponseDto"}, "PermissionEntity": {"properties": {"id": {"type": "string", "title": "Id"}, "type": {"type": "string", "title": 
"Type"}, "name": {"type": "string", "title": "Name"}}, "type": "object", "required": ["id", "type"], "title": "PermissionEntity"}, "PresetDto": {"properties": {"id": {"type": "string", "format": "uuid", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "options": {"items": {}, "type": "array", "title": "Options", "default": []}, "created_by": {"type": "string", "title": "Created By"}, "is_private": {"type": "boolean", "title": "Is Private", "default": false}, "is_noisy": {"type": "boolean", "title": "Is Noisy", "default": false}, "should_do_noise_now": {"type": "boolean", "title": "Should Do Noise Now", "default": false}, "alerts_count": {"type": "integer", "title": "Alerts Count", "default": 0}, "static": {"type": "boolean", "title": "Static", "default": false}, "tags": {"items": {"$ref": "#/components/schemas/TagDto"}, "type": "array", "title": "Tags", "default": []}}, "type": "object", "required": ["id", "name"], "title": "PresetDto"}, "PresetOption": {"properties": {"label": {"type": "string", "title": "Label"}, "value": {"anyOf": [{"type": "string"}, {"type": "object"}], "title": "Value"}}, "type": "object", "required": ["label", "value"], "title": "PresetOption"}, "PresetSearchQuery": {"properties": {"cel_query": {"type": "string", "minLength": 0, "title": "Cel Query"}, "sql_query": {"type": "object", "title": "Sql Query"}, "limit": {"type": "integer", "minimum": 0.0, "title": "Limit", "default": 1000}, "timeframe": {"type": "integer", "minimum": 0.0, "title": "Timeframe", "default": 0}}, "type": "object", "required": ["cel_query", "sql_query"], "title": "PresetSearchQuery"}, "ProviderDTO": {"properties": {"type": {"type": "string", "title": "Type"}, "id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "installed": {"type": "boolean", "title": "Installed"}}, "type": "object", "required": ["type", "name", "installed"], "title": "ProviderDTO"}, "ProviderWebhookSettings": {"properties": {"webhookDescription": {"type": 
"string", "title": "Webhookdescription"}, "webhookTemplate": {"type": "string", "title": "Webhooktemplate"}, "webhookMarkdown": {"type": "string", "title": "Webhookmarkdown"}}, "type": "object", "required": ["webhookTemplate"], "title": "ProviderWebhookSettings"}, "ResourcePermission": {"properties": {"resource_id": {"type": "string", "title": "Resource Id"}, "resource_name": {"type": "string", "title": "Resource Name"}, "resource_type": {"type": "string", "title": "Resource Type"}, "permissions": {"items": {"$ref": "#/components/schemas/PermissionEntity"}, "type": "array", "title": "Permissions"}}, "type": "object", "required": ["resource_id", "resource_name", "resource_type", "permissions"], "title": "ResourcePermission"}, "Role": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "scopes": {"items": {"type": "string"}, "type": "array", "uniqueItems": true, "title": "Scopes"}, "predefined": {"type": "boolean", "title": "Predefined", "default": true}}, "type": "object", "required": ["id", "name", "description", "scopes"], "title": "Role"}, "RuleCreateDto": {"properties": {"ruleName": {"type": "string", "title": "Rulename"}, "sqlQuery": {"type": "object", "title": "Sqlquery"}, "celQuery": {"type": "string", "title": "Celquery"}, "timeframeInSeconds": {"type": "integer", "title": "Timeframeinseconds"}, "timeUnit": {"type": "string", "title": "Timeunit"}, "groupingCriteria": {"items": {}, "type": "array", "title": "Groupingcriteria", "default": []}, "groupDescription": {"type": "string", "title": "Groupdescription"}, "requireApprove": {"type": "boolean", "title": "Requireapprove", "default": false}, "resolveOn": {"type": "string", "title": "Resolveon", "default": "never"}}, "type": "object", "required": ["ruleName", "sqlQuery", "celQuery", "timeframeInSeconds", "timeUnit"], "title": "RuleCreateDto"}, "SMTPSettings": {"properties": {"host": {"type": "string", 
"title": "Host"}, "port": {"type": "integer", "title": "Port"}, "from_email": {"type": "string", "title": "From Email"}, "username": {"type": "string", "title": "Username"}, "password": {"type": "string", "format": "password", "title": "Password", "writeOnly": true}, "secure": {"type": "boolean", "title": "Secure", "default": true}, "to_email": {"type": "string", "title": "To Email", "default": "keep@example.com"}}, "type": "object", "required": ["host", "port", "from_email"], "title": "SMTPSettings", "example": {"host": "smtp.example.com", "port": 587, "username": "user@example.com", "password": "password", "secure": true, "from_email": "noreply@example.com", "to_email": ""}}, "SearchAlertsRequest": {"properties": {"query": {"$ref": "#/components/schemas/PresetSearchQuery"}, "timeframe": {"type": "integer", "title": "Timeframe"}}, "type": "object", "required": ["query", "timeframe"], "title": "SearchAlertsRequest"}, "TagDto": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name"}}, "type": "object", "required": ["name"], "title": "TagDto"}, "TopologyApplicationDtoIn": {"properties": {"id": {"type": "string", "format": "uuid", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "services": {"items": {"$ref": "#/components/schemas/TopologyServiceDtoIn"}, "type": "array", "title": "Services", "default": []}}, "type": "object", "required": ["name"], "title": "TopologyApplicationDtoIn"}, "TopologyApplicationDtoOut": {"properties": {"id": {"type": "string", "format": "uuid", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "description": {"type": "string", "title": "Description"}, "services": {"items": {"$ref": "#/components/schemas/TopologyApplicationServiceDto"}, "type": "array", "title": "Services", "default": []}}, "type": "object", "required": ["id", "name"], "title": "TopologyApplicationDtoOut"}, "TopologyApplicationServiceDto": 
{"properties": {"id": {"type": "integer", "title": "Id"}, "name": {"type": "string", "title": "Name"}, "service": {"type": "string", "title": "Service"}}, "type": "object", "required": ["id", "name", "service"], "title": "TopologyApplicationServiceDto"}, "TopologyServiceDependencyDto": {"properties": {"serviceId": {"type": "integer", "title": "Serviceid"}, "serviceName": {"type": "string", "title": "Servicename"}, "protocol": {"type": "string", "title": "Protocol", "default": "unknown"}}, "type": "object", "required": ["serviceId", "serviceName"], "title": "TopologyServiceDependencyDto"}, "TopologyServiceDtoIn": {"properties": {"id": {"type": "integer", "title": "Id"}}, "type": "object", "required": ["id"], "title": "TopologyServiceDtoIn"}, "TopologyServiceDtoOut": {"properties": {"source_provider_id": {"type": "string", "title": "Source Provider Id"}, "repository": {"type": "string", "title": "Repository"}, "tags": {"items": {"type": "string"}, "type": "array", "title": "Tags"}, "service": {"type": "string", "title": "Service"}, "display_name": {"type": "string", "title": "Display Name"}, "environment": {"type": "string", "title": "Environment", "default": "unknown"}, "description": {"type": "string", "title": "Description"}, "team": {"type": "string", "title": "Team"}, "email": {"type": "string", "title": "Email"}, "slack": {"type": "string", "title": "Slack"}, "ip_address": {"type": "string", "title": "Ip Address"}, "mac_address": {"type": "string", "title": "Mac Address"}, "category": {"type": "string", "title": "Category"}, "manufacturer": {"type": "string", "title": "Manufacturer"}, "id": {"type": "integer", "title": "Id"}, "dependencies": {"items": {"$ref": "#/components/schemas/TopologyServiceDependencyDto"}, "type": "array", "title": "Dependencies"}, "application_ids": {"items": {"type": "string", "format": "uuid"}, "type": "array", "title": "Application Ids"}, "updated_at": {"type": "string", "format": "date-time", "title": "Updated At"}}, "type": 
"object", "required": ["service", "display_name", "id", "dependencies", "application_ids"], "title": "TopologyServiceDtoOut"}, "UnEnrichAlertRequestBody": {"properties": {"enrichments": {"items": {"type": "string"}, "type": "array", "title": "Enrichments"}, "fingerprint": {"type": "string", "title": "Fingerprint"}}, "type": "object", "required": ["enrichments", "fingerprint"], "title": "UnEnrichAlertRequestBody"}, "UpdateUserRequest": {"properties": {"username": {"type": "string", "title": "Username"}, "password": {"type": "string", "title": "Password"}, "role": {"type": "string", "title": "Role"}, "groups": {"items": {"type": "string"}, "type": "array", "title": "Groups"}}, "type": "object", "title": "UpdateUserRequest"}, "User": {"properties": {"email": {"type": "string", "title": "Email"}, "name": {"type": "string", "title": "Name"}, "role": {"type": "string", "title": "Role"}, "picture": {"type": "string", "title": "Picture"}, "created_at": {"type": "string", "title": "Created At"}, "last_login": {"type": "string", "title": "Last Login"}, "ldap": {"type": "boolean", "title": "Ldap", "default": false}, "groups": {"items": {"$ref": "#/components/schemas/Group"}, "type": "array", "title": "Groups", "default": []}}, "type": "object", "required": ["email", "name", "created_at"], "title": "User"}, "ValidationError": {"properties": {"loc": {"items": {"anyOf": [{"type": "string"}, {"type": "integer"}]}, "type": "array", "title": "Location"}, "msg": {"type": "string", "title": "Message"}, "type": {"type": "string", "title": "Error Type"}}, "type": "object", "required": ["loc", "msg", "type"], "title": "ValidationError"}, "WebhookSettings": {"properties": {"webhookApi": {"type": "string", "title": "Webhookapi"}, "apiKey": {"type": "string", "title": "Apikey"}, "modelSchema": {"type": "object", "title": "Modelschema"}}, "type": "object", "required": ["webhookApi", "apiKey", "modelSchema"], "title": "WebhookSettings"}, "WorkflowCreateOrUpdateDTO": {"properties": 
{"workflow_id": {"type": "string", "title": "Workflow Id"}, "status": {"type": "string", "enum": ["created", "updated"], "title": "Status"}, "revision": {"type": "integer", "title": "Revision", "default": 1}}, "type": "object", "required": ["workflow_id", "status"], "title": "WorkflowCreateOrUpdateDTO"}, "WorkflowDTO": {"properties": {"id": {"type": "string", "title": "Id"}, "name": {"type": "string", "title": "Name", "default": "Workflow file doesn't contain name"}, "description": {"type": "string", "title": "Description", "default": "Workflow file doesn't contain description"}, "created_by": {"type": "string", "title": "Created By"}, "creation_time": {"type": "string", "format": "date-time", "title": "Creation Time"}, "triggers": {"items": {"type": "object"}, "type": "array", "title": "Triggers"}, "interval": {"type": "integer", "title": "Interval"}, "disabled": {"type": "boolean", "title": "Disabled", "default": false}, "last_execution_time": {"type": "string", "format": "date-time", "title": "Last Execution Time"}, "last_execution_status": {"type": "string", "title": "Last Execution Status"}, "providers": {"items": {"$ref": "#/components/schemas/ProviderDTO"}, "type": "array", "title": "Providers"}, "workflow_raw": {"type": "string", "title": "Workflow Raw"}, "revision": {"type": "integer", "title": "Revision", "default": 1}, "last_updated": {"type": "string", "format": "date-time", "title": "Last Updated"}, "invalid": {"type": "boolean", "title": "Invalid", "default": false}, "last_executions": {"items": {"type": "object"}, "type": "array", "title": "Last Executions"}, "last_execution_started": {"type": "string", "format": "date-time", "title": "Last Execution Started"}, "provisioned": {"type": "boolean", "title": "Provisioned", "default": false}, "provisioned_file": {"type": "string", "title": "Provisioned File"}}, "type": "object", "required": ["id", "created_by", "creation_time", "providers", "workflow_raw"], "title": "WorkflowDTO"}, "WorkflowExecutionDTO": 
{"properties": {"id": {"type": "string", "title": "Id"}, "workflow_id": {"type": "string", "title": "Workflow Id"}, "started": {"type": "string", "format": "date-time", "title": "Started"}, "triggered_by": {"type": "string", "title": "Triggered By"}, "status": {"type": "string", "title": "Status"}, "workflow_name": {"type": "string", "title": "Workflow Name"}, "logs": {"items": {"$ref": "#/components/schemas/WorkflowExecutionLogsDTO"}, "type": "array", "title": "Logs"}, "error": {"type": "string", "title": "Error"}, "execution_time": {"type": "number", "title": "Execution Time"}, "results": {"type": "object", "title": "Results"}}, "type": "object", "required": ["id", "workflow_id", "started", "triggered_by", "status"], "title": "WorkflowExecutionDTO"}, "WorkflowExecutionLogsDTO": {"properties": {"id": {"type": "integer", "title": "Id"}, "timestamp": {"type": "string", "format": "date-time", "title": "Timestamp"}, "message": {"type": "string", "title": "Message"}, "context": {"type": "object", "title": "Context"}}, "type": "object", "required": ["id", "timestamp", "message"], "title": "WorkflowExecutionLogsDTO"}, "WorkflowExecutionsPaginatedResultsDto": {"properties": {"limit": {"type": "integer", "title": "Limit", "default": 25}, "offset": {"type": "integer", "title": "Offset", "default": 0}, "count": {"type": "integer", "title": "Count"}, "items": {"items": {"$ref": "#/components/schemas/WorkflowExecutionDTO"}, "type": "array", "title": "Items"}, "passCount": {"type": "integer", "title": "Passcount", "default": 0}, "avgDuration": {"type": "number", "title": "Avgduration", "default": 0.0}, "workflow": {"$ref": "#/components/schemas/WorkflowDTO"}, "failCount": {"type": "integer", "title": "Failcount", "default": 0}}, "type": "object", "required": ["count", "items"], "title": "WorkflowExecutionsPaginatedResultsDto"}, "WorkflowToAlertExecutionDTO": {"properties": {"workflow_id": {"type": "string", "title": "Workflow Id"}, "workflow_execution_id": {"type": "string", 
"title": "Workflow Execution Id"}, "alert_fingerprint": {"type": "string", "title": "Alert Fingerprint"}, "workflow_status": {"type": "string", "title": "Workflow Status"}, "workflow_started": {"type": "string", "format": "date-time", "title": "Workflow Started"}}, "type": "object", "required": ["workflow_id", "workflow_execution_id", "alert_fingerprint", "workflow_status", "workflow_started"], "title": "WorkflowToAlertExecutionDTO"}}, "securitySchemes": {"API Key": {"type": "apiKey", "in": "header", "name": "X-API-KEY"}, "HTTPBasic": {"type": "http", "scheme": "basic"}, "OAuth2PasswordBearer": {"type": "oauth2", "flows": {"password": {"scopes": {}, "tokenUrl": "token"}}}}}} ================================================ FILE: docs/overview/ai-correlation.mdx ================================================ --- title: "AI Correlation" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: ⛔️
Keep's AI correlation engine provides a distinctive approach to fully AI-driven alert correlation. By using historical alert data as its training dataset, the system intelligently classifies new alerts and assigns them to appropriate incidents. The AI correlator runs in cycles; each cycle completes in 5-15 minutes: 1) The model is trained on historical data. 2) The model is evaluated. 3) All unassigned alerts are clustered and added to incidents when their confidence score exceeds the threshold. Configuration UI: Incident with alerts correlated by AI: Check the demo on a playground: https://playground.keephq.dev/ai To activate the feature for your on-premises tenant, please [talk to us](https://www.keephq.dev/meet-keep). ## Frequent questions: **Model used:** proprietary model developed and hosted by Keep.
**Training dataset:** tenant's alerts and incidents.
**Privacy:** tenant's data is used only for training of the model for the same tenant. Data is not mixed between tenants for training. ================================================ FILE: docs/overview/ai-in-workflows.mdx ================================================ --- title: "AI in Workflows" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: ✅
AI in workflows enables you to integrate third-party AI providers as "steps" and "actions" within your workflows. It can be useful for: 1. Human input normalization. 2. Routing. 3. Severity definition. 4. Summarization. Supported providers include DeepSeek, OpenAI, Anthropic, Grok, Gemini, Ollama, Llama.cpp, vLLM, and more. Check the "AI" filter on the "Providers" page for a complete list. Blogpost with examples: https://www.keephq.dev/blog/launch-week-ai-powered-workflows ## Frequent questions: **Model used:** client's own 3rd-party LLM provider. Could be cloud or self-hosted.
**Privacy:** Data stays within Keep unless it's explicitly sent via a workflow to an explicitly connected 3rd-party provider. The data flow is defined by the user. ================================================ FILE: docs/overview/ai-incident-assistant.mdx ================================================ --- title: "AI Incident Assistant" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: (experimental)
The AI incident assistant is a chat feature embedded in the incident page. It streamlines all incident context—including alerts, descriptions, and impacted topology—to the LLM, helping on-call engineers gather information faster and resolve incidents more efficiently. Users can ask for root cause analysis and even execute commands on third-party services ([read more about provider methods](/providers/provider-methods#via-ai-assistant)). ## Frequent questions: **Model used:** OpenAI, a model hosted by Keep, or other.
**Data flow:** Data is shared between the LLM provider and Keep; the LLM provider may vary depending on the contract. ================================================ FILE: docs/overview/ai-semi-automatic-correlation.mdx ================================================ --- title: "AI Semi Automatic Correlation" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: (experimental)
The Semi-Automatic Incident Engine is a powerful tool designed for teams handling a moderate volume of alerts (fewer than 100 per day). It helps you quickly identify critical issues among numerous alerts—finding the needle in the haystack. How to use: 1. Navigate to the Feed section 2. Select a few alerts 3. Click the "Create Incidents With AI" button Once activated, the system will process your alerts through its LLM (Large Language Model) and present you with potential incident candidates for review. ## Frequent questions: **Model used:** OpenAI, a model hosted by Keep, or other.
**Data flow:** Data is shared between the LLM provider and Keep; the LLM provider may vary depending on the contract. ================================================ FILE: docs/overview/ai-workflow-assistant.mdx ================================================ --- title: "AI Workflow Builder Assistant" --- Keep Cloud: ✅
Keep Enterprise On-Premises: ✅
Keep Open Source: (experimental)
AI-driven workflow builder (don't confuse it with [AI in workflows](./ai-in-workflows)) is a chat-like UI to build workflows using natural language. It works in the “human in the loop” paradigm, proposing changes and applying them only after the user's explicit consent. It simplifies workflow-building routines and helps a broader group of engineers within the organization adopt workflows. Go to "Workflows" -> "+ Create Workflow" to find the AI Assistant: Launch Blogpost: https://www.keephq.dev/blog/launch-week-ai-workflow-builder ## Frequent questions: **Model used:** OpenAI, a model hosted by Keep, or other.
**Data flow:** Data is shared between the LLM provider and Keep; the LLM provider may vary depending on the contract. ================================================ FILE: docs/overview/alertseverityandstatus.mdx ================================================ --- title: "Alerts Severity and Status" --- In Keep, alerts are treated as first-class citizens, with clearly defined severities and statuses to aid in quick and efficient response. ## Alert Severity Alert severity in Keep is classified into five categories, helping teams prioritize their response based on the urgency and impact of the alert. | Severity Level | Description | Expected Value | |----------------|-------------------------------------------------------|----------------| | CRITICAL | Requires immediate action. | "critical" | | HIGH | Needs to be addressed soon. | "high" | | WARNING | Indicates a potential problem. | "warning" | | INFO | Provides information, no immediate action required. | "info" | | LOW | Minor issues or lowest priority. | "low" | ## Alert Status The status of an alert in Keep reflects its current state in the alert lifecycle. | Status | Description | Expected Value | |--------------|-----------------------------------------------------------------------------|----------------| | FIRING | Active alert indicating an ongoing issue. | "firing" | | RESOLVED | The issue has been resolved, and the alert is no longer active. | "resolved" | | ACKNOWLEDGED | The alert has been acknowledged but not resolved. | "acknowledged" | | SUPPRESSED | Alert is suppressed due to various reasons. | "suppressed" | | PENDING | No Data or insufficient data to determine the alert state. | "pending" | ## Provider Alert Mappings Different providers might have their specific ways of defining and handling alert severity and status. Keep standardizes these variations by mapping them to the defined enums (AlertSeverity and AlertStatus).
Here's how various providers align with Keep's alert system: | Provider | Severity Mapping | Status Mapping | |---------------|--------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| | CloudWatch | N/A | ALARM -> FIRING, OK -> RESOLVED, INSUFFICIENT_DATA -> PENDING | | Prometheus | "critical" -> CRITICAL "warning" -> WARNING, "info" -> INFO, "low" -> LOW | "firing" -> FIRING, "resolved" -> RESOLVED | | Datadog | "P4" -> INFO, "P3" -> WARNING, "P2" -> HIGH, "P1" -> CRITICAL | "Triggered" -> FIRING, "Recovered" -> RESOLVED, "Muted" -> SUPPRESSED | | PagerDuty | "P1" -> CRITICAL, "P2" -> HIGH, "P3" -> WARNING, "P4" -> INFO | "triggered" -> FIRING, "acknowledged" -> ACKNOWLEDGED, "resolved" -> RESOLVED | | Pingdom | N/A | "down" -> FIRING, "up" -> RESOLVED, "paused" -> SUPPRESSED | | Dynatrace | "critical" -> CRITICAL, "warning" -> WARNING, "info" -> INFO | "open" -> FIRING, "closed" -> RESOLVED, "acknowledged" -> ACKNOWLEDGED | | Grafana | "critical" -> CRITICAL, "high" -> HIGH, "warning" -> WARNING, "info" -> INFO | "ok" -> RESOLVED, "paused" -> SUPPRESSED, "alerting" -> FIRING, "pending" -> PENDING, "no_data" -> PENDING | | New Relic | "critical" -> CRITICAL, "warning" -> WARNING, "info" -> INFO | "open" -> FIRING, "closed" -> RESOLVED, "acknowledged" -> ACKNOWLEDGED | | Sentry | "fatal" -> CRITICAL, "error" -> HIGH, "warning" -> WARNING, "info" -> INFO, "debug" -> LOW | "resolved" -> RESOLVED, "unresolved" -> FIRING, "ignored" -> SUPPRESSED | | Zabbix | "not_classified" -> LOW, "information" -> INFO, "warning" -> WARNING, "average" -> WARNING, "high" -> HIGH, "disaster" -> CRITICAL | "problem" -> FIRING, "ok" -> RESOLVED, "acknowledged" -> ACKNOWLEDGED, "suppressed" -> SUPPRESSED | ================================================ FILE: docs/overview/cel.mdx ================================================ --- title: "Common 
Expression Language (CEL)" --- It is worth reading the [CEL official docs](https://cel.dev) to learn about the language and its syntax. Keep utilizes **CEL (Common Expression Language)** as a powerful and flexible tool to evaluate and filter alerts against predefined rules. CEL enables users to write precise expressions that define conditions under which alerts are processed, displayed, or acted upon. This capability enhances alert management by allowing granular control over visibility and response to incoming alerts. ## How Keep Uses CEL ### Alert Filtering Alerts are dynamically evaluated against CEL expressions to determine which alerts meet the specified criteria. This real-time filtering ensures only the most relevant alerts are surfaced. ### Rule Evaluation CEL expressions can be embedded in rules to enforce specific actions, such as escalating an alert or triggering a workflow. ### Presets Users can save frequently used CEL expressions as presets for quick and consistent application across different alert views or teams. ## Examples ### Filter Alerts from a Specific Service ```cel service.contains("database") ``` ### Combine Multiple Conditions ```cel severity == "critical" && source == "prometheus" ``` ### Exclude Specific Alerts ```cel !(service == "auth" && severity == "low") ``` ================================================ FILE: docs/overview/comparisons.mdx ================================================ --- title: "Comparison" --- It's often easier to grasp a tool's features by comparing it to others in the same ecosystem. Here, we'll explain how Keep interacts with and compares to these tools. ## Keep vs IRM (PagerDuty, OpsGenie, etc.) Incident management tools aim to notify the right person at the right time, simplify reporting, and set up efficient war rooms. "Keep" focuses on the alert lifecycle, noise reduction, and AI-driven alert-incident correlation.
Essentially, Keep acts as an 'intelligent layer before the IRM,' managing millions of alerts before they reach your IRM tool. Keep offers high-quality integrations with PagerDuty, OpsGenie, Grafana OnCall, and more. ## Keep vs AIOps in Observability (Elastic, Splunk, etc.) Keep is different because it’s able to correlate alerts between different observability platforms. | | Keep | Alternative | | ------------------------------------- | -------------------------------------------------------------- | ---------------------------- | | Aggregates alerts from one platform | ✅ | ✅ | | Aggregates alerts from multiple platforms | ✅ | ❌ | | Correlates alerts between multiple sources | ✅ | ❌ | | Alerts enrichment | ✅ | ❌ | | Open source | ✅ | ❌ | | Workflow automation | ✅ | ❌ | ## Keep vs AIOps platforms (BigPanda, Moogsoft, etc.) Keep is an alternative to platforms like BigPanda and Moogsoft. Customers who have used both traditional platforms and Keep notice a significant improvement in alert correlation. Unlike the manual methods of other platforms, Keep uses advanced state-of-the-art AI models for easier and more effective alert correlation. | | Keep | Alternative | | ------------------------------------- | -------------------------------------------------------------- | ---------------------------- | | Aggregation of alerts | ✅ | ✅ | | Integrations | ✅ (Bi-directional) | ✅ (Webhooks) | | Alerts enrichment | ✅ | ✅ | | Open source | ✅ | ❌ | | Workflow automation | ✅ (GitHub Actions-like, infrastructure as code) | ✅ | | Managed version | ✅ | ✅ | | On-Premises | ✅ | ❌ | | Noise reduction & correlation | ✅ (AI) | ✅ (Rule-based in some cases) | ================================================ FILE: docs/overview/correlation-rules.mdx ================================================ --- title: "Manual Correlation Rules" --- The Keep Correlation Engine is a versatile tool for correlating and consolidating alerts into incidents or incident-candidates. 
This guide explains the core concepts, usage, and best practices for effectively utilizing the rule engine. ## Core Concepts - **Rule definition**: A rule in Keep is a set of conditions that, when met, creates an incident or incident-candidate. - **Alert attributes**: These are characteristics or data points of an alert, such as source, severity, or any attribute an alert might have. - **Conditions and logic**: Rules are built by defining conditions based on alert attributes, using logical operators (like AND/OR) to combine multiple conditions. ## Creating Correlation Rules Creating a rule involves defining the conditions under which an alert should be categorized or actions should be grouped. 1. **Accessing the Correlation Engine**: Navigate to the Correlation section in the Keep platform. 2. **Defining rule criteria**: - **Name the rule**: Assign a descriptive name that reflects its purpose. - **Set conditions**: Use alert attributes to create conditions. For example, a rule might specify that an alert with a severity of 'critical' and a source of 'Prometheus' should be categorized as 'High Priority'. - **Logical grouping**: Combine conditions using logical operators to form comprehensive rules. - **Manual approve**: Create Incident-candidate or full-fledged incident. ## Dynamic Incident Naming The correlation engine supports dynamic incident naming based on alert attributes. This allows you to create more meaningful and context-aware incident names that reflect the actual alert data. ### Template Variables You can use template variables in your incident name using the `{{ alert.attribute }}` syntax. These variables are replaced with actual values from the alerts. 
For example: - `{{alert.labels.host}}` - References the host from alert labels - `{{alert.service}}` - References the service name from the alert ### Behavior with Multiple Alerts When an incident contains multiple alerts: - Values from all alerts are automatically concatenated with commas - Duplicate values are automatically deduplicated - If a new alert adds a unique value, the incident name is updated to include it #### Dynamic Name Example **Template:** "Service Issue on `{{alert.labels.host}}`" **First alert** ``` { ... { "labels": { "host": "host1" } } ... } ``` **Second alert** ``` { ... { "labels": { "host": "host2" } } ... } ``` **Incident Name** Service Issue on host1,host2 ## Examples - **Metric-based alerts**: Construct a rule to pinpoint alerts associated with specific metrics, such as high CPU usage on servers. This can be achieved by grouping alerts that share a common attribute, like a 'CPU usage' tag, ensuring you quickly identify and address performance issues. - **Feature-related alerts**: Establish rules to create incidents for specific features or services. For instance, you can start an incident based on a 'service' or 'URL' tag. This approach is particularly useful for tracking and managing alerts related to distinct functionalities or components within your application. - **Team-based alert management**: Implement rules to create incidents according to team responsibilities. This might involve grouping based on the systems or services a particular team oversees. Such a strategy ensures that alerts are promptly directed to the appropriate team, enhancing response times and efficiency.
================================================ FILE: docs/overview/correlation-topology.mdx ================================================ --- title: "Topology Correlation" --- The Topology Processor is a core component of Keep that helps correlate alerts based on your infrastructure's topology, creating meaningful incidents that reflect the relationships between your services and applications. It automatically analyzes incoming alerts and their relationship to your infrastructure topology, creating incidents when multiple related services or components of an application are affected. Read more about [Service Topology](/overview/servicetopology). The Topology Processor is disabled by default. To enable it, set the environment variable `KEEP_TOPOLOGY_PROCESSOR=true`. ## How It Works 1. **Service Discovery**: The processor maintains a map of your infrastructure's topology, including: - Services and their relationships - Applications and their constituent services - Dependencies between different components 2. **Alert Processing**: Every few seconds, the processor: - Analyzes recent alerts - Maps alerts to services in your topology - Creates or updates incidents based on application-level impact 3. 
**Incident Creation**: When multiple services within an application have active alerts: - Creates a new application-level incident - Groups related alerts under this incident - Provides context about the affected application and its services ## Configuration ### Environment Variables | Variable | Description | Default | | ------------------------------------------ | --------------------------------------------------- | ------- | | `KEEP_TOPOLOGY_PROCESSOR` | Enable/disable the topology processor | `false` | | `KEEP_TOPOLOGY_PROCESSOR_INTERVAL` | Interval for processing alerts (in seconds) | `10` | | `KEEP_TOPOLOGY_PROCESSOR_LOOK_BACK_WINDOW` | Look back window for alert correlation (in minutes) | `15` | ## Incident Management ### Creation When the processor detects alerts affecting multiple services within an application: - Creates a new incident with type "topology" - Names it "Application incident: {application_name}" - Automatically confirms the incident - Links all related alerts to the incident ### Resolution Incidents can be configured to resolve automatically when: - All related alerts are resolved - Specific resolution criteria are met ## Best Practices 1. **Service Mapping** - Ensure services in alerts match your topology definitions - Maintain up-to-date topology information 2. **Application Definition** - Group related services into logical applications - Define clear service boundaries 3. **Alert Configuration** - Include service information in your alerts - Use consistent service naming across monitoring tools ## Example If you have an application "payment-service" consisting of multiple microservices: ```json { "application": "payment-service", "services": ["payment-api", "payment-processor", "payment-database"] } ``` When alerts come in for both `payment-api` and `payment-database`, the Topology Processor will: 1. Recognize these services belong to the same application 2. Create a single incident for "payment-service" 3. 
Group both alerts under this incident 4. Provide application-level context in the incident description ## Limitations - Currently supports only application-based incident creation - One active incident per application at a time - Requires service information in alerts for correlation ================================================ FILE: docs/overview/deduplication.mdx ================================================ --- title: "Deduplication" --- Alert deduplication is a crucial feature in Keep that helps reduce noise and streamline incident management by grouping similar alerts together. This process ensures that your team isn't overwhelmed by a flood of notifications for what is essentially the same issue, allowing for more efficient and focused incident response. ## Glossary - **Deduplication Rule**: A set of criteria used to determine if alerts should be grouped together. - **Partial Deduplication**: Correlates instances of alerts into single alerts, considering the case of the same alert with different statuses (e.g., firing and resolved). This is the default mode where specified fields are used to identify and group related alerts. - **Fingerprint Fields**: Specific alert attributes used to identify similar alerts. - **Full Deduplication**: A mode where alerts are considered identical if all fields match exactly (except those explicitly ignored). This helps avoid system overload by discarding duplicate alerts. - **Ignore Fields**: In full deduplication mode, these are fields that are not considered when comparing alerts. ## Deduplication Types ### Partial Deduplication Partial deduplication allows you to specify certain fields (fingerprint fields) that are used to identify similar alerts. Alerts with matching values in these specified fields are considered duplicates and are grouped together. This method is flexible and allows for fine-tuned control over how alerts are deduplicated. 
Every provider integrated with Keep comes with a pre-built partial deduplication rule tailored to that provider's specific alert format and common use cases. The default fingerprint fields are defined using the `FINGERPRINT_FIELDS` attribute in the provider code (e.g. [datadog provider](https://github.com/keephq/keep/blob/main/keep/providers/datadog_provider/datadog_provider.py#L188) or [gcp monitoring provider](https://github.com/keephq/keep/blob/main/keep/providers/gcpmonitoring_provider/gcpmonitoring_provider.py#L52)). ### Full Deduplication When full deduplication is enabled, Keep will also discard exactly identical events (excluding ignore fields). This mode considers all fields of an alert when determining duplicates, except for explicitly ignored fields. By default, events that are identical apart from their `lastReceived` time are fully deduplicated and discarded. This helps prevent system overload from repeated identical alerts. ## Real Examples of Alerts and Results ### Example 1: Partial Deduplication **Rule** - Deduplicate based on 'service' and 'error_message' fields. ```json # alert 1 { "service": "payment", "error_message": "Database connection failed", "severity": "high", "lastReceived": "2023-05-01T10:00:00Z" } # alert 2 { "service": "payment", "error_message": "Database connection failed", "severity": "critical", "lastReceived": "2023-05-01T10:05:00Z" } # alert 3 { "service": "auth", "error_message": "Invalid token", "severity": "medium", "lastReceived": "2023-05-01T10:10:00Z" } ``` **Result**: - Alerts 1 and 2 are deduplicated into a single alert, fields are updated. - Alert 3 remains separate as it has a different service and error message. 
### Example 2: Full Deduplication **Rule**: Full deduplication with 'lastReceived' as an ignore field **Incoming Alerts**: ```json # alert 1 { service: "api", error: "Rate limit exceeded", user_id: "12345", lastReceived: "2023-05-02T14:00:00Z" } # alert 2 (discarded as it's identical) { service: "api", error: "Rate limit exceeded", user_id: "12345", lastReceived: "2023-05-02T14:01:00Z" } # alert 3 { service: "api", error: "Rate limit exceeded", user_id: "67890", lastReceived: "2023-05-02T14:02:00Z" } ``` **Result**: - Alerts 1 and 2 are deduplicated as they are identical except for the ignored `lastReceived` field. - Alert 3 remains separate due to the different user_id. ## How It Works Keep's deduplication process follows these steps: 1. **Alert Ingestion**: Every alert received by Keep is first ingested into the system. 2. **Enrichment**: After ingestion, each alert undergoes an enrichment process. This step adds additional context or information to the alert, enhancing its value and usefulness. 3. **Deduplication**: Following enrichment, Keep's alert deduplicator comes into play. It applies the defined deduplication rules to the enriched alerts. ================================================ FILE: docs/overview/enrichment/extraction.mdx ================================================ --- title: "Extraction" --- Keep's Alert Extraction enrichment feature enables dynamic extraction of data from incoming alerts using regular expressions. This powerful tool allows users to define extraction rules that identify and extract data based on patterns, enriching alerts with additional structured data derived directly from alert content. ## Introduction Handling a variety of alert formats and extracting relevant information can be challenging. Keep's Alert Extraction feature simplifies this process by allowing users to define regex-based rules that automatically extract key pieces of information from alerts. 
This capability is crucial for standardizing alert data and enhancing alert context, which facilitates more effective monitoring and response strategies. ## How It Works 1. **Rule Definition**: Users create extraction rules specifying the regex patterns to apply to certain alert attributes. 2. **Attribute Specification**: Each rule defines which attribute of the alert should be examined by the regex. 3. **Data Extraction**: When an alert is received, the system applies the regex to the specified attribute. If the pattern matches, named groups within the regex define new attributes to be extracted and added to the alert. 4. **First Match Enforcement**: The extraction process is designed to stop after the first successful match. Once a rule successfully applies and enriches the alert, no further rules are processed. This ensures efficiency and prevents overlapping or redundant data extraction. 5. **Alert Enrichment**: Extracted values are added to the alert, enhancing its data with additional attributes for improved analysis. ## Practical Example Suppose you receive alerts with a message attribute formatted as "Error 404: Not Found - [UserID: 12345]". You can define an extraction rule with a regex such as `Error (?P<error_code>\d+): (?P<error_message>.+) - \[UserID: (?P<user_id>\d+)\]` to extract `error_code`, `error_message`, and `user_id` as separate attributes in the alert. ## Core Concepts - **Regex (Regular Expression)**: A powerful pattern-matching syntax used to identify specific patterns within text. In the context of extraction rules, regex is used to define how data should be extracted from alert attributes. It is crucial that regex patterns adhere to [Python's regex syntax](https://docs.python.org/3.11/library/re.html#match-objects), especially concerning group matching using named groups. - **Attribute**: The part of the alert data (e.g., message, description) that the regex is applied to. 
- **Named Groups**: Part of the regex pattern that specifies placeholders for extracting specific data points into new alert attributes. ## Creating an Extraction Rule To create an alert extraction rule: 1. **Select the Attribute**: Choose which attribute of the alert should be examined by the regex. 2. **Define the Regex**: Write a regex pattern with named groups that specify what information to extract. Ensure the regex is valid according to Python’s regex standards, particularly for group matching. 3. **Configure Conditions**: Optionally, specify conditions under which this rule should apply, using CEL (Common Expression Language) for complex logic. ## Best Practices - **Test Regex Patterns**: Before deploying a new extraction rule, thoroughly test the regex pattern to ensure it correctly matches and extracts data according to Python's regex standards. - **Monitor Extraction Performance**: Keep track of how extraction rules are performing and whether they are enriching alerts as expected. Adjust patterns as necessary based on incoming alert data. - **Use Specific Conditions**: When applicable, define conditions to limit when extraction rules apply, reducing unnecessary processing and focusing on relevant alerts. ================================================ FILE: docs/overview/enrichment/mapping.mdx ================================================ --- title: "Mapping" --- Keep's Alert Mapping enrichment feature provides a powerful mechanism for dynamically enhancing alert data by leveraging external data sources, such as CSV files and topology data. This feature allows for the matching of incoming alerts to specific records in a CSV file or topology data based on predefined attributes (matchers) and enriching those alerts with additional information from the matched records. ## Introduction In complex monitoring environments, the need to enrich alert data with additional context is critical for effective alert analysis and response. 
Keep's Alert Mapping and Enrichment enables users to define rules that match alerts to rows in a CSV file or topology data, appending or modifying alert attributes with the values from matching rows. This process adds significant value to each alert, providing deeper insights and enabling more precise and informed decision-making. ## How It Works ## Mapping with CSV Files 1. **Rule Definition**: Users define mapping rules that specify which alert attributes (matchers) should be used for matching alerts to rows in a CSV file. 2. **CSV File Specification**: A CSV file is associated with each mapping rule. This file contains additional data that should be added to alerts matching the rule. 3. **Alert Matching**: When an alert is received, the system checks if it matches the conditions of any mapping rule based on the specified matchers. 4. **Data Enrichment**: If a match is found, the alert is enriched with additional data from the corresponding row in the CSV file. The CSV file will look like: | region |responsible_team | severity_override | |--------------|-----------------|---------------------------------| | us-east-1 | team-alpha | high | | us-west-2 | team-beta | medium | | eu-central-1 | team-gamma | low | ## Mapping with Topology Data 1. **Rule Definition**: Users define mapping rules that specify which alert attributes (matchers) should be used for matching alerts to topology data. 2. **Topology Data Specification**: Topology data is associated with each mapping rule. This data contains additional information about the components and their relationships in your environment. 3. **Alert Matching**: When an alert is received, the system checks if it matches the conditions of any mapping rule based on the specified matchers. 4. **Data Enrichment**: If a match is found, the alert is enriched with additional data from the corresponding topology data. 
## Practical Example Imagine you have a CSV file with columns representing different aspects of your infrastructure, such as `region`, `responsible_team`, and `severity_override`. By creating a mapping rule that matches alerts based on `service` and `region`, you can automatically enrich alerts with the responsible team and adjust severity based on the matched row in the CSV file. Similarly, you can use topology data to enrich alerts. For example, if an alert is related to a specific service, you can use topology data to find related components and their statuses, providing a more comprehensive view of the issue. ## Core Concepts - **Matchers**: Attributes within the alert used to identify matching rows within the CSV file or topology data. Common matchers include identifiers like `service` or `region`. - **CSV File**: A structured file containing rows of data. Each column represents a potential attribute that can be added to an alert. - **Topology Data**: Information about the components and their relationships in your environment. This data can be used to enrich alerts with additional context. - **Enrichment**: The process of adding new attributes or modifying existing ones in an alert based on the data from a matching CSV row or topology data. ## Creating a Mapping Rule To create an alert mapping and enrichment rule: 1. **Define the Matchers**: Specify which alert attributes will be used to match rows in the CSV file or topology data. 2. **Specify the Data Source**: Provide the CSV file or specify the topology data to be used for enrichment. 3. **Configure the Rule**: Set additional parameters, such as whether the rule should override existing alert attributes. ## Best Practices - **Keep CSV Files and Topology Data Updated**: Regularly update the CSV files and topology data to reflect the current state of your infrastructure and operational data. - **Use Specific Matchers**: Define matchers that are unique and relevant to ensure accurate matching. 
- **Monitor Rule Performance**: Review the application of mapping rules to ensure they are working as expected and adjust them as necessary. ================================================ FILE: docs/overview/faq.mdx ================================================ --- title: "FAQ" sidebarTitle: FAQ --- ## FAQ ### 1. "Failed to copy alert/fingerprint. Please check your browser permissions" Modern browsers block clipboard access from insecure ("http") origins for security reasons. To confirm the root cause of the issue, check your website settings in the browser: If you see the "Blocked to protect your privacy" message or similar text under clipboard settings, this confirms the error is due to an insecure origin: To resolve this: - For production: Configure HTTPS for your Keep deployment - For local development: Use "localhost" which browsers treat as a secure origin - If using a custom domain locally: Enable HTTPS or switch to "localhost" If you're accessing Keep from a secure origin and still experiencing this issue, please [reach out](https://slack.keephq.dev) to us. ================================================ FILE: docs/overview/fingerprints.mdx ================================================ --- title: "Fingerprints" sidebarTitle: "Fingerprints" description: "Fingerprints are unique identifiers associated with alert instances in Keep. Every provider declares the fields fingerprints are calculated upon" --- Fingerprints defaults to Alert Name if the provider does not declare fingerprint fields. Fingerprints serve several important purposes in the context of alerting within Keep: ### De-Duplication Alert fingerprints are used to prevent the duplication of enrichments/workflows triggering for the same underlying alert. When Keep receives an alert, it calculates a fingerprint based on the configured fields declared within the Provider. If two alerts have the same fingerprint, Keep considers them to be duplicates and will present one of them. 
This helps reduce alert noise and prevent unnecessary workflow triggers/enrichments. ### Grouping Keep uses alert fingerprints to group related alerts together. Alerts with the same fingerprint are considered to be part of the same group, indicating that they are triggered by the same underlying condition or problem. Grouping alerts makes it easier for operators to understand relations between different alert-sources, the root cause of an issue and take appropriate action faster. ### Silencing Alert fingerprints are used in third-party tools to manage silences/mutes. Silencing allows operators to temporarily suppress alerts with specific fingerprints, providing a way to acknowledge and handle known issues without generating additional notifications/triggers. ### Visualization Alert fingerprints can also be used for visualization and analysis purposes. They help in tracking the history and status of alerts over time and provide a means to correlate alerts with specific conditions or changes in the monitored system. The process of generating a fingerprint involves hashing the fields configured in the provider and their values associated with an alert instance. This results in a fixed-length, hexadecimal string that uniquely identifies that alert. When Keep receives/gets an alert, it calculates the fingerprint for each alert to determine if it should trigger a workflow, be grouped, or is silenced. In summary, Keep alert fingerprints are essential for managing and organizing alerts in every third-party system. They help prevent duplicates, group related alerts, enable silencing, and facilitate analysis and visualization of alert data, ultimately aiding in the effective operation and maintenance of monitored systems. 
### Examples This is the base provider class implementation for fingerprint fields: ```python base_provider.py class BaseProvider(metaclass=abc.ABCMeta): OAUTH2_URL = None PROVIDER_SCOPES: list[ProviderScope] = [] PROVIDER_METHODS: list[ProviderMethod] = [] FINGERPRINT_FIELDS: list[str] = [] ``` This is Datadog's provider implementation for fingerprint fields, where we calculate fingerprint based on the event groups and monitor id, as an example: ```python datadog_provider.py class DatadogProvider(BaseProvider): """ Datadog provider class. """ PROVIDER_SCOPES = [ ... ] PROVIDER_METHODS = [ ... ] FINGERPRINT_FIELDS = ["groups", "monitor_id"] ``` Keep allows for customization of anything related to fingerprints. If you want to change the way a specific provider calculates the fingerprint of an alert, you can simply configure the fields you require. ================================================ FILE: docs/overview/glossary.mdx ================================================ --- title: "Glossary" --- ## Alert An alert is an event that is triggered when something bad happens or is about to happen. The term "alert" can sometimes be interchanged with "alarm" (e.g. in CloudWatch) or "monitor" (Datadog). ## Incident An incident is a group of alerts that are related to each other. ## Provider A provider can be a module that pulls alerts into Keep or pushes data out of Keep by interacting with external systems. ### Provider as a data source Within the context of a Workflow, a Provider can: - Query data - query Datadog's API or run a SQL query against a database. - Push data - send a Slack message or create a PagerDuty incident. ### Provider as an alert source When you connect a Provider, Keep begins to read and process alerts from that Provider. For example, after connecting your Prometheus instance, you'll start seeing your Prometheus alerts in Keep. A Provider can either push alerts into Keep, or Keep can pull alerts from the Provider. 
#### Push alerts to Keep (Manual) You can configure your alert source to push alerts into Keep. For example, consider Prometheus. If you want to push alerts from Prometheus to Keep, you'll need to configure Prometheus Alertmanager to send the alerts to 'https://api.keephq.dev/alerts/event/prometheus' using API key authentication. Each Provider implements a push mechanism, which is documented under the specific Provider page. #### Push alerts to Keep (Automatic) In compatible tools, Keep can automatically integrate with the alerting policy of the source tool and add itself as an alert destination. You can learn more about Webhook Integration [here](/providers/overview). Please note that this will slightly modify your monitors/notification policy. ### Pull alerts by Keep Keep also integrates with the alert APIs of various tools and can automatically pull alerts. While pulling is easier to set up (requiring only credentials), pushing is preferable when automation is involved. ## Workflow Workflows consist of a list of [Steps](/workflows/overview#steps) and [Actions](/workflows/overview#actions). A workflow can be triggered in the following ways: - When an Alert is triggered. - In a predefined interval. - Manually. Workflows are commonly used to: 1. Enrich your alerts with more context. 2. Automate the response to alerts. 3. Create multi-step alerts. ## API first Keep is an API-first platform, meaning that anything you can do via the UI can also be accomplished through the [API](https://api.keephq.dev/redoc). This gives you the flexibility to integrate Keep with your existing stack and to automate alert remediation and enrichment processes. 
================================================ FILE: docs/overview/howdoeskeepgetmyalerts.mdx ================================================ --- title: "Push vs Pull alerts" --- There are primarily two ways to get alerts into Keep: We strongly recommend using the push method for alerting, as pulling does not include a lot of the features, like workflow automation. It is mainly used for a quick way to get alerts into Keep and start exploring the value. ### Push When you connect a [Provider](/providers), Keep automatically instruments the tools to send alerts to Keep via webhook. As an example, when you connect Grafana, Keep will automatically create a new Webhook contact point in Grafana, and a new Notification Policy to send all alerts to Keep. You can configure which providers you want to push from by checking the `Install Webhook` checkbox in the provider settings. ### Pull When you connect a [Provider](/providers), Keep will start pulling alerts from the tool automatically. Pulling interval is defined by the `KEEP_PULL_INTERVAL` environment variable and defaults to 7 days (in minutes) and can be completely turned off by using the `KEEP_PULL_DATA_ENABLED` environment variable. You can also configure which providers you want to pull from by checking the `Pulling Enabled` checkbox in the provider settings. ================================================ FILE: docs/overview/introduction.mdx ================================================ --- title: "Introduction" description: "Keep is an open-source alert management and AIOps platform that is a swiss-knife for alerting, automation, and noise reduction." --- Keep has a new playground! Visit the [Playground](https://playground.keephq.dev) to explore its powerful features, experiment with configurations, and test AIOps techniques in a sandbox environment. Once you're ready to start using Keep in your environment, head over to the [Platform](https://platform.keephq.dev) to set up your tenant and get started. 
Don't forget to join our [Slack community](https://slack.keephq.dev) for help and to share your feedback. ## What's AIOps? In simple words, AI for IT Operations (aka AIOps) is about automating repetitive tasks, reducing noise from monitoring tools, and helping teams overcome alert fatigue by turning overwhelming data into actionable insights. With AIOps, teams can eliminate noise, prioritize critical issues, and focus on solving real problems rather than constantly firefighting alerts. ## Why do we build Keep? Working with current tools such as BigPanda, Splunk ITSI, or ServiceNow ITOM, we identified a gap: - **No Open Source Solution:** We have Grafana for visualization and Prometheus for metrics, but nothing for AIOps. Keep fills this gap as the first open-source solution for AIOps. - **Not DevOps/SRE Friendly:** Current tools are enterprise-focused but not in a good way. If you're an SRE team lead or head of IT operations in a company with ~100 employees, the existing tools won't work for you. They're too expensive, and their UX requires a dedicated team just for setup and maintenance. Keep is enterprise-ready (scaling, SSO, etc.) but also designed for small teams that want to adopt AIOps practices. - **A "Post LLM Era" AIOps:** Existing tools were built in a different technical era. Keep is designed to leverage the advancements of the large language model (LLM) era, integrating AI more seamlessly into IT operations. ## Our Philosophy - **Easy to start** – Whether locally or on Kubernetes, we provide one-click solutions like `helm install` and `docker-compose` so you can quickly spin up Keep and start exploring its capabilities. - **Easy to extend** – Keep is designed with extensibility in mind, making it straightforward to add new integrations or functionality to meet your specific needs. - **Easy to deploy** – Every aspect of Keep can be provisioned as code, enabling seamless automation of deployments and integration into your CI/CD pipelines. 
- **Easy to collaborate** – As an open-source project, we truly believe in the power of community and collaboration. We actively listen to user feedback and strive to continuously improve Keep based on the needs and insights of our users. ## Our Vision Keep is built so every team can benefit from AIOps. Whether you're a small team looking for a Kubernetes-local single pane of glass for your Prometheus alerts, or an enterprise with dozens of tools generating alerts and needing to sync with your ServiceNow tickets, Keep is for you. Our vision is to democratize AIOps, making it accessible and practical for teams of all sizes. ## What you should read next - [Key Concepts](/overview/glossary): Understand the foundational ideas behind Keep. - [Use Cases](/overview/usecases): Learn how Keep can solve specific IT operations challenges. - [Playground](/overview/playground): Explore Keep's playground. ================================================ FILE: docs/overview/maintenance-windows.mdx ================================================ --- title: "Maintenance Windows" --- Keep's Maintenance Windows feature provides a critical mechanism for managing alert noise during scheduled maintenance periods or other planned events. By defining Maintenance Window rules, users can suppress alerts that are irrelevant during these times, ensuring that only actionable alerts reach the operations team. ## Introduction In dynamic IT environments, it's common to have periods where certain alerts are expected and should not trigger incident responses. Keep's Maintenance Windows feature allows users to define specific rules that temporarily suppress alerts based on various conditions, such as time windows or alert attributes. This helps prevent unnecessary alert fatigue and ensures that teams can focus on critical issues. ## How It Works 1. **Maintenance Window Rule Definition**: Users define Maintenance Window rules specifying the conditions under which alerts should be suppressed. 2. 
**Condition Specification**: A CEL (Common Expression Language) query is associated with each Maintenance Window rule to define the conditions for suppression. 3. **Time Window Configuration**: Maintenance Window rules can be set for specific start and end times, or based on a relative duration. 4. **Alert Suppression**: During the active period of a Maintenance Window rule, any alerts matching the defined conditions are either suppressed and **not shown in alerts feed** or shown in the feed in suppressed status (**this is configurable**). ## Practical Example Suppose your team schedules a database upgrade that could trigger numerous non-critical alerts. You can create a Maintenance Window rule that suppresses alerts from the database service during the upgrade window. This ensures that your operations team isn't overwhelmed by non-actionable alerts, allowing them to focus on more critical issues. ## Core Concepts - **Maintenance Window Rules**: Configurations that define when and which alerts should be suppressed based on time windows and conditions. - **CEL Query**: A query language used to specify the conditions under which alerts should be suppressed. For example, a CEL query might suppress alerts where the source is a specific service during a maintenance window. - **Time Window**: The specific start and end times or relative duration during which the Maintenance Window rule is active. - **Alert Suppression**: The process of ignoring alerts that match the Maintenance Window rule's conditions during the specified time window. ## Status-Based Filtering in Maintenance Windows In Keep, certain alert statuses are automatically ignored by Maintenance Window rules. Specifically, alerts with the statuses RESOLVED and ACKNOWLEDGED are not suppressed by Maintenance Window rules. This is intentional to ensure that resolving alerts can still be processed and appropriately close or update active incidents. ### Why Are Some Statuses Ignored? 
• RESOLVED Alerts: These alerts indicate that an issue has been resolved. By allowing these alerts to bypass Maintenance Window rules, Keep ensures that any active incidents related to the alert can be properly closed, maintaining the integrity of the alert lifecycle. • ACKNOWLEDGED Alerts: These alerts have been acknowledged by an operator, signaling that they are being addressed. Ignoring these alerts in Maintenance Windows ensures that operators can track the progress of incidents and take necessary actions without interference. By excluding these statuses from Maintenance Window suppression, Keep allows for the continuous and accurate management of alerts, even during Maintenance Window periods, ensuring that resolution processes are not disrupted. ## Creating a Maintenance Window Rule To create a Maintenance Window rule: 1. **Define the Maintenance Window Name and Description**: Provide a name and optional description for the Maintenance Window rule to easily identify its purpose. 2. **Specify the CEL Query**: Use CEL to define the conditions under which alerts should be suppressed (e.g., `source == "database"`). 3. **Set the Time Window**: Choose a specific start and end time, or define a relative duration for the Maintenance Window. 4. **Enable the Rule**: Decide whether the rule should be active immediately or scheduled for future use. ## Best Practices - **Plan Maintenance Windows in Advance**: Schedule Maintenance Window periods in advance for known maintenance windows to prevent unnecessary alerts. - **Use Specific Conditions**: Define precise CEL queries to ensure only the intended alerts are suppressed. - **Review and Update Maintenance Windows**: Regularly review active Maintenance Window rules to ensure they are still relevant and adjust them as necessary. ## Strategies In order to handle the alerts during Maintenance Windows, Keep provides some Strategies to handle how these alerts are treated: ### 1. 
Default The default behaviour of Maintenance Windows is to **suppress** alerts that match the defined conditions. ### 2. Recover status This strategy relies on the following premise: An alert received inside the Maintenance Window must be inhibited and once the Maintenance Window is over, the alert must recover its previous flow. The following actions will therefore be taken with a new alert: - When an alert is received, it will be checked against the Maintenance Window rules. - If the alert matches any Maintenance Window rule, its status will be set to **Maintenance**. - Workflows and Incidents handling are skipped. Every WATCHER_LAPSED_TIME seconds, the watcher will check, for every alert with a Maintenance status, whether there is still an active Maintenance Window that applies to it. If there is none (that is, the window has expired, see 2.1), the following actions will be taken: - The alert's status and previous status will be swapped. - Workflows, Incidents handling, Pusher and Presets notifications will be launched in the same way as a new alert. #### 2.1 What is an expired Maintenance Window? For a maintenance window to be considered expired, the following conditions must be met: - The **End Time** must be earlier than the current time. - The **Enabled** flag must be set to **False**. #### 2.2 What are the specific conditions to use the Recover Status Strategy? - Set **MAINTENANCE_WINDOW_STRATEGY** environment variable to **recover_previous_status**. - "Alerts will show in suppressed status" option must be set to **True** in the Maintenance Window rule configuration. - **Enabled** flag must be set to **True** in the Maintenance Window rule configuration. ================================================ FILE: docs/overview/playground.mdx ================================================ --- title: "Playground" description: "Dive into Keep's [sandbox environment](https://playground.keephq.dev) to experience the full range of its AIOps capabilities."
--- Use Keep's [playground](https://playground.keephq.dev) to explore, experiment, and understand how Keep streamlines operations and reduces noise, enabling you to gain clarity and control over your IT ecosystem. What to look at: - [Alerts](#alerts) - [Incidents](#incidents) - [Providers](#providers) - [Workflows](#workflows) - [AIOps Techniques](#aiops-techniques) ## Alerts Get a single pane of glass view for all your alerts with customizable presets. Use CEL (Common Expression Language) syntax for precise filtering, configure the alerts table layout to match your workflow, and explore facets for quick insights into alert patterns and metrics. ## Incidents Examine incidents in detail, including their associated alerts and timelines. Test correlation logic and mapping configurations that group related alerts into incidents, and validate your suppression or resolution strategies. ## Providers Integrate with external data sources or alert providers like Prometheus, Datadog, or GCP Monitoring. Configure and test mappings to ensure proper ingestion and normalization of data from various sources into Keep's unified schema. ## Workflows Build and test automated workflows to manage alerts and incidents with precision. Experiment with both an intuitive UI builder and advanced scripting capabilities to trigger actions, notifications, or external integrations based on dynamic conditions. ## AIOps Techniques Test and refine deduplication, enrichment, mapping, and extraction rules to optimize alert handling. Experiment with these techniques to transform raw alerts into actionable data and reduce noise effectively. ================================================ FILE: docs/overview/servicetopology.mdx ================================================ --- title: "Service Topology" --- The Service Topology feature in Keep provides a visual representation of your service dependencies, allowing you to quickly understand the relationships between various components in your system. 
By mapping services and their interactions, you can gain insights into how issues in one service may impact others, enabling faster root-cause analysis and more effective incident resolution. ## Key Concepts - **Nodes**: Represent individual services, applications, or infrastructure components. - **Edges**: Show the dependencies and interactions between nodes. ## Supported Providers } > } > } > } > } > ## Features ### Visualizing Dependencies The service topology graph helps you: - Identify critical dependencies between services. - Understand how failures in one service propagate through the system. - Highlight single points of failure or bottlenecks. ### Real-Time Health Indicators Nodes and edges are enriched with health indicators derived from alerts and metrics. This allows you to: - Quickly spot issues in your architecture. - Prioritize incident resolution based on affected dependencies. ### Filter and Focus Use filters to focus on specific parts of the topology, such as: - A particular environment (e.g., production, staging). - A service group (e.g., all database-related services). - Alerts of a specific severity or type. ### Incident Integration Service topology integrates seamlessly with Keep’s incident management features. When an incident is triggered, you can: - View the affected nodes and their dependencies directly on the topology graph. - Analyze how alerts related to the incident are propagating through the system. - Use this information to guide remediation efforts. ### Manually adding Topology This feature allows you to create and manipulate your services and the dependencies between them. - Click on `+ Add Node` to add a new service to your map. - The `Service` and `Display Name` fields are mandatory; the rest of the fields are optional. (Note: `Tags` accepts comma-separated values.) - Click `Save` to add the new service to your map. - You can add multiple such services and add connections/dependencies between them.
- You can select one or more manually created services (hold Ctrl to select multiple services), and delete them all at once using the `Delete Services` option. - You can click any service and use the `Update Service` button to update a service. - To add a dependency, drag from any service's right handle (source) to another service's left handle (target). - You can remove a dependency by dragging it away from its target handle and releasing it. - To add a protocol to your dependency: click the dependency > Click `Edit Dependency` > Fill in the protocol in the popup > Click `OK`. - You can only manipulate the services that are created manually. - Creating or updating a dependency is only possible between two manually created services. ### Importing and Exporting topology You can import/export topology data (services + applications + dependencies) to/from Keep using this feature. - Click the menu item to get the Import/Export option. - Data is imported and exported in YAML format. - Below is a sample YAML: ```yaml applications: - description: 'A sample application for monitoring and management' id: 398e7b9a-bc0f-487a-b6d7-049a16e500e4 name: monitoring-app repository: 'https://github.com/sample-org/monitoring-app' services: - 556041 - 556061 dependencies: - depends_on_service_id: 556051 id: 6219 protocol: HTTP service_id: 556041 - depends_on_service_id: 556081 id: 6220 protocol: HTTPS service_id: 556051 - depends_on_service_id: 556041 id: 6221 protocol: GRPC service_id: 556061 - depends_on_service_id: 556071 id: 6222 protocol: TCP service_id: 556061 - depends_on_service_id: 556051 id: 6223 protocol: UDP service_id: 556071 services: - id: 556041 display_name: Auth Service service: PAH3VXB category: Backend description: 'Handles user authentication and session management' email: 'auth-team@example.com' environment: production ip_address: '192.168.1.10' is_manual: false mac_address: '00:1A:2B:3C:4D:5E' manufacturer: 'Dell' namespace: 'auth' repository:
'https://github.com/sample-org/auth-service' slack: '#auth-alerts' source_provider_id: ebe062c4814f483cb2c5d556fbb9395c tags: ['authentication', 'security'] team: 'Auth Team' - id: 556051 display_name: Log Aggregator service: PFRKUOO category: Monitoring description: 'Main service responsible for collecting and aggregating logs' email: 'logs-team@example.com' environment: staging ip_address: '192.168.1.11' is_manual: false mac_address: '00:1A:2B:3C:4D:5F' manufacturer: 'HP' namespace: 'logs' repository: 'https://github.com/sample-org/log-aggregator' slack: '#logs-alerts' source_provider_id: ebe062c4814f483cb2c5d556fbb9395c tags: ['monitoring', 'logging'] team: 'Logs Team' - id: 556061 display_name: Core API service: PWKXGRK category: API description: 'Main business logic service for processing user data' email: 'backend-team@example.com' environment: production ip_address: '192.168.1.12' is_manual: false mac_address: '00:1A:2B:3C:4D:60' manufacturer: 'Cisco' namespace: 'api' repository: 'https://github.com/sample-org/core-api' slack: '#backend-alerts' source_provider_id: ebe062c4814f483cb2c5d556fbb9395c tags: ['api', 'backend'] team: 'Backend Team' - id: 556071 display_name: Database Service service: PFEIHAU category: Storage description: 'Handles database operations and caching' email: 'db-team@example.com' environment: production ip_address: '192.168.1.13' is_manual: false mac_address: '00:1A:2B:3C:4D:61' manufacturer: 'IBM' namespace: 'db' repository: 'https://github.com/sample-org/database-service' slack: '#db-alerts' source_provider_id: ebe062c4814f483cb2c5d556fbb9395c tags: ['database', 'storage'] team: 'Database Team' - id: 556081 display_name: Service Mesh service: PC8HHE7 category: Infrastructure description: 'Handles networking and service discovery' email: 'infra-team@example.com' environment: production ip_address: '192.168.1.14' is_manual: false mac_address: '00:1A:2B:3C:4D:62' manufacturer: 'Juniper' namespace: 'mesh' repository: 
'https://github.com/sample-org/service-mesh' slack: '#infra-alerts' source_provider_id: ebe062c4814f483cb2c5d556fbb9395c tags: ['networking', 'mesh'] team: 'Infra Team' ``` ================================================ FILE: docs/overview/support.mdx ================================================ --- title: "Support" sidebarTitle: Support --- ## Overview You can use the following methods to ask for support/help with anything related with Keep: You can use the [Keep Slack community](https://slack.keephq.dev) to get support. You can use support@keephq.dev to send inquiries. ================================================ FILE: docs/overview/usecases.mdx ================================================ --- title: "Use Cases" --- Keep is a versatile platform that adapts to the needs of various roles and scenarios in IT operations. Whether you're a DevOps engineer managing infrastructure, an SRE ensuring uptime, or a NOC team lead handling alert noise, Keep provides tailored solutions. The platform also addresses a broad range of use cases, from centralizing alert management to automating responses and ensuring SLA compliance. Explore how Keep can simplify your workflows and improve operational efficiency, no matter your role or challenge. --- ## By Role ### For DevOps Keep enables DevOps engineers to centralize alert management, automate responses, and fine-tune alert configurations. With integrations to tools like Prometheus and Grafana, you can streamline monitoring workflows, reduce noise, and focus on delivering reliable infrastructure. ### For SREs Site Reliability Engineers can benefit from Keep’s ability to correlate alerts across systems, enrich them with contextual data, and automate remediation steps. Use Keep to maintain service uptime and reduce the burden of on-call duties by ensuring actionable alerts. ### For Software Engineers Software engineers can use Keep to understand the context of alerts that impact their services. 
By integrating alert enrichment and automated workflows, they can quickly identify and resolve issues without sifting through raw logs or multiple monitoring tools. ### For Engineering Managers Keep helps engineering managers track and manage the overall health of their systems. Gain insights into alert trends, manage noise reduction strategies, and ensure your teams focus on critical issues with Keep’s centralized dashboard and analytics. ### For NOC Team Leads Keep empowers NOC teams with advanced alert visualization, centralized management, and actionable insights. Use features like throttling, muting, and faceted search to streamline incident handling and minimize alert fatigue. ### For Heads of IT Operations For heads of IT operations, Keep provides an enterprise-ready yet flexible solution for managing complex environments. Gain visibility into system health, ensure compliance with SLAs, and scale your operations with Keep’s automation and alert correlation capabilities. --- ## By Use Case ### Central Alert Management No more navigating between multiple Prometheus instances and dealing with per-region, per-account CloudWatch settings. By linking your alert-triggering tools to Keep, you gain a centralized dashboard for managing all your alerts. Review, throttle, mute, and fine-tune alerts from a single console. ### Alerts Enrichment Keep allows you to enrich alerts with additional context from observability tools, databases, and ticketing systems. Need enterprise-specific alert triggers or want to include extra details about customer impact? Keep makes it easy to augment alerts for better decision-making. ### Automate Alert Response Automate responses to common alerts, reducing the time spent on repetitive tasks. For example, confirm a 502 error on an endpoint with an additional query or check if an issue affects a low-priority customer before escalating it to your team. 
### Multi-Environment Monitoring Centralize alerts across multiple environments, such as staging, production, and testing. Keep helps you manage environment-specific rules while providing a unified view of your system health. ### Noise Reduction Use deduplication, throttling, and muting to significantly reduce noise from excessive or redundant alerts. Keep ensures your teams are only notified of critical issues. ### SLA Compliance Track alert resolution times and ensure compliance with SLAs. Keep’s automation and reporting features enable you to monitor and meet contractual obligations seamlessly. ### Incident Correlation Correlate related alerts to identify the root cause of incidents quickly. Use Keep’s workflows and mapping rules to group alerts and provide actionable insights for resolution. ### Ticketing Integration Sync alerts with ticketing tools like Jira and ServiceNow. Automate ticket creation, track updates, and ensure seamless workflows between operations and development teams. --- ================================================ FILE: docs/overview/workflow-automation.mdx ================================================ --- title: "Workflows" --- Workflow automation in Keep is designed to transform how you manage alerts and incidents. It allows you to automate responses, integrate seamlessly with your existing tools, and build complex workflows tailored to your needs. With workflow automation, you can reduce manual effort, improve response times, and ensure consistent handling of recurring scenarios. This section provides a high-level overview of workflows in Keep. To dive deeper into creating and managing workflows, refer to the dedicated [Workflow Documentation](#workflow-documentation) and explore our [GitHub repository](https://github.com/keephq/keep/tree/main/examples/workflows) for ready-to-use examples. ## Why Workflow Automation is Core Every alert, incident, or integration can be part of a workflow.
Whether it’s auto-creating tickets, sending Slack notifications, or enriching alerts with external data, workflows are central to making Keep a powerful and flexible tool for your IT operations. ## Explore Further ### 1. Detailed Workflow Documentation Explore [Workflow Documentation](#workflow-documentation) to learn: - How to define triggers, actions, and steps. - Best practices for designing efficient workflows. - Advanced use cases, such as conditional branching and multi-step automation. ### 2. Workflow Examples on GitHub Check out our [GitHub repository](https://github.com/keephq/keep/tree/main/examples/workflows) for: - Pre-built workflows ready to use in your environment. - Examples for common use cases, such as auto-remediation, alert enrichment, and multi-channel notifications. - Contributions from the community, showcasing innovative ways to use Keep workflows. --- Workflow automation is at the heart of Keep’s mission to make AIOps accessible and actionable. Use this as a starting point, and explore the rich resources available to master workflows and revolutionize your alert management. ================================================ FILE: docs/providers/adding-a-new-provider.mdx ================================================ --- title: "Adding a new Provider" sidebarTitle: "Adding a New Provider" --- This guide explains how to create a new provider for Keep. Providers are integrations that allow Keep to interact with external services for alerting, querying data, managing incidents, or building topology maps. 
## Table of contents - [Provider structure](#provider-structure) - [Step-by-step implementation](#step-by-step-implementation) - [Provider attributes](#provider-attributes) - [Abstract methods](#abstract-methods) - [Provider types and capabilities](#provider-types-and-capabilities) - [Authentication configuration](#authentication-configuration) - [Testing your provider](#testing-your-provider) - [Best practices](#best-practices) - [Common patterns](#common-patterns) - [Complete provider example](#complete-provider-example) - [Checklist](#checklist) ## Provider structure Each provider in Keep follows a specific structure: ``` keep/providers/ ├── yourservice_provider/ │ ├── __init__.py │ └── yourservice_provider.py ``` **Important Notes:** - Keep's ProvidersFactory automatically discovers providers based on the directory naming convention (`*_provider`). - You don't need to register them explicitly - just follow the naming pattern. - The provider type is automatically extracted from the class name (for example, `ServicenowProvider` → `servicenow`). Every capital letter in the class name starts a new dot-separated segment, so `ServiceNowProvider` would become `service.now`. ## Step-by-step implementation ### 1. Create provider directory Create a new directory under `keep/providers/` with the pattern `{service}_provider`: ```bash mkdir keep/providers/yourservice_provider ``` ### 2. Create the provider module Create `yourservice_provider.py` with the following structure: ```python """ YourService Provider is a class that allows integration with YourService.
""" import dataclasses import json import os from typing import Optional, List, Dict, Any import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod @pydantic.dataclasses.dataclass class YourserviceProviderAuthConfig: """YourService authentication configuration.""" api_endpoint: str = dataclasses.field( metadata={ "required": True, "description": "YourService API endpoint URL", "validation": "https_url", # Optional: validates HTTPS URLs } ) api_key: str = dataclasses.field( metadata={ "required": True, "description": "API key for YourService", "sensitive": True, # Marks field as sensitive in UI } ) region: str = dataclasses.field( default="us-east-1", metadata={ "required": False, "description": "YourService region", "type": "select", "options": ["us-east-1", "eu-west-1", "ap-south-1"], } ) class YourserviceProvider(BaseProvider): """Send alerts and fetch data from YourService.""" # Required: Display name shown in UI PROVIDER_DISPLAY_NAME = "YourService" # Required: Categories for provider classification PROVIDER_CATEGORY = ["Monitoring"] # Optional: Tags for searchability PROVIDER_TAGS = ["alert", "data"] # Optional: Define required scopes/permissions PROVIDER_SCOPES = [ ProviderScope( name="read:alerts", description="Read alerts from YourService", mandatory=True, documentation_url="https://docs.yourservice.com/permissions", alias="Read Alerts", ), ProviderScope( name="write:alerts", description="Create and update alerts", mandatory=False, mandatory_for_webhook=True, # Required only for webhook setup ), ] # Optional: OAuth2 URL (MUST be set as class attribute, not in __init__) OAUTH2_URL = None # Or os.environ.get("YOURSERVICE_OAUTH2_URL") def __init__( self, 
context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) # Initialize any client libraries or state here # Note: Logger is automatically available as self.logger # Context manager provides access to: # - self.context_manager.tenant_id: Current tenant ID # - self.context_manager.workflow_id: Current workflow ID # - self.context_manager.workflow_execution_id: Current execution ID # - self.context_manager.get_full_context(): Full workflow context def validate_config(self): """ Validates required configuration for YourService provider. This is an abstract method that MUST be implemented. """ self.authentication_config = YourserviceProviderAuthConfig( **self.config.authentication ) def dispose(self): """ Cleanup any resources when provider is disposed. This is an abstract method that MUST be implemented, even if it just passes. """ pass ``` ### 3. Create the __init__.py File Create `keep/providers/yourservice_provider/__init__.py`: ```python from keep.providers.yourservice_provider.yourservice_provider import ( YourserviceProvider, YourserviceProviderAuthConfig ) __all__ = ["YourserviceProvider", "YourserviceProviderAuthConfig"] ``` ### 4. Add provider documentation Create `docs/providers/documentation/yourservice-provider.mdx` following the documentation template. Provider configuration fields are automatically documented through auto-generated snippets. Keep generates the snippet files in `docs/snippets/providers/` from the provider's AuthConfig metadata and includes them in the documentation automatically. ## Provider architecture ### Abstract methods Every provider must implement these two abstract methods from BaseProvider: 1. **`validate_config(self)`** - Validates and processes the provider configuration 2. 
**`dispose(self)`** - Clean up resources when the provider is disposed of ### Provider capabilities Providers expose capabilities through standard methods: - **`_notify(**kwargs)`** - Send notifications or alerts - **`_query(**kwargs)`** - Query data from the provider - **`_get_alerts()`** - Fetch alerts for monitoring - **`setup_webhook(...)`** - Configure webhook endpoints - **`validate_scopes()`** - Check provider permissions - **`expose()`** - Return parameters calculated during execution for use in workflows The public methods `notify()` and `query()` wrap the private implementations (`_notify()` and `_query()`) with additional capabilities like enrichment and error handling. Always implement the private methods. ### Provider discovery Keep automatically discovers providers based on naming conventions: - Location: `keep/providers/` directory - Directory naming: Must end with `_provider` (for example, `slack_provider`) - Main file: Must match directory name with `.py` extension (for example, `slack_provider.py`) - No explicit registration needed - just follow the naming convention ### Implementation examples #### Validate_config() ```python def validate_config(self): """Validate and process provider configuration.""" self.authentication_config = YourserviceProviderAuthConfig( **self.config.authentication ) ``` #### Dispose() ```python def dispose(self): """Cleanup any resources.""" # Close connections, cleanup clients, etc. # Can just pass if no cleanup needed pass ``` ### Provider type extraction The provider type is automatically extracted from your class name: - `YourserviceProvider` → `yourservice` - `ServiceNowProvider` → `service.now` - `DatadogProvider` → `datadog` This happens via the `_extract_type()` method in BaseProvider. 
### Provider attributes Providers should define the following class attributes: - `PROVIDER_DISPLAY_NAME`: String used for UI display (for example, "Slack") - `PROVIDER_CATEGORY`: List of categories from the allowed values (see Provider Categories section) - `PROVIDER_COMING_SOON`: Boolean flag to mark providers as not ready (default: False) - `WEBHOOK_INSTALLATION_REQUIRED`: Boolean to make webhook setup mandatory in UI (default: False) - `PROVIDER_TAGS`: List of tags describing provider capabilities (for example, ["alert", "messaging"]) - `PROVIDER_SCOPES`: List of ProviderScope objects defining required permissions - `PROVIDER_METHODS`: List of ProviderMethod objects for additional capabilities (see [Provider Methods](/providers/provider-methods)) - `FINGERPRINT_FIELDS`: List of field names used to calculate alert fingerprints - `OAUTH2_URL`: OAuth 2.0 authorization URL if provider supports OAuth 2.0 authentication ### Provider categories Providers must specify one or more categories from the following list: ```python PROVIDER_CATEGORY: list[Literal[ "AI", "Monitoring", "Incident Management", "Cloud Infrastructure", "Ticketing", "Identity", "Developer Tools", "Database", "Identity and Access Management", "Security", "Collaboration", "Organizational Tools", "CRM", "Queues", "Orchestration", "Others" ]] ``` ### Provider tags Valid options for `PROVIDER_TAGS`: - `"alert"` - Provider handles alerts - `"ticketing"` - Provider manages tickets - `"messaging"` - Provider sends messages - `"data"` - Provider queries data - `"queue"` - Provider manages queues - `"topology"` - Provider provides topology data - `"incident"` - Provider manages incidents ### Provider scope ```python @dataclass class ProviderScope: """ Provider scope model. Args: name (str): The name of the scope. description (Optional[str]): The description of the scope. mandatory (bool): Whether the scope is mandatory. 
mandatory_for_webhook (bool): Whether the scope is mandatory for webhook auto installation. documentation_url (Optional[str]): The documentation url of the scope. alias (Optional[str]): Another alias of the scope. """ name: str description: Optional[str] = None mandatory: bool = False mandatory_for_webhook: bool = False documentation_url: Optional[str] = None alias: Optional[str] = None ``` ### Provider config ```python @dataclass class ProviderConfig: """ Provider configuration model. Args: description (Optional[str]): The description of the provider. authentication (dict): The configuration for the provider. """ authentication: Optional[dict] name: Optional[str] = None description: Optional[str] = None def __post_init__(self): if not self.authentication: return for key, value in self.authentication.items(): if ( isinstance(value, str) and value.startswith("{{") and value.endswith("}}") ): self.authentication[key] = chevron.render(value, {"env": os.environ}) ``` ### Base provider ```python """ Base class for all providers. """ class BaseProvider(metaclass=abc.ABCMeta): OAUTH2_URL = None PROVIDER_SCOPES: list[ProviderScope] = [] PROVIDER_METHODS: list[ProviderMethod] = [] FINGERPRINT_FIELDS: list[str] = [] PROVIDER_TAGS: list[ Literal["alert", "ticketing", "messaging", "data", "queue", "topology", "incident"] ] = [] PROVIDER_DISPLAY_NAME: str = None PROVIDER_CATEGORY: list[str] = [] PROVIDER_COMING_SOON: bool = False WEBHOOK_INSTALLATION_REQUIRED: bool = False def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig, webhook_template: Optional[str] = None, webhook_description: Optional[str] = None, webhook_markdown: Optional[str] = None, provider_description: Optional[str] = None, ): """ Initialize a provider. Args: provider_id (str): The provider id. **kwargs: Provider configuration loaded from the provider yaml file. 
""" self.provider_id = provider_id self.config = config self.webhook_template = webhook_template self.webhook_description = webhook_description self.provider_description = provider_description self.context_manager = context_manager self.logger = context_manager.get_logger() self.validate_config() self.logger.debug( "Base provider initalized", extra={"provider": self.__class__.__name__} ) self.provider_type = self._extract_type() self.results = [] # tb: we can have this overriden by customer configuration, when initializing the provider self.fingerprint_fields = self.FINGERPRINT_FIELDS def _extract_type(self): """ Extract the provider type from the provider class name. Returns: str: The provider type. """ name = self.__class__.__name__ name_without_provider = name.replace("Provider", "") name_with_spaces = ( re.sub("([A-Z])", r" \1", name_without_provider).lower().strip() ) return name_with_spaces.replace(" ", ".") @abc.abstractmethod def dispose(self): """ Dispose of the provider. """ raise NotImplementedError("dispose() method not implemented") @abc.abstractmethod def validate_config(self): """ Validate provider configuration. """ raise NotImplementedError("validate_config() method not implemented") def validate_scopes(self) -> dict[str, bool | str]: """ Validate provider scopes. Returns: dict: where key is the scope name and value is whether the scope is valid (True boolean) or string with error message. """ return {} def notify(self, **kwargs): """ Output alert message. Args: **kwargs (dict): The provider context (with statement) """ # trigger the provider results = self._notify(**kwargs) self.results.append(results) # if the alert should be enriched, enrich it enrich_alert = kwargs.get("enrich_alert", []) if not enrich_alert or not results: return results if results else None self._enrich(enrich_alert, results) return results def _enrich(self, enrichments, results, audit_enabled=True): """ Enrich alert or incident with provider specific data. 
This method replaces the deprecated _enrich_alert method and supports both alert and incident enrichment. Args: enrichments: List of enrichment configurations results: Results from the provider action audit_enabled: Whether to audit the enrichment operation (default: True) """ self.logger.debug("Extracting the fingerprint from the alert") if "fingerprint" in results: fingerprint = results["fingerprint"] elif self.context_manager.foreach_context.get("value", {}): # TODO: if it's zipped, we need to extract the fingerprint from the zip (i.e. multiple foreach) fingerprint = self.context_manager.foreach_context.get("value", {}).get( "fingerprint" ) # else, if we are in an event context, use the event fingerprint elif self.context_manager.event_context: # TODO: map all cases event_context is dict and update them to the DTO # and remove this if statement if isinstance(self.context_manager.event_context, dict): fingerprint = self.context_manager.event_context.get("fingerprint") # Alert DTO else: fingerprint = self.context_manager.event_context.fingerprint else: fingerprint = None if not fingerprint: self.logger.error( "No fingerprint found for alert enrichment", extra={"provider": self.provider_id}, ) raise Exception("No fingerprint found for alert enrichment") self.logger.debug("Fingerprint extracted", extra={"fingerprint": fingerprint}) _enrichments = {} # enrich only the requested fields for enrichment in enrichments: try: if enrichment["value"].startswith("results."): val = enrichment["value"].replace("results.", "") parts = val.split(".") r = copy.copy(results) for part in parts: r = r[part] _enrichments[enrichment["key"]] = r else: _enrichments[enrichment["key"]] = enrichment["value"] except Exception: self.logger.error( f"Failed to enrich alert - enrichment: {enrichment}", extra={"fingerprint": fingerprint, "provider": self.provider_id}, ) continue self.logger.info("Enriching alert", extra={"fingerprint": fingerprint}) try: 
enrich_alert(self.context_manager.tenant_id, fingerprint, _enrichments) except Exception as e: self.logger.error( "Failed to enrich alert in db", extra={"fingerprint": fingerprint, "provider": self.provider_id}, ) raise e self.logger.info("Alert enriched", extra={"fingerprint": fingerprint}) def _notify(self, **kwargs): """ Output alert message. Args: **kwargs (dict): The provider context (with statement) """ raise NotImplementedError("notify() method not implemented") def _query(self, **kwargs: dict): """ Query the provider using the given query Args: kwargs (dict): The provider context (with statement) Raises: NotImplementedError: _description_ """ raise NotImplementedError("query() method not implemented") def query(self, **kwargs: dict): # just run the query results = self._query(**kwargs) # now add the type of the results to the global context if results and isinstance(results, list): self.context_manager.dependencies.add(results[0].__class__) elif results: self.context_manager.dependencies.add(results.__class__) enrich_alert = kwargs.get("enrich_alert", []) if enrich_alert: self._enrich(enrich_alert, results) # and return the results return results @staticmethod def _format_alert( event: dict | list[dict], provider_instance: "BaseProvider" = None ) -> AlertDto | list[AlertDto]: """ Format incoming event(s) into AlertDto object(s). Args: event: Single event dict or list of event dicts provider_instance: Optional provider instance for context Returns: AlertDto or list of AlertDto objects """ raise NotImplementedError("format_alert() method not implemented") @classmethod def format_alert(cls, event: dict) -> AlertDto | list[AlertDto]: logger = logging.getLogger(__name__) logger.debug("Formatting alert") formatted_alert = cls._format_alert(event) logger.debug("Alert formatted") return formatted_alert @staticmethod def get_alert_fingerprint(alert: AlertDto, fingerprint_fields: list = []) -> str: """ Get the fingerprint of an alert. 
Args: event (AlertDto): The alert to get the fingerprint of. fingerprint_fields (list, optional): The fields we calculate the fingerprint upon. Defaults to []. Returns: str: hexdigest of the fingerprint or the event.name if no fingerprint_fields were given. """ if not fingerprint_fields: return alert.name fingerprint = hashlib.sha256() event_dict = alert.dict() for fingerprint_field in fingerprint_fields: fingerprint_field_value = event_dict.get(fingerprint_field, None) if isinstance(fingerprint_field_value, (list, dict)): fingerprint_field_value = json.dumps(fingerprint_field_value) if fingerprint_field_value: fingerprint.update(str(fingerprint_field_value).encode()) return fingerprint.hexdigest() def get_alerts_configuration(self, alert_id: Optional[str] = None): """ Get configuration of alerts from the provider. Args: alert_id (Optional[str], optional): If given, gets a specific alert by id. Defaults to None. """ # todo: we'd want to have a common alert model for all providers (also for consistent output from GPT) raise NotImplementedError("get_alerts() method not implemented") def deploy_alert(self, alert: dict, alert_id: Optional[str] = None): """ Deploy an alert to the provider. Args: alert (dict): The alert to deploy. alert_id (Optional[str], optional): If given, deploys a specific alert by id. Defaults to None. """ raise NotImplementedError("deploy_alert() method not implemented") def _get_alerts(self) -> list[AlertDto]: """ Get alerts from the provider. """ raise NotImplementedError("get_alerts() method not implemented") def get_alerts(self) -> list[AlertDto]: """ Get alerts from the provider. 
""" with tracer.start_as_current_span(f"{self.__class__.__name__}-get_alerts"): alerts = self._get_alerts() # enrich alerts with provider id for alert in alerts: alert.providerId = self.provider_id return alerts def get_alerts_by_fingerprint(self, tenant_id: str) -> dict[str, list[AlertDto]]: """ Get alerts from the provider grouped by fingerprint, sorted by lastReceived. Returns: dict[str, list[AlertDto]]: A dict of alerts grouped by fingerprint, sorted by lastReceived. """ alerts = self.get_alerts() if not alerts: return {} # get alerts, group by fingerprint and sort them by lastReceived with tracer.start_as_current_span(f"{self.__class__.__name__}-get_last_alerts"): get_attr = operator.attrgetter("fingerprint") grouped_alerts = { fingerprint: list(alerts) for fingerprint, alerts in itertools.groupby( sorted( alerts, key=get_attr, ), get_attr, ) } # enrich alerts with tracer.start_as_current_span(f"{self.__class__.__name__}-enrich_alerts"): pulled_alerts_enrichments = get_enrichments( tenant_id=tenant_id, fingerprints=grouped_alerts.keys(), ) for alert_enrichment in pulled_alerts_enrichments: if alert_enrichment: alerts_to_enrich = grouped_alerts.get( alert_enrichment.alert_fingerprint ) for alert_to_enrich in alerts_to_enrich: parse_and_enrich_deleted_and_assignees( alert_to_enrich, alert_enrichment.enrichments ) for enrichment in alert_enrichment.enrichments: # set the enrichment setattr( alert_to_enrich, enrichment, alert_enrichment.enrichments[enrichment], ) return grouped_alerts def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ) -> dict | None: """ Setup a webhook for the provider. Args: tenant_id (str): The tenant ID keep_api_url (str): The Keep API URL for webhook callbacks api_key (str): The API key for authentication setup_alerts (bool, optional): Whether to setup alerts. Defaults to True. 
Returns: dict | None: Dictionary of secrets to be saved if any, None otherwise Raises: NotImplementedError: If not implemented by the provider """ raise NotImplementedError("setup_webhook() method not implemented") @staticmethod def get_alert_schema() -> dict: """ Get the alert schema description for the provider. e.g. How to define an alert for the provider that can be pushed via the API. Returns: str: The alert format description. """ raise NotImplementedError( "get_alert_format_description() method not implemented" ) @staticmethod def oauth2_logic(**payload) -> dict: """ Logic for oauth2 authentication. For example, in Slack oauth2, we need to get the code from the payload and exchange it for a token. return: dict: The secrets to be saved as the provider configuration. (e.g. the Slack access token) """ raise NotImplementedError("oauth2_logic() method not implemented") @staticmethod def parse_event_raw_body(raw_body: bytes | dict) -> dict: """ Parse the raw body of an event and create an ingestible dict from it. For instance, in parseable, the "event" is just a string > b'Alert: Server side error triggered on teststream1\nMessage: server reporting status as 500\nFailing Condition: status column equal to abcd, 2 times' and we want to return an object > {'alert': 'Server side error triggered on teststream1', 'message': 'server reporting status as 500', 'failing_condition': 'status column equal to abcd, 2 times'} If this method is not implemented for a provider, it should convert the raw body to a dict. Args: raw_body (bytes | dict): The raw body of the incoming event (can be bytes or dict) Returns: dict: Ingestible event dictionary """ if isinstance(raw_body, dict): return raw_body return raw_body def get_logs(self, limit: int = 5) -> list: """ Get logs from the provider. Args: limit (int): The number of logs to get. """ raise NotImplementedError("get_logs() method not implemented") def expose(self): """Expose parameters that were calculated during query time. 
Each provider can expose parameters that were calculated during query time. E.g. parameters that were supplied by the user and were rendered by the provider. A concrete example is the "_from" and "to" of the Datadog Provider which are calculated during execution. """ # TODO - implement dynamically using decorators and return {} def start_consume(self): """Get the consumer for the provider. should be implemented by the provider if it has a consumer. for an example, see Kafka Provider Returns: Consumer: The consumer for the provider. """ return def status(self) -> bool: """Return the status of the provider. Returns: bool: The status of the provider. """ return { "status": "should be implemented by the provider if it has a consumer", "error": "", } @property def is_consumer(self) -> bool: """Return consumer if the inherited class has a start_consume method. Returns: bool: _description_ """ return self.start_consume.__qualname__ != "BaseProvider.start_consume" def _push_alert(self, alert: dict): """ Push an alert to the provider. Args: alert (dict): The alert to push. 
""" # if this is not a dict, try to convert it to a dict if not isinstance(alert, dict): try: alert_data = json.loads(alert) except Exception: alert_data = alert_data else: alert_data = alert # if this is still not a dict, we can't push it if not isinstance(alert_data, dict): self.logger.warning( "We currently support only alert represented as a dict, dismissing alert", extra={"alert": alert}, ) return # now try to build the alert model # we will have a lot of default values here to support all providers and all cases, the # way to fine tune those would be to use the provider specific model or enforce that the event from the queue will be casted into the fields alert_model = AlertDto( id=alert_data.get("id", str(uuid.uuid4())), name=alert_data.get("name", "alert-from-event-queue"), status=alert_data.get("status", AlertStatus.FIRING), lastReceived=alert_data.get("lastReceived", datetime.datetime.now()), environment=alert_data.get("environment", "alert-from-event-queue"), isDuplicate=alert_data.get("isDuplicate", False), duplicateReason=alert_data.get("duplicateReason", None), service=alert_data.get("service", "alert-from-event-queue"), source=alert_data.get("source", [self.provider_type]), message=alert_data.get("message", "alert-from-event-queue"), description=alert_data.get("description", "alert-from-event-queue"), severity=alert_data.get("severity", AlertSeverity.INFO), pushed=alert_data.get("pushed", False), event_id=alert_data.get("event_id", str(uuid.uuid4())), url=alert_data.get("url", None), fingerprint=alert_data.get("fingerprint", None), ) # push the alert to the provider url = f'{os.environ["KEEP_API_URL"]}/alerts/event' headers = { "Content-Type": "application/json", "Accept": "application/json", "X-API-KEY": self.context_manager.api_key, } response = requests.post(url, json=alert_model.dict(), headers=headers) try: response.raise_for_status() self.logger.info("Alert pushed successfully") except Exception: self.logger.error( f"Failed to push alert to 
{self.provider_id}: {response.content}" ) ``` ## Provider types and capabilities ### Base provider types Keep supports several base provider types, each with specific capabilities: 1. **BaseProvider** (`keep/providers/base/base_provider.py`) - Basic provider capabilities - Methods: `_notify()`, `_query()`, `_get_alerts()` - Use for: General integrations 2. **BaseTopologyProvider** (`keep/providers/base/base_provider.py`) - Extends BaseProvider - Methods: `pull_topology()` - Use for: Services that provide infrastructure topology data - Example: Datadog Provider (`keep/providers/datadog_provider/datadog_provider.py`) 3. **BaseIncidentProvider** (`keep/providers/base/base_provider.py`) - Extends BaseProvider - Methods: `_get_incidents()`, `_format_incident()` (static), `format_incident()` (classmethod), `setup_incident_webhook()` - Use for: Incident management systems - Example: PagerDuty Provider (`keep/providers/pagerduty_provider/pagerduty_provider.py`) ### Common capabilities #### 1. Notification (`_notify`) Send alerts or messages to external services: ```python def _notify(self, title: str, description: str = "", **kwargs) -> dict: # Implementation ``` #### 2. Query (`_query`) Fetch data from external services: ```python def _query(self, query: str, **kwargs) -> list: # Implementation ``` #### 3. Alert Fetching (`_get_alerts`) Pull alerts for monitoring: ```python def _get_alerts(self) -> list[AlertDto]: # Implementation ``` #### 4. Webhook support Handle incoming webhooks: ```python @staticmethod def parse_event_raw_body(raw_body: bytes | dict) -> dict: # Parse webhook payload @staticmethod def _format_alert(event: dict, provider_instance: "BaseProvider" = None) -> AlertDto | list[AlertDto]: # Format webhook events into alerts ``` #### 5. 
OAuth 2.0 support Handle OAuth 2.0 authentication: ```python # IMPORTANT: Define OAUTH2_URL as a class attribute at the class level, NOT in __init__ class YourserviceProvider(BaseProvider): OAUTH2_URL = os.environ.get("YOURSERVICE_OAUTH2_URL") # Must be at class level @staticmethod def oauth2_logic(**payload) -> dict: # OAuth 2.0 implementation ``` #### 6. Consumer providers For providers that consume messages from queues or streams: ```python def start_consume(self): """ Start consuming messages from the provider. This method is called when Keep starts the provider as a consumer. Implement long-running consumption logic here. """ # Example: Kafka consumer while True: message = self.consumer.poll() if message: self._push_alert(message) @property def is_consumer(self) -> bool: """Provider is automatically detected as consumer if start_consume is implemented.""" return True # Optional override: BaseProvider.is_consumer already returns True when start_consume is overridden def status(self) -> dict: """Return the status of the consumer.""" return { "status": "running" if self.consumer_active else "stopped", "error": self.last_error if hasattr(self, 'last_error') else "" } ``` ### Specialized base classes Keep provides specialized base classes for specific provider types: #### Base topology provider For providers that manage infrastructure topology and service dependencies: ```python from keep.providers.base.base_provider import BaseTopologyProvider class MyTopologyProvider(BaseTopologyProvider): def pull_topology(self) -> tuple[list[TopologyServiceInDto], dict]: """ Pull topology data from the provider. 
Returns: tuple: A tuple of (services list, edges dict) """ # Implement topology fetching logic pass ``` #### BaseIncidentProvider For providers that manage incidents and incident response: ```python from keep.providers.base.base_provider import BaseIncidentProvider class MyIncidentProvider(BaseIncidentProvider): def _get_incidents(self) -> list[IncidentDto]: """ Fetch incidents from the provider (abstract method). Returns: list[IncidentDto]: List of incidents """ # Implement incident fetching logic pass @staticmethod def _format_incident( event: dict, provider_instance: "BaseProvider" = None ) -> IncidentDto | list[IncidentDto]: """ Format raw incident data into IncidentDto objects. Args: event: Raw incident data from webhook or API provider_instance: Optional provider instance for context Returns: IncidentDto or list of IncidentDto objects """ # Implement incident formatting logic pass def setup_incident_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True, ) -> dict | None: """ Setup webhook for incident updates. Args: tenant_id: Tenant identifier keep_api_url: Keep API URL for callbacks api_key: API key for authentication setup_alerts: Whether to also setup alert webhooks Returns: dict | None: Secrets to save if any """ # Implement webhook setup logic pass ``` Note: The `get_incidents()` method is automatically provided by the base class and wraps `_get_incidents()`. The `format_incident()` class method handles provider loading and calls `_format_incident()`. 
### Authentication configuration Providers should define an authentication configuration class as a dataclass with proper field types and validation. Declare required fields (no default) before fields with defaults; a dataclass raises `TypeError` at class creation when a field without a default follows one with a default: ```python import dataclasses import pydantic from keep.validation.fields import HttpsUrl, NoSchemeUrl, UrlPort @pydantic.dataclasses.dataclass class MyProviderAuthConfig: """Configuration for MyProvider authentication.""" # Required fields (no default) come first api_key: str = dataclasses.field( metadata={ "required": True, "description": "API Key for authentication", "sensitive": True, # Masks the field value in UI } ) host: NoSchemeUrl = dataclasses.field( metadata={ "required": True, "description": "Service hostname", "hint": "example.com or 192.168.1.1", "validation": "no_scheme_url", # Maps to NoSchemeUrl validator } ) workspace_id: str = dataclasses.field( metadata={ "required": True, "description": "Workspace identifier", "hint": "Can be found in Settings > Workspace", } ) # Optional fields (with defaults) follow api_url: HttpsUrl = dataclasses.field( default="https://api.example.com", metadata={ "required": False, "description": "API endpoint URL (HTTPS only)", "documentation_url": "https://docs.example.com/api", "validation": "https_url", # Maps to HttpsUrl validator } ) port: UrlPort = dataclasses.field( default=443, metadata={ "required": False, "description": "Service port", "validation": "port", # Validates port range 1-65535 } ) region: str = dataclasses.field( default="us-east-1", metadata={ "required": False, "description": "Service region", "type": "select", # Renders as dropdown in UI "options": ["us-east-1", "eu-west-1", "ap-south-1"], } ) ``` #### Field validation Keep provides built-in field validation through custom Pydantic field types: | Validation Type | Field Type | Description | Example | |----------------|------------|-------------|---------| | `"https_url"` | `HttpsUrl` | Validates HTTPS URLs only | `https://api.example.com` | | `"any_http_url"` | `pydantic.AnyHttpUrl` | Validates any HTTP/HTTPS URL | `http://example.com` | | `"no_scheme_url"` | `NoSchemeUrl` | 
Validates URLs without scheme | `example.com:8080` | | `"port"` | `UrlPort` | Validates port numbers (1-65535) | `443` | | `"multihost_url"` | `MultiHostUrl` | Validates multi-host URLs | `mongodb://host1:27017,host2:27017` | | `"no_scheme_multihost_url"` | `NoSchemeMultiHostUrl` | Multi-host URLs without scheme | `host1:9092,host2:9092` | To use validation: 1. Import the appropriate field type from `keep.validation.fields` 2. Use it as the field type annotation 3. Add the corresponding validation string in metadata Example implementations: ```python # HTTPS-only webhook URL webhook_url: HttpsUrl = dataclasses.field( metadata={ "required": True, "description": "Webhook endpoint (HTTPS required)", "sensitive": True, "validation": "https_url", } ) # Database connection with multiple hosts connection_string: MultiHostUrl = dataclasses.field( metadata={ "required": True, "description": "Database connection string", "hint": "mongodb://host1:27017,host2:27017/dbname", "validation": "multihost_url", } ) # SSH connection ssh_host: NoSchemeUrl = dataclasses.field( metadata={ "required": True, "description": "SSH hostname or IP", "validation": "no_scheme_url", } ) ssh_port: UrlPort = dataclasses.field( default=22, metadata={ "required": False, "description": "SSH port", "validation": "port", } ) ``` #### Metadata fields reference - `required`: Whether the field is mandatory - `description`: Field description shown in UI - `sensitive`: Whether to mask the field value (for secrets) - `hidden`: Whether to hide the field in UI - `documentation_url`: Link to relevant documentation - `hint`: Help text for users - `validation`: Validation type string (see preceding table) - `type`: UI input type (for example, "select" for dropdown) - `options`: List of valid options for select fields - `config_main_group`: Group name for organizing fields in UI - `config_sub_group`: Sub-group name for nested organization The validation system ensures that configuration values are valid before Keep 
instantiates the provider. Invalid values are rejected with clear error messages, improving the user experience and preventing runtime errors. ## Testing your provider ### 1. Unit test Create `tests/test_yourservice_provider.py`: ```python import pytest from keep.providers.yourservice_provider.yourservice_provider import YourserviceProvider from keep.providers.models.provider_config import ProviderConfig from keep.contextmanager.contextmanager import ContextManager def test_yourservice_provider_init(): """Test provider initialization.""" config = ProviderConfig( authentication={ "api_endpoint": "https://api.yourservice.com", "api_key": "test-key", } ) context_manager = ContextManager(tenant_id="test", workflow_id="test") provider = YourserviceProvider( context_manager=context_manager, provider_id="test", config=config ) assert provider.authentication_config.api_endpoint == "https://api.yourservice.com" assert provider.authentication_config.api_key == "test-key" @pytest.fixture def mock_requests(monkeypatch): """Mock requests module.""" import requests class MockResponse: def __init__(self, json_data, status_code=200): self.json_data = json_data self.status_code = status_code def json(self): return self.json_data def raise_for_status(self): pass def mock_post(*args, **kwargs): return MockResponse({"success": True}) def mock_get(*args, **kwargs): return MockResponse({"alerts": []}) monkeypatch.setattr(requests, "post", mock_post) monkeypatch.setattr(requests, "get", mock_get) def test_yourservice_notify(mock_requests): """Test notification sending.""" config = ProviderConfig( authentication={ "api_endpoint": "https://api.yourservice.com", "api_key": "test-key", } ) context_manager = ContextManager(tenant_id="test", workflow_id="test") provider = YourserviceProvider( context_manager=context_manager, provider_id="test", config=config ) result = provider.notify(message="Test message") assert result["success"] is True ``` ### 2. 
Integration test Test with the provider factory: ```python def test_provider_factory_loading(): """Test that provider loads correctly through factory.""" from keep.providers.providers_factory import ProvidersFactory # Get provider class provider_class = ProvidersFactory.get_provider_class("yourservice") assert provider_class.__name__ == "YourserviceProvider" # Get all providers all_providers = ProvidersFactory.get_all_providers() yourservice = next((p for p in all_providers if p.type == "yourservice"), None) assert yourservice is not None assert yourservice.display_name == "YourService" ``` ### 3. Manual testing You can test your provider by running it directly: ```bash cd keep python -m keep.providers.yourservice_provider.yourservice_provider ``` The `if __name__ == "__main__":` block allows you to test provider initialization and basic capabilities. Add a test block to your provider for direct execution: ```python if __name__ == "__main__": # Test the provider directly import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Initialize the provider with test config config = ProviderConfig( authentication={ "api_endpoint": "https://api.yourservice.com", "api_key": "test-key", } ) provider = YourserviceProvider( context_manager=context_manager, provider_id="test", config=config ) # Test provider methods print("Provider initialized successfully!") # Test specific functionality try: result = provider._query("test query") print(f"Query result: {result}") except Exception as e: print(f"Query failed: {e}") ``` ## Best practices ### 1. Error handling Always handle API errors gracefully: ```python from keep.exceptions.provider_exception import ProviderException try: response = requests.get(url) response.raise_for_status() except requests.exceptions.RequestException as e: raise ProviderException(f"Failed to fetch data: {str(e)}") ``` ### 2. 
Logging Use the provider's logger: ```python self.logger.info("Fetching alerts from YourService") self.logger.error(f"Failed to connect: {str(e)}") ``` ### 3. Configuration validation Validate configuration in `validate_config()`: ```python def validate_config(self): self.authentication_config = YourserviceProviderAuthConfig( **self.config.authentication ) # Additional validation if not self.authentication_config.api_endpoint.startswith("https://"): raise ValueError("API endpoint must use HTTPS") ``` ### 4. Alert formatting When returning alerts, use Keep's standard format: ```python from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus alert = AlertDto( id="unique-alert-id", name="Alert Title", description="Detailed description", severity=AlertSeverity.HIGH, status=AlertStatus.FIRING, lastReceived=datetime.now().isoformat(), source=["yourservice"], fingerprint="unique-fingerprint", labels={"key": "value"}, annotations={"runbook": "https://docs.example.com"}, ) ``` ### 5. Secrets management Never hardcode secrets. Use environment variables or configuration: ```python client_id = os.environ.get("YOURSERVICE_CLIENT_ID") if not client_id: raise ProviderException("YOURSERVICE_CLIENT_ID environment variable not set") ``` ## Common patterns ### 1. Provider health checks Implement health monitoring using the `ProviderHealthMixin`: ```python from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin class YourserviceProvider(BaseProvider, ProviderHealthMixin): HAS_HEALTH_CHECK = True # The mixin provides automatic health checking for: # - Topology coverage validation # - Spammy alerts detection # - Alerting rule usage monitoring ``` The health check mixin is particularly useful for monitoring providers that collect topology data or handle high volumes of alerts. ### 2. 
Pagination Handle paginated API responses: ```python def _get_all_items(self): items = [] page = 1 while True: response = self._query_page(page) items.extend(response["items"]) if not response.get("has_next"): break page += 1 return items ``` ### 3. Rate limiting Respect API rate limits: ```python import time from typing import Any import requests from keep.exceptions.provider_exception import ProviderException def _rate_limited_request(self, url: str, **kwargs) -> Any: max_retries = 3 for attempt in range(max_retries): try: response = requests.get(url, **kwargs) if response.status_code == 429: # Rate limited retry_after = int(response.headers.get("Retry-After", 60)) self.logger.warning(f"Rate limited, waiting {retry_after}s") time.sleep(retry_after) continue response.raise_for_status() return response.json() except Exception: if attempt == max_retries - 1: raise time.sleep(2 ** attempt) # Exponential backoff # Every attempt was rate limited: fail loudly instead of returning None raise ProviderException(f"Still rate limited after {max_retries} attempts: {url}") ``` ### 4. Caching Cache frequently accessed data: ```python from datetime import datetime, timedelta from typing import Any class YourserviceProvider(BaseProvider): def __init__(self, context_manager, provider_id, config): super().__init__(context_manager, provider_id, config) self._cache = {} self._cache_ttl = timedelta(minutes=5) def _get_cached_data(self, key: str) -> Any: if key in self._cache: data, timestamp = self._cache[key] if datetime.now() - timestamp < self._cache_ttl: return data return None def _set_cached_data(self, key: str, data: Any): self._cache[key] = (data, datetime.now()) ``` ### 5. Webhook signature verification Verify webhook authenticity: ```python import hmac import hashlib @staticmethod def verify_webhook_signature(raw_body: bytes, signature: str, secret: str) -> bool: expected = hmac.new( secret.encode(), raw_body, hashlib.sha256 ).hexdigest() return hmac.compare_digest(expected, signature) ``` ### 6. 
Exposing runtime parameters Use the `expose()` method to make runtime-calculated values available to workflows: ```python from datetime import datetime class YourserviceProvider(BaseProvider): def __init__(self, context_manager, provider_id, config): super().__init__(context_manager, provider_id, config) self._from_timestamp = None self._to_timestamp = None def _query(self, metric: str, from_time: str = "1h", **kwargs): # Calculate actual timestamps self._to_timestamp = datetime.now() self._from_timestamp = self._to_timestamp - parse_duration(from_time) # Query with calculated timestamps return self._fetch_metrics(metric, self._from_timestamp, self._to_timestamp) def expose(self): """Expose calculated parameters for workflow use.""" exposed = {} if self._from_timestamp: exposed["from"] = self._from_timestamp.isoformat() if self._to_timestamp: exposed["to"] = self._to_timestamp.isoformat() return exposed ``` This allows workflows to access the actual timestamps used in queries, not just the relative time strings. ## Complete provider example Here's a minimal example of a complete provider implementation: ```python from typing import Optional from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig from keep.contextmanager.contextmanager import ContextManager class MyProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "My Service" PROVIDER_CATEGORY = ["Monitoring", "Incident Management"] PROVIDER_TAGS = ["alert", "messaging"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig, webhook_template: Optional[str] = None, webhook_description: Optional[str] = None, webhook_markdown: Optional[str] = None, provider_description: Optional[str] = None, ): super().__init__( context_manager, provider_id, config, webhook_template, webhook_description, webhook_markdown, provider_description ) def validate_config(self): # Validate the provider configuration pass def dispose(self): # Clean up resources pass def _query(self, **kwargs): # 
Implement query logic pass def _notify(self, **kwargs): # Implement notification logic pass ``` ## File references - **Base Provider Classes**: `keep/providers/base/base_provider.py` - **Provider Models**: `keep/providers/models/` - **Provider Factory**: `keep/providers/providers_factory.py` - **Provider Exceptions**: `keep/exceptions/provider_exception.py` - **Example Providers**: - Simple: `keep/providers/slack_provider/slack_provider.py` - Complex: `keep/providers/datadog_provider/datadog_provider.py` - Database: `keep/providers/clickhouse_provider/clickhouse_provider.py` - Incident: `keep/providers/pagerduty_provider/pagerduty_provider.py` - Topology: `keep/providers/datadog_provider/datadog_provider.py` - **Tests**: `tests/test_*_provider.py` - **Documentation**: `docs/providers/documentation/` - **Additional Docs**: - `docs/providers/adding-a-new-provider.mdx` - `docs/providers/provider-methods.mdx` - `docs/providers/linked-providers.mdx` ## Checklist - [ ] Create provider directory and files - [ ] Implement AuthConfig class with proper metadata - [ ] Implement provider class with required methods - [ ] Add provider to `__init__.py` - [ ] Set appropriate PROVIDER_DISPLAY_NAME, PROVIDER_CATEGORY, and PROVIDER_TAGS - [ ] Implement `validate_config()` and `dispose()` - [ ] Add at least one capability (`_notify`, `_query`, or `_get_alerts`) - [ ] Create documentation in `docs/providers/documentation/` - [ ] Write unit tests - [ ] Test with provider factory - [ ] Handle errors gracefully - [ ] Add logging statements - [ ] Validate in Keep UI - [ ] If supporting webhooks, implement `_format_alert()` static method - [ ] If supporting OAuth 2.0, set OAUTH2_URL as class attribute - [ ] Consider implementing `validate_scopes()` for scope validation - [ ] Consider implementing `get_provider_metadata()` for provider versioning ## Getting help - Review existing providers for examples - Check the base provider classes for available methods - Look at test files for testing 
patterns - Ask in Keep's GitHub discussions or issues - Review the [Provider Methods documentation](/providers/provider-methods) for advanced capabilities - Understand [Linked vs Connected Providers](/providers/linked-providers) ================================================ FILE: docs/providers/documentation/airflow-provider.mdx ================================================ --- title: "Airflow" sidebarTitle: "Airflow Provider" description: "The Airflow provider integration allows you to send alerts (e.g. DAG failures) from Airflow to Keep via webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/airflow-snippet-autogenerated.mdx'; ## Overview [Apache Airflow](https://airflow.apache.org/docs/apache-airflow/stable/index.html) is an open-source tool for programmatically authoring, scheduling, and monitoring data pipelines. Airflow's extensible Python framework enables you to build workflows that connect with virtually any technology. When working with Airflow, it's essential to monitor the health of your DAGs and tasks to ensure that your data pipelines run smoothly. The Airflow Provider integration allows seamless communication between Airflow and Keep, so you can forward alerts, such as task failures, directly to Keep via webhook configurations. ![Apache Airflow](/images/airflow_1.png) ## Connecting Airflow to Keep ### Alert Integration via Webhook To connect Airflow to Keep, configure Airflow to send alerts using Keep's webhook. You must provide: - **Keep Webhook URL**: The webhook URL provided by Keep (for example, `https://api.keephq.dev/alerts/event/airflow`). - **Keep API Key**: The API key generated on Keep's platform, which is used for authentication. A common method to integrate Airflow with Keep is by configuring alerts through [Airflow Callbacks](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/callbacks.html). 
For instance, when an Airflow task fails, a callback can send an alert to Keep via the webhook. There are several steps to implement this: ### Step 1: Define Keep's Alert Information Structure your alert payload with the following information: ```python data = { "name": "Airflow Task Failure", "description": "Task keep_task failed in DAG keep_dag", "status": "firing", "service": "pipeline", "severity": "critical", } ``` ### Step 2: Configure Keep's Webhook Credentials To send alerts to Keep, configure the webhook URL and API key. Below is an example of how to send an alert using Python: > **Note**: You need to set up the `KEEP_API_KEY` environment variable with your Keep API key. ```python import os import requests def send_alert_to_keep(dag_id, task_id, execution_date, error_message): # Replace with your specific Keep webhook URL if different. keep_webhook_url = "https://api.keephq.dev/alerts/event/airflow" api_key = os.getenv("KEEP_API_KEY") headers = { "Content-Type": "application/json", "Accept": "application/json", "X-API-KEY": api_key, } data = { "name": f"Airflow Task Failure: {task_id}", "message": f"Task {task_id} failed in DAG {dag_id} at {execution_date}", "status": "firing", "service": "pipeline", "severity": "critical", "description": str(error_message), } response = requests.post(keep_webhook_url, headers=headers, json=data) response.raise_for_status() ``` ### Step 3: Configure the Airflow Callback Function Now, configure the callback so that an alert is sent to Keep when a task fails. 
You can attach this callback to one or more tasks in your DAG as shown below: ```python import os import requests from datetime import datetime from datetime import timedelta from airflow import DAG from airflow.operators.bash_operator import BashOperator default_args = { 'owner': 'airflow', 'depends_on_past': False, 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } def send_alert_to_keep(dag_id, task_id, execution_date, error_message): # Replace with your specific Keep webhook URL if different. keep_webhook_url = "https://api.keephq.dev/alerts/event/airflow" api_key = os.getenv("KEEP_API_KEY") headers = { "Content-Type": "application/json", "Accept": "application/json", "X-API-KEY": api_key, } data = { "name": f"Airflow Task Failure: {task_id}", "message": f"Task {task_id} failed in DAG {dag_id} at {execution_date}", "status": "firing", "service": "pipeline", "severity": "critical", "description": str(error_message), } response = requests.post(keep_webhook_url, headers=headers, json=data) response.raise_for_status() def task_failure_callback(context): send_alert_to_keep( dag_id=context["dag"].dag_id, task_id=context["task_instance"].task_id, execution_date=context["execution_date"], error_message=context.get("exception", "Unknown error"), ) dag = DAG( dag_id="keep_dag", default_args=default_args, description="A simple DAG with Keep integration", schedule_interval=None, start_date=datetime(2025, 1, 1), catchup=False, ) task = BashOperator( task_id="keep_task", bash_command="exit 1", dag=dag, on_failure_callback=task_failure_callback, ) ``` ### Step 4: Observe Alerts in Keep After setting up the above configuration, any failure in your Airflow tasks will trigger an alert that is sent to Keep via the configured webhook. You can then view, manage, and respond to these alerts using the Keep dashboard. 
![Keep Alerts](/images/airflow_2.png) ## Useful Links - [Airflow Documentation](https://airflow.apache.org/docs/apache-airflow/stable/index.html) - [Airflow Callbacks](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/callbacks.html) - [Airflow Connection](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html) ================================================ FILE: docs/providers/documentation/aks-provider.mdx ================================================ --- title: "Azure AKS" description: "Azure AKS provider to view kubernetes resources." --- import AutoGeneratedSnippet from '/snippets/providers/aks-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Azure AKS, follow below steps: 1. Log in to your [Azure](https://azure.microsoft.com/) account. 2. Go to your kubernetes service page and click on `Connect` button and then click on `Open Cloud Shell`. 3. Run `az ad sp create-for-rbac --role owner --scopes /subscriptions/` in the cloud shell, you will get response similar to: ``` { "appId": "xxxxxx-xxxxx-xxxxxx-xxxx", "displayName": "azure-cli-2023-11-06-13-00-52", "password": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "tenant": "xxxxx-xxxxx-xxxx-xxxxx" } ``` In above JSON object, the `appId` is `client_id`, `password` is `client_secret` and `tenant` is `tenant_id` ## Notes - This provider allows you to interact with Azure AKS to query resources in kubernetes cluster. 
## Useful Links - [Azure AKS List Cluster User Creds](https://learn.microsoft.com/en-us/rest/api/aks/managed-clusters/list-cluster-user-credentials?view=rest-aks-2023-08-01&tabs=HTTP) - [Azure AKS Doc](https://learn.microsoft.com/en-us/azure/aks/) ================================================ FILE: docs/providers/documentation/amazonsqs-provider.mdx ================================================ --- title: "AmazonSQS Provider" sidebarTitle: "AmazonSQS Provider" description: "The AmazonSQS provider enables you to pull & push alerts to the Amazon SQS Queue." --- import AutoGeneratedSnippet from '/snippets/providers/amazonsqs-snippet-autogenerated.mdx'; ## Overview The **AmazonSQS Provider** facilitates consuming SQS messages as alerts and notifying/pushing messages to an SQS Queue. ## Inputs for AmazonSQS Action - `message`: str: Body/Message for the notification - `group_id`: str | None: Mandatory only if Queue is of type FIFO, ignored in case of a normal Queue. - `dedup_id`: str | None: Mandatory only if Queue is of type FIFO, ignored in case of a normal Queue. - `**kwargs`: dict | None: You can pass additional key-value pairs that will be sent as MessageAttributes in the notification. ## Output for AmazonSQS Action For more detail, visit [sqs-documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sqs/client/send_message.html#). ```json { 'MD5OfMessageBody': 'string', 'MD5OfMessageAttributes': 'string', 'MD5OfMessageSystemAttributes': 'string', 'MessageId': 'string', 'SequenceNumber': 'string' } ``` - When using the AmazonSQS action, if your queue is FIFO, then it is **mandatory** to pass a dedup_id & group_id. - All the extra fields present in the MessageAttributes are stored in alert.label as a key-value pair dictionary.
- You can pass these attributes in the SQS Queue message and Keep will extract and use these fields for the alert - name - status: Possible values 'firing' | 'resolved' | 'acknowledged' | 'suppressed' | 'pending', defaults to 'firing'. - severity: Possible values 'critical' | 'high' | 'warning' | 'info' | 'low', defaults to 'high' - description Permissions needed for the key-id pair are: 1. AmazonSQSFullAccess: If you want to notify + receive, this is sqs::read + sqs::write scope. 2. AmazonSQSReadOnlyAccess: If you want to just receive, this is the sqs::read scope. You can find these under: IAM > Users > [YOUR_USER] > Permission > Add Permissions > Add Permissions > Attach policies directly > Search for SQS. To create a key-id pair, follow these steps: 1. Search IAM in AWS console, press enter. 2. Go to users 3. Select the user that you want to create the access key for 4. Click on `Create access key` 5. Select `Third party service`, click `Next` 6. Add `Description Tag`, click `Next` 7. Copy/Download the key-id pair. ## Useful Links - [AmazonSQS Boto3 Examples](https://docs.aws.amazon.com/code-library/latest/ug/python_3_sqs_code_examples.html) - [Boto3 SQS Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sqs.html) ================================================ FILE: docs/providers/documentation/anthropic-provider.mdx ================================================ --- title: "Anthropic Provider" description: "The Anthropic Provider allows for integrating Anthropic's Claude language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/anthropic-snippet-autogenerated.mdx'; The Anthropic Provider supports querying Claude language models for prompt-based interactions. ## Outputs Currently, the Claude Provider outputs the response from the model based on the prompt provided. ## Connecting with the Provider To connect to Claude, you'll need to obtain an API Key: 1.
Log in to your Anthropic account at [Anthropic Console](https://console.anthropic.com). 2. Navigate to the **API Keys** section. 3. Click on **Create Key** to generate a new API key for Keep. Use the generated API key in the `authentication` section of your Claude Provider configuration. ================================================ FILE: docs/providers/documentation/appdynamics-provider.mdx ================================================ --- title: "AppDynamics" sidebarTitle: "AppDynamics Provider" description: "AppDynamics provider allows you to get AppDynamics `alerts/actions` via webhook installation" --- import AutoGeneratedSnippet from '/snippets/providers/appdynamics-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Ensure you have an AppDynamics account with the necessary [permissions](https://docs.appdynamics.com/accounts/en/cisco-appdynamics-on-premises-user-management/roles-and-permissions). The basic permissions required are `Account Owner` or `Administrator`. Alternatively, you can create an account by following these [instructions](https://docs.appdynamics.com/accounts/en/global-account-administration/access-management/manage-user-accounts). ## Provider configuration 1. Find your account name [here](https://accounts.appdynamics.com/overview). 2. Get the appId of the AppDynamics instance into which you wish to install the webhook. 3. Determine the Host [here](https://accounts.appdynamics.com/overview). ### Basic Auth authentication 1. Obtain AppDynamics **Username** and **Password** 2. Go to **Basic Auth** tab under **Authentication** section 3. Enter **Username** and **Password** Keep add AppDynamics Username and Password ### Access Token authentication 1. Log in to the **Controller UI** as an **Account Owner** or other roles with the **Administer users**, **groups**, **roles** permission. 2. Go to **Administration** AppDynamics Administration 3. Go to **API Client** tab AppDynamics API Client tab 4.
Click **+ Create** Create new AppDynamics API Client 5. Fill Client **Name** and **Description** 6. Click **Generate Secret** AppDynamics generate API Client Secret This API Client secret is not an authentication token yet 7. Add **Account Owner** and/or **Administrator** roles AppDynamics add API Client roles 8. Click **Save** AppDynamics save API Client 9. Click **Generate Temporary Token** AppDynamics Generate API Client Temporary Access Token This token is not persistent, but since Keep uses it just once to install Webhook, we will use it without oAuth 10. Click **Save** once again This is important. Otherwise, the generated token will not be saved and authentication will fail 11. Copy generated token AppDynamics copy API Client Temporary Access Token 12. Go to **Access Token** tab under **Authentication** section Keep add AppDynamics Access Token 13. Enter Access Token ## Connecting provider 1. Ensure **Install webhook** is checked 2. Click **Connect** ## Webhook Integration Modifications The webhook integration adds Keep as an alert monitor within the AppDynamics instance. It can be found under the "Alerts & Respond" section.
The integration automatically gains access to the following scopes within AppDynamics: - `administrator` - `authenticated` ## Useful Links - [AppDynamics HTTP Action Templates](https://docs.appdynamics.com/appd/24.x/24.3/en/extend-cisco-appdynamics/cisco-appdynamics-apis/configuration-import-and-export-api#id-.ConfigurationImportandExportAPIv24.2-ImportHTTPActionTemplatesintoanAccount) - [AppDynamics Permissions and Roles](https://docs.appdynamics.com/accounts/en/cisco-appdynamics-on-premises-user-management/roles-and-permissions) - [AppDynamics User Accounts](https://docs.appdynamics.com/accounts/en/global-account-administration/access-management/manage-user-accounts) ================================================ FILE: docs/providers/documentation/argocd-provider.mdx ================================================ --- title: "ArgoCD Provider" sidebarTitle: "ArgoCD Provider" description: "The ArgoCD provider enables you to pull topology and Application data." --- import AutoGeneratedSnippet from '/snippets/providers/argocd-snippet-autogenerated.mdx'; ## Overview The **ArgoCD Provider** facilitates pulling Topology and Application data from ArgoCD. ArgoCD Applications are mapped to Keep Services. ArgoCD ApplicationSets are mapped to Keep Applications. ## Connecting with the Provider 1. Obtain the **access token** from your ArgoCD instance by following `Generate auth token` from [ArgoCD's User management docs](https://argo-cd.readthedocs.io/en/stable/operator-manual/user-management/#manage-users). 2. Set the **deployment URL** to your ArgoCD instance's base URL (e.g., `https://localhost:8080`). ## Features The **ArgoCD Provider** supports the following key features: - **Topology**: Configures the Topology using the applications from ArgoCD. - **Applications**: Creates Applications using the ApplicationSets from ArgoCD.
## Useful Links - [ArgoCD API Documentation](https://argo-cd.readthedocs.io/en/stable/developer-guide/api-docs) - [ArgoCD User Management](https://argo-cd.readthedocs.io/en/stable/operator-manual/user-management/#local-usersaccounts) ================================================ FILE: docs/providers/documentation/asana-provider.mdx ================================================ --- title: "Asana" sidebarTitle: "Asana Provider" description: "Asana Provider allows you to create and update tasks in Asana" --- import AutoGeneratedSnippet from '/snippets/providers/asana-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to [Asana](https://app.asana.com/0/developer-console) 2. Click on `Create New Personal Access Token`. 3. Give it a name and click on `Create`. 4. Copy the generated token. This will be used as the `Personal Access Token` in the provider settings. ## Useful Links - [Asana](https://asana.com) ================================================ FILE: docs/providers/documentation/auth0-provider.mdx ================================================ --- title: "Auth0" sidebarTitle: "Auth0 Provider" description: "Auth0 provider allows interaction with Auth0 APIs for authentication and user management." --- import AutoGeneratedSnippet from '/snippets/providers/auth0-snippet-autogenerated.mdx'; ## Connecting with the Provider The Auth0 provider connects to both the **Authentication API** and the **Management API**, enabling functionality such as token-based authentication and user management. Depending on your needs, you can: - Use the **Authentication API** to obtain access tokens, manage user profiles, or handle multi-factor authentication. - Use the **Management API** to automate the configuration of your Auth0 environment, register applications, manage users, and more. 
## Useful Links - [Auth0 API Documentation](https://auth0.com/docs/api) - [Auth0 as an authentication method for keep](https://docs.keephq.dev/deployment/authentication/auth0-auth) ================================================ FILE: docs/providers/documentation/axiom-provider.mdx ================================================ --- title: "Axiom Provider" description: "Axiom Provider is a class that allows to ingest/digest data from Axiom." --- import AutoGeneratedSnippet from '/snippets/providers/axiom-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Axiom, you need to create an API token from your Axiom account. Follow these steps: 1. Log in to your Axiom account. 2. Go to the **API Access** page under the **Settings** menu. 3. Click the **Create Token** button and enter a name for the token. 4. Copy the token value and keep it safe. 5. Add the token value to the `authentication` section in the Axiom Provider configuration. To access datasets, you need to provide the organization ID. You can find your organization ID in the URL of the Axiom web app. For example, if your Axiom URL is `https://app.axiom.co/organizations/1234`, then your organization ID is `1234`. ## Notes - This provider supports a limited set of features provided by the Axiom API. - The `startTime` and `endTime` parameters use ISO-8601 format. - The `query` function returns the response in JSON format from the Axiom API. ## Webhook Integration 1. In Axiom, go to the `Monitors` tab in the Axiom dashboard. 2. Click on `Notifiers` in the left sidebar and create a new notifier. 3. Give it a name and select `Custom Webhook` as kind of notifier. Enter the webhook url as [https://api.keephq.dev/alerts/event/axiom](https://api.keephq.dev/alerts/event/axiom). 4. Follow the below steps to create a new API key in Keep. 5. Go to Keep dashboard and click on the profile icon in the bottom left corner and click `Settings`. 6.
Select `Users and Access` tab and then select `API Keys` tab and create a new API key. 7. Give it a name and select the role as `webhook` and click on `Create API Key`. 8. Copy the API key. 9. Add a new header with key as `X-API-KEY` and create a new API key in Keep and paste it as the value and save the webhook. 10. Go to `Monitors` tab and click on the `Monitors` in the left sidebar and create a new monitor. 11. Create a new monitor and select the notifier created in the previous step as per your requirement. Refer to [Axiom Monitors](https://axiom.co/docs/monitor-data/monitors) to create a new monitor. 12. Save the monitor. Now, you will receive the alerts in Keep. ## Useful Links - [Axiom API Documentation](https://axiom.co/docs/restapi/introduction) ================================================ FILE: docs/providers/documentation/azuremonitoring-provider.mdx ================================================ --- title: "Azure Monitor" sidebarTitle: "Azure Monitor Provider" description: "Azure Monitor provider allows you to get alerts from Azure Monitor via webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/azuremonitoring-snippet-autogenerated.mdx'; ## Overview The Azure Monitor Provider integrates Keep with Azure Monitor, allowing you to receive alerts within Keep's platform. By setting up a webhook in Azure, you can ensure that critical alerts are sent to Keep, allowing for efficient monitoring and response. ## Connecting Azure Monitor to Keep Connecting Azure Monitor to Keep involves creating an Action Group in Azure, adding a webhook action, and configuring the Alert Rule to use the new Action Group. ### Step 1: Navigate to Action Groups 1. Log in to your Azure portal. 2. Navigate to **Monitor** > **Alerts** > **Action groups**. ### Step 2: Create new Action Group 1. Click on **+ Create**. ### Step 3: Fill Action Group details 1. Choose the Subscription and Resource Group. 2. Give the Action Group an indicative name.
### Step 4: Go to "Action" and add Keep as a Webhook ### Step 5: Test Keep Webhook action ### Step 6: View the alert in Keep ## Useful Links - [Azure Monitor alert webhook](https://learn.microsoft.com/en-us/azure/azure-monitor/alerts/alerts-webhooks) - [Azure Monitor alert payload](https://learn.microsoft.com/en-us/azure/azure-monitor/alerts/alerts-payload-samples) - [Azure Monitor action groups](https://learn.microsoft.com/en-us/azure/azure-monitor/alerts/action-groups) ================================================ FILE: docs/providers/documentation/bash-provider.mdx ================================================ --- title: "Bash" sidebarTitle: "Bash Provider" description: "Bash provider allows executing Bash commands in a workflow, with a limitation for cloud execution." --- import AutoGeneratedSnippet from '/snippets/providers/bash-snippet-autogenerated.mdx'; ## Connecting with the Provider The Bash provider allows you to run Bash commands or scripts in your workflow. You can pass in any valid Bash command, and it will be executed in a local environment. ### **Cloud Limitation** This provider is disabled for cloud environments and can only be used in local or self-hosted environments. ## Useful Links - [Bash Documentation](https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html) ================================================ FILE: docs/providers/documentation/bigquery-provider.mdx ================================================ --- title: "BigQuery" sidebarTitle: "BigQuery Provider" description: "BigQuery provider allows interaction with Google BigQuery for querying and managing datasets." --- import AutoGeneratedSnippet from '/snippets/providers/bigquery-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Create a Google Cloud project and enable the BigQuery API. 2. Create a service account in your Google Cloud project and download the JSON key file. 3. Share the necessary datasets with the service account. 4.
Configure your provider using the `service_account_key`, `project_id`, and `dataset`. ================================================ FILE: docs/providers/documentation/centreon-provider.mdx ================================================ --- title: "Centreon" sidebarTitle: "Centreon Provider" description: "Centreon allows you to monitor your infrastructure with ease." --- import AutoGeneratedSnippet from '/snippets/providers/centreon-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Centreon can be SaaS or On-premises. You need to have an instance of Centreon running. 2. Go to Administration > API Tokens and create a new token for an admin user. 3. Use the URL of your Centreon instance and the API token to configure the provider. ## Useful Links - [Centreon](https://www.centreon.com/) ## Note - Centreon only supports the following [host state](https://docs.centreon.com/docs/api/rest-api-v1/#realtime-information) (UP = 0, DOWN = 2, UNREA = 3) ================================================ FILE: docs/providers/documentation/checkly-provider.mdx ================================================ --- title: 'Checkly' sidebarTitle: 'Checkly Provider' description: 'Checkly allows you to receive alerts from Checkly using API endpoints as well as webhooks' --- import AutoGeneratedSnippet from '/snippets/providers/checkly-snippet-autogenerated.mdx'; ## Connecting Checkly to Keep 1. Open Checkly dashboard and click on your profile picture in the top right corner. 2. Click on `User Settings`. 3. Open the `API Keys` tab and click on `Create API Key` to generate a new API key. 4. Copy the API key. 5. Open `General` tab under Account Settings and copy the `Account ID`. 6. Go to Keep, add Checkly as a provider and enter the API key and Account ID in the respective fields and click on `Connect`. ## Webhooks Integration 1. Open Checkly dashboard and open `Alerts` tab in the left sidebar. 2. Click on `Add more channels` 3.
Select `Webhook` from the list of available channels. 4. Enter a name for the webhook, select the method as `POST` 5. Enter [https://api.keephq.dev/alerts/event/checkly](https://api.keephq.dev/alerts/event/checkly) as the URL. 6. Copy the below snippet and paste in the `Body` of Webhook. Refer to the screenshot below for reference. ```json { "event": "{{ALERT_TITLE}}", "alert_type": "{{ALERT_TYPE}}", "check_name": "{{CHECK_NAME}}", "group_name": "{{GROUP_NAME}}", "check_id": "{{CHECK_ID}}", "check_type": "{{CHECK_TYPE}}", "check_result_id": "{{CHECK_RESULT_ID}}", "check_error_message": "{{CHECK_ERROR_MESSAGE}}", "response_time": "{{RESPONSE_TIME}}", "api_check_response_status_code": "{{API_CHECK_RESPONSE_STATUS_CODE}}", "api_check_response_status_text": "{{API_CHECK_RESPONSE_STATUS_TEXT}}", "run_location": "{{RUN_LOCATION}}", "ssl_days_remaining": "{{SSL_DAYS_REMAINING}}", "ssl_check_domain": "{{SSL_CHECK_DOMAIN}}", "started_at": "{{STARTED_AT}}", "tags": "{{TAGS}}", "link": "{{RESULT_LINK}}", "region": "{{REGION}}", "uuid": "{{$UUID}}" } ``` 7. Go to Headers tab and add a new header with key as `X-API-KEY` and create a new API key in Keep and paste it as the value and save the webhook. 8. Follow the below steps to create a new API key in Keep. 9. Go to Keep dashboard and click on the profile icon in the bottom left corner and click `Settings`. 10. Select `Users and Access` tab and then select `API Keys` tab and create a new API key. 11. Give it a name and select the role as `webhook` and click on `Create API Key`. 12. Use the generated API key in the `X-API-KEY` header of the webhook created in Checkly. ## Useful Links - [Checkly Website](https://www.checklyhq.com/) ================================================ FILE: docs/providers/documentation/checkmk-provider.mdx ================================================ --- title: 'Checkmk' sidebarTitle: 'Checkmk Provider' description: 'Checkmk provider allows you to get alerts from Checkmk via webhooks.'
--- import AutoGeneratedSnippet from '/snippets/providers/checkmk-snippet-autogenerated.mdx'; ## Overview The Checkmk provider enables seamless integration between Keep and Checkmk. It allows you to get alerts from Checkmk to Keep via webhooks making it easier to manage your infrastructure and applications in one place. ## Connecting Checkmk to Keep To connect Checkmk to Keep, you need to configure it as a webhook from Checkmk. Follow the steps below to set up the integration: 1. The Keep webhook script needs to be installed on the Checkmk server. 2. You can download the Keep webhook script using the following command: ```bash wget -O webhook-keep.py https://github.com/keephq/keep/blob/main/keep/providers/checkmk_provider/webhook-keep.py?raw=true ``` 3. Copy the downloaded script to the following path on the Checkmk server: If you are using Checkmk Docker container, then copy it to the following path according to your docker volume mapping: ```bash cp webhook-keep.py /omd/sites//local/share/check_mk/notifications/webhook-keep.py cd /omd/sites//local/share/check_mk/notifications ``` If you are using Checkmk installed on the server, then copy it to the following path: ```bash cp webhook-keep.py ~/local/share/check_mk/notifications/webhook-keep.py cd ~/local/share/check_mk/notifications ``` 4. Make the script executable: ```bash chmod +x webhook-keep.py ``` 5. Now go to the Checkmk web interface and navigate to Setup 6. Click on Notifications under Events 6. Click on Add rule 7. In the Notification method field, select "webhook-keep" as the notification method. 8. Configure the Rule properties, Contact selections, and Conditions according to your requirements. 9. The first parameter is the Webhook URL of Keep which is `https://api.keephq.dev/alerts/event/checkmk`. 10. The second parameter is the API Key of Keep which you can generate in the [Keep settings](https://platform.keephq.dev/settings?selectedTab=users&userSubTab=api-keys). 11.
Click on Save to save the configuration. 12. Now you will start receiving alerts from Checkmk to Keep via webhooks when the configured conditions are met. ## Useful Links - [Checkmk](https://checkmk.com/) ================================================ FILE: docs/providers/documentation/cilium-provider.mdx ================================================ --- title: "Cilium" sidebarTitle: "Cilium Provider" description: "Cilium provider enables topology discovery by analyzing network flows between services in your Kubernetes cluster using Hubble." --- import AutoGeneratedSnippet from '/snippets/providers/cilium-snippet-autogenerated.mdx'; ## Overview Cilium provider is in Beta and is not working with authentication yet. The current way to pull topology data from your kubernetes cluster, is to run: ```bash # hubble-relay usually installed at kube-system, but it depends on your cluster. kubectl port-forward -n kube-system svc/hubble-relay 4245:80 ``` and then use `localhost:4245` to pull topology data. If you need help with connecting Cilium provider, [reach out](https://slack.keephq.dev). The Cilium provider leverages Hubble's network flow data to automatically discover service dependencies and build a topology map of your Kubernetes applications. ## Authentication Parameters | Parameter | Description | Example | |-----------|-------------|----------| | `cilium_base_endpoint` | The base endpoint of the Cilium Hubble relay | `localhost:4245` | ## Outputs The provider returns topology information including: - Service names and their dependencies - Namespace information - Pod labels and cluster metadata - Network-based relationships between services ## Service Discovery Logic The provider identifies services using the following hierarchy: 1. Workload name (if available) 2. Kubernetes labels (`k8s:app=` or `k8s:app.kubernetes.io/name=`) 3. 
Pod name (stripped of deployment suffixes) ## Requirements - A running Kubernetes cluster with Cilium installed - Hubble enabled and accessible via gRPC - Network visibility (flow logs) enabled in Cilium ## Limitations - Only captures active network flows between pods - Service discovery is limited to pods with proper Kubernetes labels - Requires direct access to the Hubble relay endpoint ## Useful Links - [Cilium Documentation](https://docs.cilium.io/) - [Hubble Documentation](https://docs.cilium.io/en/stable/hubble/) - [Kubernetes Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) ## Google Kubernetes Engine specific If you are using a GKE cluster, you cannot connect Keep to the Google-managed hubble-relay directly because: - hubble-relay operates only in secure mode, - hubble-relay requires client certificate authentication. However, Keep does not currently support these features. To work around this, you can add an NGINX Pod that listens on a plaintext HTTP port and proxies requests to hubble-relay secure port using hubble-relay certificates. You need a GKE cluster with [dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2) . [Dataplane v2 observability](https://cloud.google.com/kubernetes-engine/docs/how-to/configure-dpv2-observability) must be enabled. 
Here is an example of running a plaintext NGINX proxy: ```yaml --- apiVersion: v1 kind: ConfigMap metadata: name: hubble-relay-insecure-nginx namespace: gke-managed-dpv2-observability data: nginx.conf: | user nginx; worker_processes auto; error_log /dev/stdout notice; pid /var/run/nginx.pid; events { worker_connections 1024; } http { log_format main '$remote_addr - $remote_user [$time_local] "$request" ' '$status $body_bytes_sent "$http_referer" ' '"$http_user_agent" "$http_x_forwarded_for"'; access_log /dev/stdout main; server { listen 80; http2 on; location / { grpc_pass grpcs://hubble-relay.gke-managed-dpv2-observability.svc.cluster.local:443; grpc_ssl_certificate /etc/nginx/certs/client.crt; grpc_ssl_certificate_key /etc/nginx/certs/client.key; grpc_ssl_trusted_certificate /etc/nginx/certs/hubble-relay-ca.crt; } } } --- kind: Deployment apiVersion: apps/v1 metadata: name: hubble-relay-insecure namespace: gke-managed-dpv2-observability labels: k8s-app: hubble-relay-insecure app.kubernetes.io/name: hubble-relay-insecure app.kubernetes.io/part-of: cilium spec: replicas: 1 selector: matchLabels: k8s-app: hubble-relay-insecure template: metadata: labels: k8s-app: hubble-relay-insecure app.kubernetes.io/name: hubble-relay-insecure app.kubernetes.io/part-of: cilium spec: securityContext: fsGroup: 1000 seccompProfile: type: RuntimeDefault containers: - name: frontend image: nginx:alpine ports: - name: http containerPort: 80 volumeMounts: - name: hubble-relay-insecure-nginx-conf mountPath: /etc/nginx/ readOnly: true - name: hubble-relay-client-certs mountPath: /etc/nginx/certs/ readOnly: true volumes: - configMap: name: hubble-relay-insecure-nginx name: hubble-relay-insecure-nginx-conf - name: hubble-relay-client-certs projected: defaultMode: 0400 sources: - secret: name: hubble-relay-client-certs items: - key: ca.crt path: hubble-relay-ca.crt - key: tls.crt path: client.crt - key: tls.key path: client.key --- kind: Service apiVersion: v1 metadata: name: 
hubble-relay-insecure namespace: gke-managed-dpv2-observability labels: k8s-app: hubble-relay-insecure app.kubernetes.io/name: hubble-relay-insecure app.kubernetes.io/part-of: cilium spec: type: ClusterIP selector: k8s-app: hubble-relay-insecure ports: - name: http port: 80 targetPort: 80 ``` Now you can connect Keep with google-managed hubble-relay by adding Cilium provider using `hubble-relay-insecure.gke-managed-dpv2-observability:80` address. ================================================ FILE: docs/providers/documentation/clickhouse-provider.mdx ================================================ --- title: 'ClickHouse' sidebarTitle: 'ClickHouse Provider' description: 'ClickHouse provider allows you to interact with ClickHouse database.' --- import AutoGeneratedSnippet from '/snippets/providers/clickhouse-snippet-autogenerated.mdx'; ## Overview ClickHouse is an open-source column-oriented DBMS for online analytical processing that allows users to generate analytical reports using SQL queries in real-time. ## Connecting with the ClickHouse provider 1. Obtain the required authentication parameters. 2. Add ClickHouse provider to your keep account and configure with the above authentication parameters. ## Useful Links - [ClickHouse](https://clickhouse.com/) - [ClickHouse Statements](https://clickhouse.com/docs/en/sql-reference/statements/) ================================================ FILE: docs/providers/documentation/cloudwatch-provider.mdx ================================================ --- title: "CloudWatch" sidebarTitle: "CloudWatch Provider" description: "CloudWatch provider enables seamless integration with AWS CloudWatch for alerting and monitoring, directly pushing alarms into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/cloudwatch-snippet-autogenerated.mdx'; ## Overview The CloudWatch Provider offers a direct integration with AWS CloudWatch, enabling Keep users to receive CloudWatch alarms within the Keep platform. 
This integration centralizes the monitoring and alerting capabilities, allowing for timely responses to changes in the infrastructure or application health. ### Key Features: - **Webhook Integration**: Facilitates automatic subscription to AWS SNS topics linked with CloudWatch alarms, ensuring that Keep is notified of all relevant alarms. - **Support for Custom SNS Topics**: Allows the use of both pre-existing SNS topics and the specification of custom SNS topics for alarm notifications. - **Broad Monitoring Scope**: Utilizes CloudWatch's comprehensive alarm system to monitor application and infrastructure health. - **Adaptable Authentication**: Accommodates both permanent and temporary AWS credentials to suit various security and operational requirements. ## Connecting with the Provider To integrate CloudWatch with Keep, you'll need the following: - An AWS account with permissions to access CloudWatch and SNS services. - A configured Keep account with API access. - Appropriate AWS IAM permissions for the CloudWatch provider. ## Setting Up the Integration For a seamless setup process, ensure your AWS IAM roles are properly configured with the necessary permissions for CloudWatch and SNS access. ### Steps: 1. **Configure AWS IAM Roles**: Ensure the IAM role used by the CloudWatch provider has permissions for `cloudwatch:DescribeAlarms`, `cloudwatch:PutMetricAlarm`, `sns:ListSubscriptionsByTopic`, and other relevant actions. 2. **Specify Authentication Details**: In the Keep platform, enter the AWS Access Key, Secret, and Region details in the CloudWatch provider configuration. 3. **Set Up SNS Topic (Optional)**: If using a custom SNS topic, specify its ARN or name in the provider configuration. Keep will use this topic to listen for alarm notifications. 4. **Activate the Provider**: Finalize the setup in Keep to start receiving CloudWatch alarms. ## Troubleshooting - Ensure the AWS credentials provided have the correct permissions and are not expired. 
- Verify that the SNS topics are correctly configured to send notifications to Keep. - Check the CloudWatch alarms to ensure they are active and correctly configured to trigger under the desired conditions. ## Webhook Integration Modifications The webhook integration for CloudWatch adds Keep as a subscriber to the SNS topics associated with CloudWatch alarms. This integration allows Keep to receive notifications for all alarms triggered within the AWS environment. The integration automatically gains access to the following scopes within CloudWatch: - `cloudwatch:DescribeAlarms` ## Useful Links - [AWS CloudWatch Documentation](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html) - [AWS SNS Documentation](https://docs.aws.amazon.com/sns/latest/dg/welcome.html) ================================================ FILE: docs/providers/documentation/console-provider.mdx ================================================ --- title: "Console" sidebarTitle: "Console Provider" description: "Console provider is sort of a mock provider that projects given alert message to the console." 
--- import AutoGeneratedSnippet from '/snippets/providers/console-snippet-autogenerated.mdx'; ## Inputs - message: The alert message to print to the console ## Outputs This provider has no outputs ## Authentication Parameters This provider has no authentication ## Connecting with the Provider This provider doesn't require any connection ## Notes _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Useful Links _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Example ```python config = { "description": "Console Output Provider", "authentication": {}, } provider = ProvidersFactory.get_provider( provider_id='mock', provider_type="console", provider_config=config ) provider.notify( message="Simple alert showing context with name: {name}".format( name="John Doe" ) ) ``` ![](/images/console_provider_example.png) ================================================ FILE: docs/providers/documentation/coralogix-provider.mdx ================================================ --- title: 'Coralogix' sidebarTitle: 'Coralogix Provider' description: 'Coralogix provider allows you to send alerts from Coralogix to Keep using webhooks.' --- import AutoGeneratedSnippet from '/snippets/providers/coralogix-snippet-autogenerated.mdx'; ## Overview Coralogix is a modern observability platform that delivers comprehensive visibility into all your logs, metrics, traces and security events with end-to-end monitoring. ## Connecting Coralogix to Keep To connect Coralogix to Keep, you need to configure it as a webhook from Coralogix. Follow the steps below to set up the integration: 1. From the Coralogix toolbar, navigate to Data Flow > Outbound Webhooks. 2. In the Outbound Webhooks section, click Generic Webhook. 3. Click Add New. 4. Enter a webhook name and set the URL to `https://api.keephq.dev/alerts/event/coralogix`. 5. Select HTTP method (POST). 6.
Generate an API key with webhook role from the [Keep settings](https://platform.keephq.dev/settings?selectedTab=api-key). Copy the API key and paste it in the request header in the next step. 7. Add a request header with the key "x-api-key" and the API key as the value in the Coralogix webhook configuration. 8. Edit the body of the messages that will be sent when the webhook is triggered (optional). 9. Save the configuration. ## Useful Links - [Coralogix Website](https://coralogix.com/) ================================================ FILE: docs/providers/documentation/dash0-provider.mdx ================================================ --- title: 'Dash0' sidebarTitle: 'Dash0 Provider' description: 'Dash0 provider allows you to get events from Dash0 using webhooks.' --- import AutoGeneratedSnippet from '/snippets/providers/dash0-snippet-autogenerated.mdx'; ## Overview Dash0 is modern OpenTelemetry Native Observability, built on CNCF Open Standards such as PromQL, Perses and OTLP with full cost control. ## Connecting Dash0 to Keep To connect Dash0 to Keep, you need to create a webhook in Dash0. 1. Go to the Dash0 dashboard and click on Organization settings. 2. Click on `Notification Channels` and create a new notification channel of type `Webhook`. 3. Give a name to the webhook and enter [https://api.keephq.dev/alerts/event/dash0](https://api.keephq.dev/alerts/event/dash0) as the URL. 4. Follow the steps below to create a new API key in Keep. 5. Go to the Keep dashboard, click on the profile icon in the bottom-left corner, and click `Settings`. 6. Select the `Users and Access` tab, then select the `API Keys` tab and create a new API key. 7. Give it a name, select the role `webhook`, and click on `Create API Key`. 8. Copy the API key. 9. Add a new request header with key `X-API-KEY` and value as the API key copied from Keep and save the webhook. 10.
Go to `Notifications` under `Alerting` and create a new notification rule if required or change the existing notification rule to use the webhook created. 11. Go to `Checks` under `Alerting` and create a new check or edit an existing check to use the notification rule created. 12. Now you will start receiving events in Keep from Dash0. ## Useful Links - [Dash0](https://dash0.com/) ================================================ FILE: docs/providers/documentation/databend-provider.mdx ================================================ --- title: 'Databend' sidebarTitle: 'Databend Provider' description: 'Databend provider allows you to query databases' --- import AutoGeneratedSnippet from '/snippets/providers/databend-snippet-autogenerated.mdx'; ## Overview Databend is an open-source, serverless, cloud-native data lakehouse built on object storage with a decoupled storage and compute architecture. It delivers exceptional performance and rapid elasticity, aiming to be the open-source alternative to Snowflake. ## Useful Links - [Databend](https://www.databend.com/) ================================================ FILE: docs/providers/documentation/datadog-provider.mdx ================================================ --- title: "Datadog" sidebarTitle: "Datadog Provider" description: "Datadog provider allows you to query Datadog metrics and logs for monitoring and analytics." --- import AutoGeneratedSnippet from '/snippets/providers/datadog-snippet-autogenerated.mdx'; ## Connecting with the Provider ### API Key To obtain the Datadog API key, follow these steps: 1. Log in to your Datadog account. 2. Navigate to the "Integrations" section. 3. Click on the "API" tab. 4. Generate a new API Key. ### App Key To obtain the Datadog App Key, follow these steps: 1. Log in to your Datadog account. 2. Navigate to the "Integrations" section. 3. Click on the "API" tab. 4. Generate a new App Key or use an existing one. 
## Fingerprinting Fingerprints in Datadog are calculated based on the `groups` and `monitor_id` fields of an incoming/pulled event. ## Notes _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Useful Links - [Datadog API Documentation](https://docs.datadoghq.com/api/) - [Datadog Query Language](https://docs.datadoghq.com/dashboards/querying/) ## Webhook Integration Modifications The webhook integration adds Keep as a monitor within Datadog. It can be found under the "Monitors" section. The integration automatically gains access to the following scopes within Datadog: - `monitors_read` - `monitors_write` - `create_webhooks` ================================================ FILE: docs/providers/documentation/deepseek-provider.mdx ================================================ --- title: "DeepSeek Provider" description: "The DeepSeek Provider enables integration of DeepSeek's language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/deepseek-snippet-autogenerated.mdx'; The DeepSeek Provider supports querying DeepSeek language models for prompt-based interactions. ## Connecting with the Provider To connect to DeepSeek, you'll need to obtain an API Key: 1. Sign up for an account at [DeepSeek](https://platform.deepseek.com) 2. Navigate to your account settings 3. Generate an API key for Keep Use the generated API key in the `authentication` section of your DeepSeek Provider configuration. ================================================ FILE: docs/providers/documentation/discord-provider.mdx ================================================ --- title: "Discord" sidebarTitle: "Discord Provider" description: "Discord provider is a provider that allows to send notifications to Discord" --- import AutoGeneratedSnippet from '/snippets/providers/discord-snippet-autogenerated.mdx'; ## Connecting with the Provider - Open the Discord server where you want to create the webhook. 
- Click on the settings icon next to the server name, and select "Server Settings." - In the left-hand menu, click on "Integrations," and then click on "Webhooks." - Click the "Create Webhook" button, and give your webhook a name. ## Useful Links - https://discord.com/developers/docs/resources/webhook#execute-webhook ================================================ FILE: docs/providers/documentation/dynatrace-provider.mdx ================================================ --- title: "Dynatrace" sidebarTitle: "Dynatrace Provider" description: "Dynatrace provider allows integration with Dynatrace for monitoring, alerting, and collecting metrics." --- import AutoGeneratedSnippet from '/snippets/providers/dynatrace-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Log in to your Dynatrace account and navigate to "Settings" → "Integration" → "Dynatrace API." 2. Generate an API token with appropriate permissions (e.g., Read metrics). 3. Get your environment's Dynatrace URL. 4. Configure the Dynatrace provider using the API token and Dynatrace URL. ## Useful Links - [Dynatrace API Documentation](https://docs.dynatrace.com/docs/dynatrace-api) ================================================ FILE: docs/providers/documentation/eks-provider.mdx ================================================ --- title: "EKS Provider" description: "EKS provider integrates with AWS EKS and lets you interact with Kubernetes clusters hosted on EKS." --- import AutoGeneratedSnippet from '/snippets/providers/eks-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Amazon EKS, follow these steps: 1. Log in to your [AWS Console](https://aws.amazon.com/) 2. Create an IAM user with EKS permissions: ```bash aws iam create-user --user-name eks-user ``` 3.
Attach required policies: ```bash aws iam attach-user-policy --user-name eks-user --policy-arn arn:aws:iam::aws:policy/AmazonEKSClusterPolicy aws iam attach-user-policy --user-name eks-user --policy-arn arn:aws:iam::aws:policy/AmazonEKSServicePolicy ``` 4. Create access keys ```bash aws iam create-access-key --user-name eks-user ``` You should get: ``` { "AccessKey": { "AccessKeyId": "AKIAXXXXXXXXXXXXXXXX", "SecretAccessKey": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "Status": "Active" } } ``` The `AccessKeyId` is your `access_key` and `SecretAccessKey` is your `secret_access_key`. 5. Note your cluster name and region from the EKS console or using: ```bash aws eks list-clusters --region ``` ## Required Permissions The AWS IAM user needs these permissions: 1. **eks:DescribeCluster** 2. **eks:ListClusters** Additional permissions for specific operations: 3. **eks:AccessKubernetesApi** for pod/deployment operations 4. **eks:UpdateCluster** for scaling operations | Command | AWS IAM Permissions | |---------|-------------------| | `get_pods` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `get_pvc` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `get_node_pressure` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `get_deployment` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `scale_deployment` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `exec_command` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `restart_pod` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | | `get_pod_logs` | `eks:DescribeCluster`
`eks:AccessKubernetesApi` | ================================================ FILE: docs/providers/documentation/elastic-provider.mdx ================================================ --- title: "Elastic" sidebarTitle: "Elastic Provider" description: "Elastic provider is a provider used to query Elasticsearch (tested with elastic.co)" --- import AutoGeneratedSnippet from '/snippets/providers/elastic-snippet-autogenerated.mdx'; ## Connecting with the Provider ### API Key To obtain the Elastic API key, follow these steps: 1. Log in to your elastic.co account 2. Go to the "Elasticsearch Service" section 3. Click on the "API Key" button 4. Generate a new API Key ### Cloud ID To obtain the Elastic Cloud ID, follow these steps: 1. Log in to your elastic.co account 2. Go to the "Elasticsearch Service" section 3. Find the "Cloud ID" on the Overview page. ================================================ FILE: docs/providers/documentation/flashduty-provider.mdx ================================================ --- title: "Flashduty" sidebarTitle: "Flashduty Provider" description: "Flashduty docs" --- import AutoGeneratedSnippet from '/snippets/providers/flashduty-snippet-autogenerated.mdx'; ![Flashduty](/images/flashduty_1.png) ## Integration Key Generation Flashduty uses an integration key as its authentication method. 1. Enter the Flashduty console, select Integration Center => Alert Events to enter the integration selection page ![Flashduty](/images/flashduty_2.png) 2. Select Keep integration 3. Define a name for the current integration 4. Configure default routing and select the corresponding channel 5. Copy the integration key to Keep 6. Complete the integration configuration ![Flashduty](/images/flashduty_3.png) ## Useful Links - https://docs.flashcat.cloud/en/flashduty/keep-alert-integration-guide?nav=01JCQ7A4N4WRWNXW8EWEHXCMF5 ================================================ FILE: docs/providers/documentation/fluxcd-provider.mdx ================================================
--- title: "Flux CD" sidebarTitle: "Flux CD Provider" description: "Flux CD Provider enables integration with Flux CD for GitOps topology and alerts." --- import AutoGeneratedSnippet from '/snippets/providers/fluxcd-snippet-autogenerated.mdx'; ## Overview Flux CD is a GitOps tool for Kubernetes that provides continuous delivery through automated deployment, monitoring, and management of applications. This provider allows you to integrate Flux CD with Keep to get a single pane of glass for monitoring your GitOps deployments. ## Features ### Topology The Flux CD provider pulls topology data from the following Flux CD resources: - GitRepositories - HelmRepositories - HelmCharts - OCIRepositories - Buckets - Kustomizations - HelmReleases The topology shows the relationships between these resources, allowing you to visualize the GitOps deployment process. Resources are categorized as: - **Source**: GitRepositories, HelmRepositories, OCIRepositories, Buckets - **Deployment**: Kustomizations, HelmReleases ### Alerts The Flux CD provider gets alerts from two sources: 1. Kubernetes events related to Flux CD controllers 2. Status conditions of Flux CD resources (GitRepositories, Kustomizations, HelmReleases) Alerts include: - Failed GitRepository operations - Failed Kustomization operations - Failed HelmRelease operations - Non-ready resources Alert severity is determined based on: - **Critical**: Events with "failed", "error", "timeout", "backoff", or "crash" in the reason - **High**: Other warning events - **Info**: Normal events ## Connecting with the Provider The Flux CD provider supports multiple authentication methods: 1. **Kubeconfig file content** (recommended for external access) 2. **API server URL and token** 3. **In-cluster configuration** (when running inside a Kubernetes cluster) 4. 
**Default kubeconfig file** (from ~/.kube/config) ### Using Kubeconfig ```yaml apiVersion: keep.sh/v1 kind: Provider metadata: name: flux-cd spec: type: fluxcd authentication: kubeconfig: | apiVersion: v1 kind: Config clusters: - name: my-cluster cluster: server: https://kubernetes.example.com certificate-authority-data: BASE64_ENCODED_CA_CERT users: - name: my-user user: token: MY_TOKEN contexts: - name: my-context context: cluster: my-cluster user: my-user current-context: my-context context: my-context namespace: flux-system ``` ### Using API Server and Token ```yaml apiVersion: keep.sh/v1 kind: Provider metadata: name: flux-cd spec: type: fluxcd authentication: api-server: https://kubernetes.example.com token: MY_TOKEN namespace: flux-system ``` > Note: Both `api-server` and `api_server` formats are supported for backward compatibility. ### Using In-Cluster Configuration ```yaml apiVersion: keep.sh/v1 kind: Provider metadata: name: flux-cd spec: type: fluxcd authentication: namespace: flux-system ``` ## Comparison with ArgoCD Provider Keep supports both Flux CD and ArgoCD for GitOps deployments. 
Here's a comparison of the two providers: | Feature | Flux CD | ArgoCD | |---------|---------|--------| | Topology | ✅ | ✅ | | Alerts | ✅ | ✅ | | Resource Types | GitRepositories, HelmRepositories, Kustomizations, HelmReleases | Applications, Projects | | Authentication | Kubeconfig, API Server, In-Cluster | Username/Password, Token | | Deployment Model | Kubernetes Controllers | Server + Controllers | | UI Integration | No (CLI only) | Yes (Web UI) | ## Related Resources - [Flux CD Documentation](https://fluxcd.io/docs/) - [Flux CD GitHub Repository](https://github.com/fluxcd/flux2) - [Keep Documentation](https://docs.keephq.dev) ================================================ FILE: docs/providers/documentation/gcpmonitoring-provider.mdx ================================================ --- title: "GCP Monitoring" sidebarTitle: "GCP Monitoring Provider" description: "GCP Monitoring provider allows you to get alerts and logs from GCP Monitoring via webhooks and log queries." --- import AutoGeneratedSnippet from '/snippets/providers/gcpmonitoring-snippet-autogenerated.mdx'; ## Overview The GCP Monitoring Provider enables seamless integration between Keep and GCP Monitoring, allowing alerts from GCP Monitoring to be directly sent to Keep through webhook configurations. In addition to alerts, the provider now supports querying log entries from GCP Logging, enabling a comprehensive view of alerts and associated logs within Keep's platform. ## Connecting GCP Monitoring to Keep ### Alert Integration via Webhook To connect GCP Monitoring alerts to Keep, configure a webhook as a notification channel in GCP Monitoring and link it to the desired alert policy. ### Step 1: Access Notification Channels Log in to the Google Cloud Platform console. Navigate to **Monitoring > Alerting > Notification channels**. ### Step 2: Add a New Webhook Within the Webhooks section, click on **ADD NEW**. 
### Step 3: Configure the Webhook In the Endpoint URL field, enter the webhook URL provided by Keep. - **Display Name**: keep-gcpmonitoring-webhook-integration - Enable **Use HTTP Basic Auth** and input the following credentials: - **Auth Username**: `api_key` - **Auth Password**: `%YOURAPIKEY%` ### Step 4: Save the Webhook Configuration - Click **Save** to store the webhook configuration. ### Step 5: Associate the Webhook with an Alert Policy Navigate to the alert policy you wish to send notifications from to Keep. - Click **Edit**. - Under "Notifications and name," find the **Notification Channels** section and select the `keep-gcpmonitoring-webhook-integration` channel you created. - Save the changes by clicking on **SAVE POLICY**. ### Step 6: Review the Alert in Keep Once the setup is complete, alerts from GCP Monitoring will start appearing in Keep. ## Log Query Integration The GCP Monitoring Provider also supports querying logs from GCP Logging, allowing you to fetch log entries based on specific filters. This is helpful for enriching alert data with related logs or for monitoring specific events in Keep. ### Authentication Requirements To enable log querying, you need to provide a service account JSON file with the `logs.viewer` role. This service account should be configured in the `authentication` section of your GCP Monitoring Provider configuration. ### Querying Logs The provider’s `query` function supports filtering logs based on criteria such as resource type, severity, or specific keywords. You can specify a time range for querying logs using `timedelta_in_days`, and control the number of entries with `page_size`. #### Example Usage Here’s an example of how you might use the provider to query log entries: ```python query(filter='resource.type="cloud_run_revision" AND severity="ERROR"', timedelta_in_days=1) ``` This will return logs of severity “ERROR” related to Cloud Run revisions from the past day. 
#### Post Installation Validation To validate both alerts and logs, follow these steps: 1. Alert Validation: Test the webhook by triggering an alert in GCP Monitoring and confirm it appears in Keep. 2. Log Query Validation: Execute a simple log query and verify that log entries are returned as expected. ### Useful Links - [GCP Monitoring Notification Channels](https://cloud.google.com/monitoring/support/notification-options) - [GCP Monitoring Alerting](https://cloud.google.com/monitoring/alerts) ================================================ FILE: docs/providers/documentation/gemini-provider.mdx ================================================ --- title: "Gemini Provider" description: "The Gemini Provider allows for integrating Google's Gemini language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/gemini-snippet-autogenerated.mdx'; The Gemini Provider supports querying Gemini language models for prompt-based interactions. ## Connecting with the Provider To connect to Gemini, you'll need to obtain an API Key: 1. Go to [Google AI Studio](https://makersuite.google.com/app/apikey). 2. Click on **Create API Key** or use an existing one. 3. Copy your API key for Keep. Use the generated API key in the `authentication` section of your Gemini Provider configuration. ================================================ FILE: docs/providers/documentation/github-provider.mdx ================================================ --- title: "GitHub" sidebarTitle: "GitHub Provider" description: "GitHub provider allows integration with GitHub for managing repositories, issues, pull requests, and more." --- import AutoGeneratedSnippet from '/snippets/providers/github-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to your GitHub account and navigate to **Settings > Developer Settings > Personal Access Tokens**. 2. Generate a token with the required permissions (e.g., `repo`, `workflow`, etc.). 3. 
Copy the token and provide it as `github_token` in the provider configuration. ## Useful Links - [GitHub REST API Documentation](https://docs.github.com/en/rest?apiVersion=2022-11-28) ================================================ FILE: docs/providers/documentation/github_workflows_provider.mdx ================================================ --- title: "Github Workflows" sidebarTitle: "Github Workflows Provider" description: "GithubWorkflowProvider is a provider that interacts with Github Workflows API." --- import AutoGeneratedSnippet from '/snippets/providers/github_workflows-snippet-autogenerated.mdx'; ## Connecting with the Provider Create your personal access token (classic) in GitHub: - In the upper-right corner of any page, click your profile photo, then click **Settings**. - In the left sidebar, click **Developer settings**. - In the left sidebar, under Personal access tokens, click **Tokens (classic)**. - Select **Generate new token**, then click **Generate new token (classic)**. - In the "Note" field, give your token a descriptive name. - To give your token an expiration, select **Expiration**, then choose a default option or click **Custom** to enter a date. - Select the scopes you'd like to grant this token. - Click **Generate token**. - Optionally, to copy the new token to your clipboard, click the copy button. See below for more info.
## Useful Links - [Workflows](https://docs.github.com/en/rest/actions/workflows) - [Workflows runs](https://docs.github.com/en/rest/actions/workflow-runs) - [Workflows jobs](https://docs.github.com/en/rest/actions/workflow-jobs) - [Managing your personal access tokens](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) ================================================ FILE: docs/providers/documentation/gitlab-provider.mdx ================================================ --- title: "GitLab Provider" sidebarTitle: "GitLab Provider" description: "GitLab provider is a provider used for creating issues in GitLab" --- import AutoGeneratedSnippet from '/snippets/providers/gitlab-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to [Personal Access Token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#create-a-personal-access-token) to see how to create a personal_access_token. 2. Get `host`, eg: if you're using Cloud GitLab, use: `https://gitlab.com` or use your `host` if you're using onPrem. ## Useful Links - [GitLab PAT](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#create-a-personal-access-token) - [GitLab Create New Issue](https://docs.gitlab.com/ee/api/issues.html#new-issue) - [GitLab Scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) ================================================ FILE: docs/providers/documentation/gitlabpipelines-provider.mdx ================================================ --- title: "GitLab Pipelines" sidebarTitle: "GitLab Pipelines Provider" description: "GitLab Pipelines Provider is a provider that interacts with GitLab Pipelines API." --- import AutoGeneratedSnippet from '/snippets/providers/gitlabpipelines-snippet-autogenerated.mdx'; ## Connecting with the Provider Create your personal access token in GitLab - On the left sidebar, select your avatar. 
- Select **Edit profile**. - On the left sidebar, select **Access Tokens**. - Select **Add new token**. - Enter a **name** and **expiry date** for the token. - Select the desired scopes. - Select **Create personal access token**. ## Useful Links - [GitLab PAT](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#create-a-personal-access-token) - [GitLab Pipelines API](https://docs.gitlab.com/ee/api/pipelines.html) ================================================ FILE: docs/providers/documentation/gke-provider.mdx ================================================ --- title: "Google Kubernetes Engine" sidebarTitle: "Google Kubernetes Engine Provider" description: "Google Kubernetes Engine provider allows managing Google Kubernetes Engine clusters and related resources." --- import AutoGeneratedSnippet from '/snippets/providers/gke-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Obtain Google Cloud credentials by following the steps in [Google Cloud's service account guide](https://cloud.google.com/iam/docs/creating-managing-service-account-keys). 2. Ensure your service account has the necessary permissions to manage GKE clusters (`roles/container.admin`). 3. Provide the `gcp_credentials`, `project_id`, and `zone` in your provider configuration. ## Useful Links - [Google Kubernetes Engine Documentation](https://cloud.google.com/kubernetes-engine/docs) ================================================ FILE: docs/providers/documentation/google_chat-provider.mdx ================================================ --- title: "Google Chat" sidebarTitle: "Google Chat Provider" description: "Google Chat provider is a provider that allows you to send messages to Google Chat" --- import AutoGeneratedSnippet from '/snippets/providers/google_chat-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Open Google Chat 2. Open the space to which you want to add a webhook 3.
Next to the space title, click the expand more arrow, and then click "Apps & Integrations" 4. Click "+ Add webhooks" 5. In the Name field, enter "Quickstart Webhook" 6. In the Avatar URL field, enter https://developers.google.com/chat/images/chat-product-icon.png 7. Click Save 8. To copy the webhook URL, click "More", and then click "Copy link". ## Useful Links - https://developers.google.com/chat/how-tos/webhooks ================================================ FILE: docs/providers/documentation/grafana-provider.mdx ================================================ --- title: "Grafana Provider" description: "Grafana Provider allows either pull/push alerts and pull Topology Map from Grafana to Keep." --- import AutoGeneratedSnippet from '/snippets/providers/grafana-snippet-autogenerated.mdx'; Grafana currently supports pulling/pushing alerts & Topology Map. We will add querying and notifying soon. ## Legacy vs Unified Alerting Keep supports both Grafana's legacy alerting system and the newer Unified Alerting system. Here are the key differences: ### Legacy Alerting - Uses notification channels for alert delivery - Configured at the dashboard level - Uses a different API endpoint (`/api/alerts` and `/api/alert-notifications`) - Simpler setup but fewer features - Alerts are tightly coupled with dashboard panels ### Unified Alerting (Default from Grafana 9.0) - Uses alert rules and contact points - Configured centrally in the Alerting section - Uses the newer `/api/v1/alerts` endpoint - More powerful features including label-based routing - Supports multiple data sources in a single alert rule If you're using Grafana 8.x or earlier, or have explicitly enabled legacy alerting in newer versions, make sure to configure Keep accordingly using the legacy alerting configuration. ## Connecting with the Provider To connect to Grafana, you need to create an API Token: 1. Log in to your Grafana account. 2. Go to the **Service Accounts** page (cmd+k -> service). 3. 
Click the **Add service account** button and provide a name for your service account. 4. Grant "alerting" permissions: 5. Now generate Service Account Token: 6. Use the token value in the `authentication` section in the Grafana Provider configuration. ## Post Installation Validation You can check that the Grafana Provider works by testing Keep's contact point (which was installed via the webhook integration). 1. Go to **Contact Points** (cmd+k -> contact). 2. Find the **keep-grafana-webhook-integration**: 3. Click on the **View contact point**: 4. Click on **Test**: 5. Go to Keep – you should see an alert from Grafana! **Alternative Validation Methods (When Keep is Not Accessible Externally):** If Keep is not accessible externally and the webhook cannot be created, you can manually validate the Grafana provider setup using the following methods: 1. **Manual Test Alerts in Grafana:** - Create a manual test alert in Grafana. - Set up a contact point within Grafana that would normally send alerts to Keep. - Trigger the alert and check Grafana's logs for errors or confirmation that the alert was sent. 2. **Check Logs in Grafana:** - Access Grafana’s log files or use the **Explore** feature to query logs related to the alerting mechanism. - Ensure there are no errors related to the webhook integration and that alerts are processed correctly. 3. **Verify Integration Status:** - Navigate to the **Alerting** section in Grafana. - Confirm that the integration status shows as active or functioning. - Monitor any outbound HTTP requests to verify that Grafana is attempting to communicate with Keep. 4. **Network and Connectivity Check:** - Use network monitoring tools to ensure Grafana can reach Keep or any alternative endpoint configured for alerts. **Topology Map** is generated from the traces collected by Tempo. To get the Datasource UID, go to: 1. Connections > Data Sources. 2.
Click the Prometheus instance which is scraping data from Tempo > Your URL is in the format `https://host/connections/datasources/edit/<DATASOURCE_UID>` 3. Copy that DATASOURCE_UID and use it while installing the provider. ## Webhook Integration Modifications The webhook integration adds Keep as a contact point in the Grafana instance. This integration can be located under the "Contact Points" section. Keep also gains access to the following scopes: - `alert.provisioning:read` - `alert.provisioning:write` ================================================ FILE: docs/providers/documentation/grafana_incident-provider.mdx ================================================ --- title: 'Grafana Incident Provider' sidebarTitle: 'Grafana Incident Provider' description: 'Grafana Incident Provider allows you to query all incidents from Grafana Incident.' --- import AutoGeneratedSnippet from '/snippets/providers/grafana_incident-snippet-autogenerated.mdx'; ## Getting started 1. In your Grafana Cloud stack, click Alerts & IRM in the left-side menu. 2. Click the Incident tile to enable the app for your Grafana Cloud instance. 3. Once Grafana Incident is enabled it is accessible to users in your organization. ## Connecting with the Provider 1. After enabling the Grafana Incident app, navigate to Administration > Users and access > Service Accounts. 2. Create a new service account by clicking the Add Service Account button. 3. Give the service account a name and assign role as Viewer. 4. Click on Add service account token and click on Generate token. 5. Copy the generated token. 6. This will be used as the `service_account_token` parameter in the provider configuration. ## Creating and updating Grafana Incidents Grafana Incident provider supports creating and updating incidents in Grafana. - `operationType` - The operation type can be `create` or `update`. - `updateType` - The update type is used to update the various fields of the incident.
### Create Incident - `operationType` - `create` - `title` (str) - The title of the incident. - `severity` (str) - The severity of the incident. - `labels` (list) - The labels of the incident. - `roomPrefix` (str) - The room prefix of the incident. - `isDrill` (bool) - The drill status of the incident. - `status` (str) - The status of the incident. - `attachCaption` (str) - The attachment caption of the incident. - `attachURL` (str) - The attachment URL of the incident. ### Update Incident - `operationType` - `update` - `updateType` - The updatable fields are `removeLabel`, `unassignLabel`, `unassignLabelByUUID`, `unassignRole`, `updateIncidentEventTime`, `updateIncidentIsDrill`, `updateIncidentSeverity`, `updateIncidentStatus`, `updateIncidentTitle`. #### Remove Label - `incident_id` (str) - The incident ID. - `label` (str) - The label to remove. #### Unassign Label - `incident_id` (str) - The incident ID. - `label` (str) - The label to unassign. - `key` (str) - The key of the label to unassign. #### Unassign Label By UUID - `incident_id` (str) - The incident ID. - `key_uuid` (str) - The key UUID of the label to unassign. - `value_uuid` (str) - The value UUID of the label to unassign. #### Unassign Role - `incident_id` (str) - The incident ID. - `role` (str) - The role to unassign. - `user_id` (str) - The user ID to unassign. #### Update Incident Event Time - `incident_id` (str) - The incident ID. - `event_time` (str) - The event time to update. - `event_name` (str) - The event name to update. #### Update Incident Is Drill - `incident_id` (str) - The incident ID. - `isDrill` (bool) - The drill status to update. #### Update Incident Severity - `incident_id` (str) - The incident ID. - `severity` (str) - The severity to update. #### Update Incident Status - `incident_id` (str) - The incident ID. - `status` (str) - The status to update. #### Update Incident Title - `incident_id` (str) - The incident ID. - `title` (str) - The title to update. 
## Useful Links - [Grafana Incident](https://grafana.com/docs/grafana-cloud/alerting-and-irm/incident/) ================================================ FILE: docs/providers/documentation/grafana_loki-provider.mdx ================================================ --- title: 'Grafana Loki' sidebarTitle: 'Grafana Loki Provider' description: 'Grafana Loki provider allows you to query logs from Grafana Loki.' --- import AutoGeneratedSnippet from '/snippets/providers/grafana_loki-snippet-autogenerated.mdx'; ## Overview Grafana Loki is a log aggregation system designed to store and query logs from all your applications and infrastructure. The easiest way to get started is with Grafana Cloud, our fully composable observability stack. ## Connecting with the Grafana Loki provider 1. Obtain the required authentication parameters. 2. Add Grafana Loki provider to your Keep account and configure with the above authentication parameters. ## Querying Grafana Loki The Grafana Loki provider allows you to query logs from Grafana Loki through the `query` and `query_range` types. The following are the parameters available for querying: 1. `query` type: - `query`: The [LogQL](https://grafana.com/docs/loki/latest/query/) query to perform. Requests that do not use valid LogQL syntax will return errors. - `limit`: The max number of entries to return. It defaults to `100`. Only applies to query types which produce a stream (log lines) response. - `time`: The evaluation time for the query as a nanosecond Unix epoch or another [supported format](https://grafana.com/docs/loki/latest/reference/loki-http-api/#timestamps). Defaults to now. - `direction`: Determines the sort order of logs. Supported values are `forward` or `backward`. Defaults to `backward`. 2. `query_range` type: - `query`: The [LogQL](https://grafana.com/docs/loki/latest/query/) query to perform. - `limit`: The max number of entries to return. It defaults to `100`.
Only applies to query types which produce a stream (log lines) response. - `start`: The start time for the query as a nanosecond Unix epoch or another [supported format](https://grafana.com/docs/loki/latest/reference/loki-http-api/#timestamps). Defaults to one hour ago. Loki returns results with timestamp greater or equal to this value. - `end`: The end time for the query as a nanosecond Unix epoch or another [supported format](https://grafana.com/docs/loki/latest/reference/loki-http-api/#timestamps). Defaults to now. Loki returns results with timestamp lower than this value. - `since`: A `duration` used to calculate `start` relative to `end`. If `end` is in the future, `start` is calculated as this duration before now. Any value specified for `start` supersedes this parameter. - `step`: Query resolution step width in `duration` format or float number of seconds. `duration` refers to Prometheus duration strings of the form `[0-9]+[smhdwy]`. For example, 5m refers to a duration of 5 minutes. Defaults to a dynamic value based on `start` and `end`. Only applies to query types which produce a matrix response. - `interval`: Only return entries at (or greater than) the specified interval, can be a `duration` format or float number of seconds. Only applies to queries which produce a stream response. Not to be confused with step, see the explanation under [Step versus interval](https://grafana.com/docs/loki/latest/reference/loki-http-api/#step-versus-interval). - `direction`: Determines the sort order of logs. Supported values are `forward` or `backward`. Defaults to `backward`. 
## Useful Links - [Grafana Loki](https://grafana.com/oss/loki/) - [Grafana Loki Authentication](https://grafana.com/docs/loki/latest/operations/authentication/) ================================================ FILE: docs/providers/documentation/grafana_oncall-provider.mdx ================================================ --- title: "Grafana OnCall Provider" description: "Grafana Oncall Provider is a class that allows to ingest data to the Grafana OnCall." --- import AutoGeneratedSnippet from '/snippets/providers/grafana_oncall-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Grafana OnCall, you need to create an API Token: 1. Log in to your Grafana account. 2. Go To "Alerts & IRM" -> OnCall. 3. Go to the **Settings** page. 4. Click the **Create** button and provide a name for your token. 5. Copy the token value and keep it secure. 6. Add the token value to the `authentication` section in the Grafana Oncall Provider configuration. ## Notes - This provider allows you to interact with Grafana OnCall to create alerts. - Keep will create "Webhook" type integration called "Keep Integration" inside Grafana OnCall. Payload example: ```json { "alert_uid": "08d6891a-835c-e661-39fa-96b6a9e26552", "title": "The whole system is down", "image_url": "https://upload.wikimedia.org/wikipedia/commons/e/ee/Grumpy_Cat_by_Gage_Skidmore.jpg", "state": "alerting", "link_to_upstream_details": "https://en.wikipedia.org/wiki/Downtime", "message": "Smth happened. Oh no!" 
} ``` ## Useful Links - [Grafana OnCall Inbound Webhook Integration](https://grafana.com/docs/oncall/latest/configure/integrations/references/webhook/) ================================================ FILE: docs/providers/documentation/graylog-provider.mdx ================================================ --- title: "Graylog Provider" sidebarTitle: "Graylog Provider" description: "The Graylog provider enables webhook installations for receiving alerts in Keep" --- import AutoGeneratedSnippet from '/snippets/providers/graylog-snippet-autogenerated.mdx'; ## Overview The **Graylog Provider** facilitates receiving alerts from Graylog by setting up Webhook connections. It allows seamless integration with Graylog to receive notifications about events and alerts through Keep. ## Connecting with the Provider 1. Obtain the **username** and **access token** from your Graylog instance by following [Graylog's API Access Documentation](https://go2docs.graylog.org/current/setting_up_graylog/rest_api_access_tokens.htm?tocpath=Set%20up%20Graylog%7CGet%20Started%20with%20Graylog%7CREST%C2%A0API%7C_____3#CreateanAccessToken). 2. Set the **deployment URL** to your Graylog instance's base URL (e.g., `http://127.0.0.1:9000`). 3. Ensure the user has the **Admin** role in Graylog. ## Features The **Graylog Provider** supports the following key features: - **Webhook Setup**: Configures webhooks to send alerts to Keep. - **Alerts Retrieval**: Fetches and formats alerts from Graylog based on specified search parameters (only a maximum of 10000 most recent alerts) Ensure that the product of `page` and `per_page` does not exceed 10,000. The notification URL for Graylog v4.x has the api_key as a query param, this is the default behaviour. 
## Useful Links - [Graylog API Documentation](https://go2docs.graylog.org/current/what_is_graylog/what_is_graylog.htm?tocpath=What%20Is%20Graylog%253F%7C_____0) - [Graylog Access Token](https://go2docs.graylog.org/current/setting_up_graylog/rest_api_access_tokens.htm?tocpath=Set%20up%20Graylog%7CGet%20Started%20with%20Graylog%7CREST%C2%A0API%7C_____3#CreateanAccessToken) - [Quick Setup for Graylog & Integration with Keep](https://github.com/keephq/keep/keep/providers/graylog_provider/README.md) ================================================ FILE: docs/providers/documentation/grok-provider.mdx ================================================ --- title: "Grok Provider" description: "The Grok Provider allows for integrating X.AI's Grok language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/grok-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Grok, you'll need to obtain an API Key: 1. Subscribe to Grok on X.AI platform. 2. Navigate to the API section in your X.AI account settings. 3. Generate a new API key for Keep. Use the generated API key in the `authentication` section of your Grok Provider configuration. ================================================ FILE: docs/providers/documentation/http-provider.mdx ================================================ --- title: "HTTP Provider" description: "HTTP Provider is a provider used to query/notify using HTTP requests" --- import AutoGeneratedSnippet from '/snippets/providers/http-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to the provider, you can instantiate an instance of the `HttpProvider` class, providing a `provider_id` and a `ProviderConfig` object. Then you can call the `query` method to query the HTTP endpoint. ## Notes The code logs some debug information about the requests being sent, including the request headers, body, and query parameters. 
This information should not contain sensitive information, but it's important to make sure of that before using this provider in production. ## Useful Links - [requests library documentation](https://docs.python-requests.org/en/latest/) ================================================ FILE: docs/providers/documentation/icinga2-provider.mdx ================================================ --- title: "Icinga2 Provider" sidebarTitle: "Icinga2" description: "Icinga2 Provider Allows Reception of Push Alerts from Icinga2 to Keep." --- import AutoGeneratedSnippet from '/snippets/providers/icinga2-snippet-autogenerated.mdx'; import ProviderLogo from '@components/ProviderLogo'; # Icinga2 Provider The Icinga2 provider allows you to receive alerts from the Icinga2 monitoring system within Keep. The Icinga2 provider supports two methods for receiving alerts: Webhooks and API Polling. The recommended and primary method for receiving alerts is via Webhooks. ## Setup ### Prerequisites 1. Access to an Icinga2 instance 2. API user with relevant permissions 3. Keep instance with webhook capability ### Configuration The provider requires the following configuration: ```yaml authentication: host_url: "https://icinga2.example.com" # Your Icinga2 instance URL api_user: "your-api-user" # Icinga2 API username api_password: "your-api-password" # Icinga2 API password ``` ### Webhook Configuration To configure Icinga2 to send alerts to Keep via webhooks: 1. Navigate to your Icinga2 configuration directory 2. Create or edit the ```eventcommands.conf``` file 3.
Add the following event command configuration: ```plaintext object EventCommand "keep-notification" { command = [ "curl" ] arguments = { "-X" = "POST" "-H" = "Content-Type: application/json" "-H" = "X-API-KEY: ${keep_api_key}" "--data" = "{ \"host\": { \"name\": \"$host.name$\", \"display_name\": \"$host.display_name$\", \"check_command\": \"$host.check_command$\", \"acknowledgement\": \"$host.acknowledgement$\", \"downtime_depth\": \"$host.downtime_depth$\", \"flapping\": \"$host.flapping$\" }, \"service\": { \"name\": \"$service.name$\", \"display_name\": \"$service.display_name$\", \"check_command\": \"$service.check_command$\", \"acknowledgement\": \"$service.acknowledgement$\", \"downtime_depth\": \"$service.downtime_depth$\", \"flapping\": \"$service.flapping$\" }, \"check_result\": { \"exit_status\": \"$service.state$\", \"state\": \"$service.state_text$\", \"output\": \"$service.output$\", \"execution_start\": \"$service.last_check$\", \"execution_end\": \"$service.last_check$\", \"state_type\": \"$service.state_type$\", \"attempt\": \"$service.check_attempt$\", \"execution_time\": \"$service.execution_time$\", \"latency\": \"$service.latency$\" } }" "${keep_webhook_url}" = { required = true } } } ``` 4. Define variables in your Icinga2 Configuration: - ```keep_api_key```: Your Keep API key with webhook role - ```keep_webhook_url```: Your Keep Webhook URL 5. Create a notification rule that uses this event command 6. 
Restart Icinga2 to apply changes ### State Mapping By Default, Icinga2 states are automatically mapped to Keep alert severities & statuses as follows: #### Status Mapping | Icinga2 State | Keep Status | |:--------------|:------------| | OK | RESOLVED | | WARNING | FIRING | | CRITICAL | FIRING | | UNKNOWN | FIRING | | UP | RESOLVED | | DOWN | FIRING | #### Severity Mapping | Icinga2 State | Keep Severity | |:--------------|:--------------| | OK | INFO | | WARNING | WARNING | | CRITICAL | CRITICAL | | UNKNOWN | INFO | | UP | INFO | | DOWN | CRITICAL | ================================================ FILE: docs/providers/documentation/ilert-provider.mdx ================================================ --- title: "ilert Provider" sidebarTitle: "ilert Provider" description: "The ilert provider facilitates interaction with ilert’s API, allowing for the management of incidents. This includes the ability to create, update, and resolve alerts, as well as send custom event notifications. This provider integrates Keep's system with ilert's AI-first platform for operations teams seeking seamless integration of alerting, on-call management, AI SRE and status pages for faster incident response." --- import AutoGeneratedSnippet from '/snippets/providers/ilert-snippet-autogenerated.mdx'; ## Overview The ilert provider facilitates interaction with ilert’s API, allowing for the management of incidents and events. This includes the ability to create, update, and resolve incidents, as well as send custom event notifications. This provider integrates Keep's system with ilert's robust alerting and incident management platform. ## Connecting with the Provider To integrate Keep with ilert, follow these steps: 1. Log in to your ilert account. 2. Navigate to "Alert Sources" under your account settings. 3. Create a new alert source specifically for Keep. 4. Note the `ALERT-SOURCE-API-KEY` provided for this alert source. 
The endpoint to make requests for Keep integration will be: (https://api.ilert.com/api/v1/events/keep/{ALERT-SOURCE-API-KEY}) ## Useful Links - [ilert API Documentation](https://api.ilert.com/api-docs/?utm_campaign=Keep&utm_source=integration&utm_medium=organic) - [ilert Alerting](https://www.ilert.com/product/reliable-actionable-alerting?utm_campaign=Keep&utm_source=integration&utm_medium=organic) ================================================ FILE: docs/providers/documentation/incidentio-provider.mdx ================================================ --- title: "Incident.io Provider" sidebarTitle: "Incident.io Provider" description: "The Incident.io provider enables the querying of incidents on Incident.io, leveraging incident management capabilities for effective response." --- import AutoGeneratedSnippet from '/snippets/providers/incidentio-snippet-autogenerated.mdx'; ## Overview The Incident.io provider facilitates interaction with Incident.io's API, allowing for the management of incidents. This includes the ability to query specific incidents, retrieve all incidents, and manage incident details. This provider integrates Keep's system with Incident.io's robust incident management platform. ## Connecting with the Provider ### API Key To use the Incident.io API: 1. Log in to your Incident.io account. 2. Navigate to the "API Keys" section under your account settings. 3. Generate a new API key or use an existing one. 4. Ensure it has `read` permissions enabled for reading and managing incidents. ### Incident Endpoint The Incident.io incident endpoint allows querying and managing incidents. Operations include retrieving specific incident details or fetching a list of all incidents. This is crucial for monitoring and responding to incidents efficiently. For more details, refer to the [Incident.io API Documentation](https://api-docs.incident.io/). 
## Useful Links - [Incident.io API Documentation](https://api-docs.incident.io/) - [Incident.io Incidents](https://api-docs.incident.io/tag/Incidents-V2) - [Incident.io Api_Keys and Permissions](https://help.incident.io/en/articles/6149651-our-api) ================================================ FILE: docs/providers/documentation/incidentmanager-provider.mdx ================================================ --- title: "Incident Manager Provider" sidebarTitle: "Incident Manager Provider" --- import AutoGeneratedSnippet from '/snippets/providers/incidentmanager-snippet-autogenerated.mdx'; The Incident Manager Provider allows you to push incidents from AWS IncidentManager to Keep. ## Status Map The Incident Manager Provider maps the following statuses: - "OPEN" to AlertStatus.FIRING - "RESOLVED" to AlertStatus.RESOLVED ## Severities Map The Incident Manager Provider maps the following severities: - 1 to AlertSeverity.CRITICAL - 2 to AlertSeverity.HIGH - 3 to AlertSeverity.LOW - 4 to AlertSeverity.WARNING - 5 to AlertSeverity.INFO ## Notes 1. Incident Manager only sends a notification when a chatChannel is attached to the response plan. Make sure to add a chatChannel to the response plan before adding the webhook. ================================================ FILE: docs/providers/documentation/jira-on-prem-provider.mdx ================================================ --- title: "Jira On-Prem Provider" sidebarTitle: "Jira On-Prem Provider" description: "Jira On-Prem Provider is a provider used to query data and create issues in Jira" --- import AutoGeneratedSnippet from '/snippets/providers/jiraonprem-snippet-autogenerated.mdx'; This is the documentation for the on-prem Jira provider; for Jira Cloud, please check the [Jira Provider](./jira-provider.md).
================================================ FILE: docs/providers/documentation/jira-provider.mdx ================================================ --- title: "Jira Cloud Provider" sidebarTitle: "Jira Cloud Provider" description: "Jira Cloud provider is a provider used to query data and creating issues in Jira" --- import AutoGeneratedSnippet from '/snippets/providers/jira-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to https://id.atlassian.com/manage-profile/security/api-tokens to Create API token and generated token should be passed to jira authentication. 2. Get `host` and `board_id` from your respective board from its URL. 3. Get `project_key` from your project > settings > details. 4. `email` would be same as of your account email. ## Auto-Transition Workflows The Jira provider supports automatically transitioning tickets when alerts change status. This is useful for keeping your Jira board synchronized with alert states - for example, automatically closing tickets when alerts are resolved. ### Prerequisites 1. Configure a Jira Cloud provider in Keep 2. Ensure your Jira user has the `TRANSITION_ISSUES` permission 3. Know your Jira board name and desired transition status names ### Workflow 1: Create Jira Ticket on Alert This workflow creates a Jira ticket when an alert fires, but only if no ticket has been created yet. ```yaml workflow: id: jira-create-ticket-on-alert name: Create Jira Ticket on Alert description: Create Jira ticket when alert fires disabled: false triggers: - type: alert cel: status == "firing" actions: - name: jira-action if: "not '{{ alert.ticket_id }}'" provider: type: jira config: "{{ providers.JiraCloud }}" with: board_name: YOUR_BOARD_NAME # Change this to your board name issue_type: Task # Or Bug, Story, etc. summary: "{{ alert.name }} - {{ alert.description }}" description: | "This ticket was created automatically by Keep. 
Alert Details: {code:json} {{ alert }} {code}" enrich_alert: - key: ticket_type value: jira - key: ticket_id value: results.issue.key - key: ticket_url value: results.ticket_url ``` **Key Points:** - `if: "not '{{ alert.ticket_id }}'"` - Only creates a ticket if one doesn't exist yet - `enrich_alert` - Stores the ticket ID, type, and URL in the alert for later use - The ticket is created in the default status (usually "To Do" or "Open") ### Workflow 2: Transition Ticket to Done on Alert Resolved This workflow updates the existing Jira ticket and transitions it to "Done" when the alert is resolved. ```yaml workflow: id: jira-transition-on-resolved name: Transition Jira Ticket to Done description: Close Jira ticket when alert is resolved disabled: false triggers: - type: alert cel: status == "resolved" actions: - name: jira-action provider: type: jira config: "{{ providers.JiraCloud }}" with: issue_id: "{{ alert.ticket_id }}" summary: "{{ alert.name }} - {{ alert.description }} (resolved)" description: | "Alert has been resolved automatically by Keep. Resolved at: {{ alert.lastReceived }} Original Alert Details: {code:json} {{ alert }} {code}" transition_to: Done # Change to your workflow's status name ``` **Key Points:** - Uses `issue_id: "{{ alert.ticket_id }}"` from the enriched alert data - `transition_to: Done` - Transitions the ticket to the specified status - No `if` condition needed - if the alert has no `ticket_id`, the action will simply fail gracefully ### Available Transition Names Common Jira transition names (varies by workflow): - `Done` - `Resolved` - `Closed` - `In Progress` - `To Do` - `Canceled` **How to find your transition names:** 1. Go to your Jira project settings 2. Navigate to Workflows 3. Check the available statuses in your workflow 4. 
Use the exact status name in the `transition_to` parameter (case-insensitive) ### Error Handling If you specify an invalid transition name, the Jira provider will return a helpful error message listing all available transitions for that ticket: ``` Transition 'Invalid' not found. Available transitions: To Do, In Progress, Done, Closed ``` ### Example: Three-State Workflow You can also create intermediate transitions: ```yaml # Workflow 3: Move to In Progress when acknowledged workflow: id: jira-transition-in-progress name: Transition to In Progress description: Move ticket to In Progress when alert is acknowledged disabled: false triggers: - type: alert cel: status == "acknowledged" actions: - name: jira-action provider: type: jira config: "{{ providers.JiraCloud }}" with: issue_id: "{{ alert.ticket_id }}" summary: "{{ alert.name }} - In Progress" description: "Alert acknowledged and being worked on." transition_to: In Progress ``` ### Testing 1. **Create an alert** that triggers the first workflow - Verify a Jira ticket is created - Check that the alert has `ticket_id`, `ticket_type`, and `ticket_url` fields 2. **Resolve the alert** to trigger the second workflow - Verify the existing ticket is updated (no new ticket created) - Check that the ticket status changed to "Done" 3. **Check the logs** in Keep UI for any errors or debugging info ### Troubleshooting #### Issue: Workflow creates a new ticket instead of updating **Cause:** The `issue_id` parameter is missing or the alert doesn't have a `ticket_id`. **Solution:** Ensure the first workflow enriches the alert with `ticket_id` and the second workflow uses it via `issue_id: "{{ alert.ticket_id }}"`. #### Issue: Transition fails with "Transition 'X' not found" **Cause:** The transition name doesn't match your Jira workflow. **Solution:** Check the error message for available transitions and update the `transition_to` parameter accordingly. 
#### Issue: Permission denied when transitioning **Cause:** Your Jira user doesn't have the `TRANSITION_ISSUES` permission. **Solution:** Grant the necessary permissions in Jira project settings. ### Advanced Features #### Configuration Variables You can use Keep's configuration variables to make the workflows more flexible: ```yaml consts: JIRA_BOARD: "ALERTS" JIRA_DONE_STATUS: "Done" JIRA_ISSUE_TYPE: "Task" # Then use in workflows: board_name: "{{ consts.JIRA_BOARD }}" transition_to: "{{ consts.JIRA_DONE_STATUS }}" issue_type: "{{ consts.JIRA_ISSUE_TYPE }}" ``` #### Custom Fields You can also set custom fields when creating or updating tickets: ```yaml with: issue_id: "{{ alert.ticket_id }}" summary: "Alert resolved" custom_fields: customfield_10001: "High" customfield_10002: "Production" transition_to: Done ``` #### Labels and Components ```yaml with: board_name: YOUR_BOARD_NAME summary: "{{ alert.name }}" description: "{{ alert.description }}" labels: - alert - automated - critical components: - Monitoring - Infrastructure ``` ## Notes ## Useful Links - https://id.atlassian.com/manage-profile/security/api-tokens - https://developer.atlassian.com/cloud/jira/software/rest/api-group-board/#api-rest-agile-1-0-board-boardid-issue-get - https://developer.atlassian.com/cloud/jira/platform/rest/v2/api-group-issues/#api-rest-api-2-issue-post - https://developer.atlassian.com/cloud/jira/platform/rest/v2/api-group-issues/#api-rest-api-2-issue-issueidorkey-transitions-get (Transitions API) ================================================ FILE: docs/providers/documentation/kafka-provider.mdx ================================================ --- title: "Kafka" sidebarTitle: "Kafka Provider" description: "Kafka provider allows integration with Apache Kafka for producing and consuming messages." --- import AutoGeneratedSnippet from '/snippets/providers/kafka-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. 
Set up a Kafka broker (or use an existing one) and make sure it is accessible. 2. Get the broker URL (e.g., `localhost:9092` or a remote Kafka service URL). 3. (Optional) If using secure communication, provide the security protocol, SASL mechanism, username, and password. 4. Configure the provider with these parameters. ## Useful Links - [Kafka Clients Documentation](https://kafka.apache.org/documentation/) ================================================ FILE: docs/providers/documentation/keep-provider.mdx ================================================ --- title: "Keep" sidebarTitle: "Keep Provider" description: "Keep provider allows you to query and manage alerts in Keep." --- import AutoGeneratedSnippet from '/snippets/providers/keep-snippet-autogenerated.mdx'; ## Authentication Parameters To use the Keep provider, you must authenticate with an API token associated with your Keep account. This token can be generated from your Keep dashboard. ## Connecting with the Provider 1. Log in to your Keep account. 2. Navigate to the API section of your account dashboard and generate an API token. 3. Use this token to authenticate when querying alerts via the Keep provider. ================================================ FILE: docs/providers/documentation/kibana-provider.mdx ================================================ --- title: "Kibana" sidebarTitle: "Kibana Provider" description: "Kibana provider allows you to get alerts from Kibana Alerting via webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/kibana-snippet-autogenerated.mdx'; Please note that when installing Kibana with Webhook auto instrumentation, Keep installs itself as a Connector, adds itself as an Action to all available Kibana Alert Rules (For each alert, On status changes, when: Alert/No Data/Recovered) and to all available Kibana Watcher rules as a Webhook action. For more information, feel free to reach out on our Slack Community.
## Connecting with the Provider ### Kibana Host Simply copy the hostname from the URL bar in your browser: Kibana Host ### API Key To obtain a Kibana API key, follow these steps: 1. Log in to your Kibana account. 2. Click Stack Management 3. Click on Security 4. Click on API Keys Kibana API Keys 1. Click on the top right `Create API key` button 2. Give the API key an indicative name (e.g. keep-api-key) 3. Make sure the `Restrict Permissions` toggle is turned off 4. On the bottom right corner, click on `Create API key` Create Kibana API Key 6. Copy the newly created encoded API key and you're set! Copy Kibana API Key ## Fingerprinting Fingerprints in Kibana are simply the alert instance ID. ## Useful Links - [Kibana Alerting](https://www.elastic.co/guide/en/kibana/current/alerting-getting-started.html) - [Kibana Connectors](https://www.elastic.co/guide/en/kibana/current/action-types.html) ================================================ FILE: docs/providers/documentation/kubernetes-provider.mdx ================================================ --- title: "Kubernetes" description: "Kubernetes provider to perform rollout restart or list pods action." --- import AutoGeneratedSnippet from '/snippets/providers/kubernetes-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Kubernetes, follow the steps below: 1. Create a service account on Kubernetes. 2. Create a role/clusterrole and bind it to the service account using a rolebinding/clusterrolebinding. 3. Get the token of the service account. ## Notes - This provider allows you to interact with Kubernetes to perform rollout restart or pods listing actions.
## Useful Links - [Access Kubernetes Cluster](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/) ================================================ FILE: docs/providers/documentation/libre_nms-provider.mdx ================================================ --- title: 'LibreNMS' sidebarTitle: 'LibreNMS Provider' description: 'LibreNMS allows you to receive alerts from LibreNMS using API endpoints as well as webhooks' --- import AutoGeneratedSnippet from '/snippets/providers/libre_nms-snippet-autogenerated.mdx'; ## Connecting LibreNMS to Keep 1. Open LibreNMS dashboard and click on settings in the top right corner. 2. Click on `Create API access token` to generate a new API key. 3. Give a description to the API key and click on `Create API Token`. ## Webhooks Integration 1. Open LibreNMS dashboard and open `Alerts` tab in the navigation bar and click on `Alert Transports`. 2. Click on `Create add transport` and select `Transport type` as `API`. Select the `API Method` as `POST`. 3. Fill the `API URL` with [https://api.keephq.dev/alerts/event/libre_nms](https://api.keephq.dev/alerts/event/libre_nms). 4. Copy the below JSON and paste it in `body` field. 
```json { "title": "{{ $title }}", "hostname": "{{ $hostname }}", "device_id": "{{ $device_id }}", "sysDescr": "{{ $sysDescr }}", "sysName": "{{ $sysName }}", "sysContact": "{{ $sysContact }}", "os": "{{ $os }}", "type": "{{ $type }}", "ip": "{{ $ip }}", "display": "{{ $display }}", "version": "{{ $version }}", "hardware": "{{ $hardware }}", "features": "{{ $features }}", "serial": "{{ $serial }}", "status": "{{ $status }}", "status_reason": "{{ $status_reason }}", "location": "{{ $location }}", "description": "{{ $description }}", "notes": "{{ $notes }}", "uptime": "{{ $uptime }}", "uptime_short": "{{ $uptime_short }}", "uptime_long": "{{ $uptime_long }}", "elapsed": "{{ $elapsed }}", "alerted": "{{ $alerted }}", "alert_id": "{{ $alert_id }}", "alert_notes": "{{ $alert_notes }}", "proc": "{{ $proc }}", "rule_id": "{{ $rule_id }}", "id": "{{ $id }}", "faults": "{{ $faults }}", "uid": "{{ $uid }}", "severity": "{{ $severity }}", "rule": "{{ $rule }}", "name": "{{ $name }}", "string": "{{ $string }}", "timestamp": "{{ $timestamp }}", "contacts": "{{ $contacts }}", "state": "{{ $state }}", "msg": "{{ $msg }}", "builder": "{{ $builder }}" } ``` 5. Follow the below steps to create a new API key in Keep. 6. Go to Keep dashboard and click on the profile icon in the bottom left corner and click `Settings`. 7. Select `Users and Access` tab and then select `API Keys` tab and create a new API key. 8. Give name and select the role as `webhook` and click on `Create API Key`. 9. Copy the API key. 10. Back in the LibreNMS alert transport, add a new header with key as `X-API-KEY` and paste the API key you copied from Keep as the value. 11. Save the webhook. 12. You can add devices from the Devices tab in the LibreNMS dashboard and select the alert transport that you have created. 13. Now, you will receive the alerts in Keep.
## Useful Links - [LibreNMS](https://www.librenms.org/) ================================================ FILE: docs/providers/documentation/linear_provider.mdx ================================================ --- title: "Linear Provider" sidebarTitle: "Linear Provider" description: "Linear Provider is a provider for fetching data and creating issues in Linear app." --- import AutoGeneratedSnippet from '/snippets/providers/linear-snippet-autogenerated.mdx'; ## How to set up The Linear Provider uses `api_token` for request authorization. You need to provide the following: - **api_token** (required): The personal API key for your Linear app. - How to obtain: 1. Visit the Linear app or website. 2. Log in to your Linear account. 3. Navigate to your account settings. 4. Navigate to the API page. 5. Under Personal API keys section generate the key. 6. Copy the generated API token. ## Notes - This provider allows you to query projects for the given Linear team. - This provider allows you to notify (create issue) inside Linear app for given project and team. ## Useful Links - [Linear](https://linear.app) - [Linear Docs](https://developers.linear.app/docs/graphql/working-with-the-graphql-api) ================================================ FILE: docs/providers/documentation/linearb-provider.mdx ================================================ --- title: "LinearB" sidebarTitle: "LinearB Provider" description: "The LinearB provider enables integration with LinearB's API to manage and notify incidents directly through webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/linearb-snippet-autogenerated.mdx'; The LinearB provider facilitates the automatic creation, update, and deletion of incidents in LinearB through its public API. It supports dynamic incident management based on operational events, allowing teams to synchronize their development metrics and alerts with LinearB's project management capabilities.
For any support or questions, join our community on Slack or GitHub. ## Connecting with the Provider ### Obtaining an API Token To use the LinearB provider, you must obtain an API token from LinearB: 1. Sign in to your LinearB account. 2. Navigate to the API settings section. 3. Generate a new API token with the appropriate permissions. 4. Securely store the API token as it is needed to configure the LinearB provider in Keep. ### Useful Links - [LinearB API Reference](https://docs.linearb.io/api-overview/) ================================================ FILE: docs/providers/documentation/litellm-provider.mdx ================================================ --- title: "LiteLLM Provider" description: "The LiteLLM Provider enables integration with LiteLLM proxy into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/litellm-snippet-autogenerated.mdx'; ================================================ FILE: docs/providers/documentation/llamacpp-provider.mdx ================================================ --- title: "Llama.cpp Provider" description: "The Llama.cpp Provider allows for integrating locally running Llama.cpp models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/llamacpp-snippet-autogenerated.mdx'; The Llama.cpp Provider supports querying local Llama.cpp models for prompt-based interactions. Make sure you have Llama.cpp server running locally with your desired model. ### **Cloud Limitation** This provider is disabled for cloud environments and can only be used in local or self-hosted environments. ## Connecting with the Provider To use the Llama.cpp Provider: 1. Install Llama.cpp on your system 2. Download or convert your model to GGUF format 3. Start the Llama.cpp server with HTTP interface: ```bash ./server --model /path/to/your/model.gguf --host 0.0.0.0 --port 8080 ``` 4. 
Configure the host URL and model path in your Keep configuration ## Prerequisites - Llama.cpp must be installed and compiled with server support - A GGUF format model file must be available on your system - The Llama.cpp server must be running and accessible - The server must have sufficient resources to load and run your model ## Model Compatibility The provider works with any GGUF format model compatible with Llama.cpp, including: - LLaMA and LLaMA-2 models - Mistral models - OpenLLaMA models - Vicuna models - And other compatible model architectures Make sure your model is in GGUF format before using it with the provider. ================================================ FILE: docs/providers/documentation/mailgun-provider.mdx ================================================ --- title: "Mailgun Provider" description: "Mailgun Provider allows sending alerts to Keep via email." --- import AutoGeneratedSnippet from '/snippets/providers/mailgun-snippet-autogenerated.mdx'; Mailgun currently supports receiving alerts via email. We will add querying and notifying soon. ## Connecting with the Provider To connect to Mailgun, you do not need to perform any actions on the Mailgun side. We use our own Mailgun account and handle everything for you. ## Post Installation Validation You can check that the Mailgun Provider works by sending a test email to the configured email address. 1. Send a test email to the email address provided in the `authentication` section. 2. Check Keep's platform to see if the alert is received. ## Default Alert Values When no extraction rules are set, the default values for every alert are as follows: - **name**: The subject of the email. - **source**: The sender of the email. - **message**: The stripped text content of the email. - **timestamp**: The timestamp of the email, converted to ISO format. 
- **severity**: "info" - **status**: "firing" ## How Extraction Works Extraction rules allow you to extract specific information from the email content using regular expressions. This can be useful for parsing and structuring the alert data. ### Example Extraction Rule An extraction rule is defined as a dictionary with the following keys: - **key**: The key in the email event to apply the extraction rule to. - **value**: The regular expression to use for extraction. #### Example Extract the severity from the subject of the email. ``` Key: subject Value: (?P<severity>\w+): ``` ================================================ FILE: docs/providers/documentation/mattermost-provider.mdx ================================================ --- title: "Mattermost Provider" sidebarTitle: "Mattermost Provider" description: "Mattermost provider is used to send messages to Mattermost." --- import AutoGeneratedSnippet from '/snippets/providers/mattermost-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. **Obtain a Mattermost Webhook URL:** - Go to the Mattermost Incoming Webhook API documentation: [Mattermost Incoming Webhooks](https://docs.mattermost.com/developer/webhooks-incoming.html). - Follow the instructions to create a new incoming webhook. - Copy the generated webhook URL, which should be passed as the `webhook_url` for authentication.
## Useful Links - [Mattermost Incoming Webhooks](https://developers.mattermost.com/integrate/webhooks/incoming/) ================================================ FILE: docs/providers/documentation/mock-provider.mdx ================================================ --- title: "Mock" sidebarTitle: "Mock Provider" description: "Template Provider is a template for newly added provider's documentation" --- import AutoGeneratedSnippet from '/snippets/providers/mock-snippet-autogenerated.mdx'; ================================================ FILE: docs/providers/documentation/monday-provider.mdx ================================================ --- title: 'Monday' sidebar_label: 'Monday Provider' description: 'Monday Provider allows you to add new pulses to your boards' --- import AutoGeneratedSnippet from '/snippets/providers/monday-snippet-autogenerated.mdx'; ## Overview Monday Provider enables seamless integration with Monday.com, a work operating system that powers teams to run projects and workflows with confidence. With Monday Provider, you can add new pulses to your boards. #### Admin tab If you are an admin user on your monday.com account, follow these steps to access your API token: 1. Log into your monday.com account. 2. Click on your avatar/profile picture in the top right corner. 3. Select Administration > Connections > API. 4. Copy your personal token. Please note that you can always regenerate a new token, but doing so will cause any previous tokens to expire. #### Developer tab If you are a member user or an admin on your monday.com account, follow these steps to access your API token: 1. Log into your monday.com account. 2. Click on your profile picture in the top right corner. 3. Select Developers. This will open the Developer Center in another tab. 4. Click My Access Tokens > Show. 5. Copy your personal token. Please note that you can always regenerate a new token, but doing so will cause any previous tokens to expire. ## Connecting Monday to Keep 1. 
Obtain the API Token from Monday. 2. Add Monday as a provider in Keep. 3. Give the provider a name and paste the API Token in the `Personal API Token` field and click `Connect`. ## How to use? 1. In order to add a new pulse to your board, you need the following information: - Board ID: The ID of the board where you want to add the pulse. - Group ID: The ID of the group where you want to add the pulse. - Item Name: The name of the pulse you want to add. - Column Values: The values of the columns you want to set for the pulse. 2. Open the board where you want to add the pulse in the monday.com app. 3. Hover over the board name in the side panel and click on the three dots that appear and click on ID to copy the board ID. 4. Hover over the group name in the board and click on the three dots that appear and click on Group ID to copy the group ID. 5. Item Name is the name of the pulse you want to add. 6. Column ID and Column Value are the values of the columns you want to set for the pulse. Hover over the column name in the board and click on the three dots that appear and click on Column ID to copy the column ID. The column value is the value you want to set for the column. ## Useful Links - [Monday.com](https://monday.com/) - [Example workflow for Monday Provider](https://github.com/keephq/keep/blob/main/examples/workflows/monday_create_pulse.yml) ================================================ FILE: docs/providers/documentation/mongodb-provider.mdx ================================================ --- title: "MongoDB" sidebarTitle: "MongoDB Provider" description: "MongoDB Provider is a provider used to query MongoDB databases" --- import AutoGeneratedSnippet from '/snippets/providers/mongodb-snippet-autogenerated.mdx'; ## Connecting with the Provider In order to connect to the MongoDB database, you can use either a connection URI or individual parameters. Here's how you can provide authentication information: 1. 
If using a connection URI, provide the `host` parameter with the MongoDB connection string. 2. If using individual parameters, provide the following: - `username`: MongoDB username. - `password`: MongoDB password. - `host`: MongoDB hostname. - `database`: MongoDB database name. - `authSource`: Name of the MongoDB database used to authenticate the user. ## Notes - Ensure that the provided user has the necessary privileges to execute queries on the specified MongoDB database. ## Useful Links - [MongoDB Documentation](https://docs.mongodb.com/) ================================================ FILE: docs/providers/documentation/mysql-provider.mdx ================================================ --- title: "MySQL" sidebarTitle: "MySQL Provider" description: "MySQL Provider is a provider used to query MySQL databases" --- import AutoGeneratedSnippet from '/snippets/providers/mysql-snippet-autogenerated.mdx'; ## Connecting with the Provider In order to connect to the MySQL database, you will need to create a new user with the required permissions. Here's how you can do this: 1. Connect to the MySQL server as a user with sufficient privileges to create a new user. 2. Run the following command to create a new user: `CREATE USER '<username>'@'<host>' IDENTIFIED BY '<password>';` 3. Grant the necessary permissions to the new user by running the following command: `GRANT ALL PRIVILEGES ON <database>.* TO '<username>'@'<host>';` ## Notes ## Useful Links - [MySQL Documentation](https://dev.mysql.com/doc/refman/8.0/en/) ================================================ FILE: docs/providers/documentation/netbox-provider.mdx ================================================ --- title: 'NetBox' sidebarTitle: 'NetBox Provider' description: 'NetBox provider allows you to get events from NetBox through webhook.' --- import AutoGeneratedSnippet from '/snippets/providers/netbox-snippet-autogenerated.mdx'; ## Overview NetBox is the leading solution for modeling and documenting modern networks.
By combining the traditional disciplines of IP address management (IPAM) and datacenter infrastructure management (DCIM) with powerful APIs and extensions, NetBox provides the ideal "source of truth" to power network automation. Read on to discover why thousands of organizations worldwide put NetBox at the heart of their infrastructure. ## Connecting NetBox to Keep To connect NetBox to Keep, you need to create a webhook in NetBox. 1. Go to NetBox dashboard, click on `Webhooks` under `Operations` section in the sidebar. 2. Add a new webhook by clicking on `Add` button. 3. Enter [https://api.keephq.dev/alerts/event/netbox](https://api.keephq.dev/alerts/event/netbox) as the URL and select the request method as `POST`. 4. Follow the below steps to create a new API key in Keep. 5. Go to Keep dashboard and click on the profile icon in the bottom left corner and click `Settings`. 6. Select `Users and Access` tab and then select `API Keys` tab and create a new API key. 7. Give name and select the role as `webhook` and click on `Create API Key`. 8. In the `Additional headers` field enter `X-API-KEY` as the key and the API key generated in step 7 as the value. It should look like below. Refer to the screenshot from step 3. ``` X-API-KEY: your-api-key ``` 9. Disable the `SSL verification` (Optional) or enable it based on your requirement. 10. Click on `Save` to save the webhook. 11. Go to `Event Rules` under `Operations` section in the sidebar and click on `Add` button to create a new event rule. 12. Fill the required fields based on your requirement. Select the `Object types` and `Event types` for which you want to receive the events. 13. In the `Action type` select `Webhook` and select the webhook created in step 3 and click on `Save`. Now, you have successfully connected NetBox to Keep. You will start receiving the events in Keep based on the event rules you have created.
## Useful Links - [NetBox](https://netboxlabs.com/) ================================================ FILE: docs/providers/documentation/netdata-provider.mdx ================================================ --- title: "Netdata" sidebarTitle: "Netdata Provider" description: "Netdata provider allows you to get alerts from Netdata via webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/netdata-snippet-autogenerated.mdx'; ## Overview The Netdata Provider enables seamless integration between Keep and Netdata, allowing alerts from Netdata to be directly sent to Keep through webhook configurations. This integration ensures that critical alerts are efficiently managed and responded to within Keep's platform. ## Useful Links - [Netdata](https://www.netdata.cloud/) ## Note - Currently, Netdata doesn't support webhooks in on-premises installations. ================================================ FILE: docs/providers/documentation/new-relic-provider.mdx ================================================ --- title: "New Relic" sidebarTitle: "New Relic Provider" description: "New Relic Provider enables querying AI alerts and registering webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/newrelic-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to https://one.newrelic.com/admin-portal/api-keys/home to create a User Key. 2. Get `api_key` and `account_id` from the key created. 3. Based on your region, get `api_url` from here https://docs.newrelic.com/docs/apis/rest-api-v2/get-started/introduction-new-relic-rest-api-v2 . ## Webhook Integration Modifications The webhook integration adds Keep as a destination within the "Alerts and AI" API within New Relic.
This grants Keep access to the following scopes within New Relic: - `ai.destinations:read` - `ai.destinations:write` - `ai.channels:read` - `ai.channels:write` ## Useful Links - https://docs.newrelic.com/docs/apis/rest-api-v2/get-started/introduction-new-relic-rest-api-v2 ================================================ FILE: docs/providers/documentation/ntfy-provider.mdx ================================================ --- title: "Ntfy.sh" sidebarTitle: "Ntfy.sh Provider" description: "Ntfy.sh allows you to send notifications to your devices" --- import AutoGeneratedSnippet from '/snippets/providers/ntfy-snippet-autogenerated.mdx'; ## Connecting with the Provider Obtain Ntfy Access Token (For Ntfy.sh only) 1. Create an account on [Ntfy.sh](https://ntfy.sh/). 2. After logging in, go to the [Access token](https://ntfy.sh/account) page. 3. Click on the `CREATE ACCESS TOKEN`. Give it a label and select token expiration time and click on the `CREATE TOKEN` button. 4. Copy the generated token. This will be used as the `Ntfy Access Token` in the provider settings. Self-Hosted Ntfy 1. To self-host Ntfy, you can follow the instructions [here](https://docs.ntfy.sh/install/). 2. For self-hosted Ntfy, you will need to provide the `Ntfy Host URL`, `Ntfy Username`, and `Ntfy Password` in the provider settings instead of the `Ntfy Access Token`. 3. Create a new user for the self-hosted Ntfy instance and use the generated username and password in the provider settings. Subscribing to a Topic (For Ntfy.sh and self-hosted Ntfy) 1. Login to your Ntfy.sh account. 2. Click on `Subscribe to a topic` button and generate name for the topic and subscribe to it. 3. Copy the generated topic name. This will be used as the `Ntfy Subcription Topic` in the provider settings. 4. 
Reserve the topic and configure access (Requires ntfy Pro) ## Useful Links - [Ntfy.sh](https://ntfy.sh/) - [To self-host Ntfy](https://docs.ntfy.sh/install/) ================================================ FILE: docs/providers/documentation/ollama-provider.mdx ================================================ --- title: "Ollama Provider" description: "The Ollama Provider allows for integrating locally running Ollama language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/ollama-snippet-autogenerated.mdx'; The Ollama Provider supports querying local Ollama models for prompt-based interactions. Make sure you have Ollama installed and running locally with your desired models. ### **Cloud Limitation** This provider is disabled for cloud environments and can only be used in local or self-hosted environments. ## Connecting with the Provider To use the Ollama Provider: 1. Install Ollama on your system from [Ollama's website](https://ollama.ai). 2. Start the Ollama service. 3. Pull your desired model(s) using `ollama pull model-name`. 4. Configure the host URL in your Keep configuration. ## Prerequisites - Ollama must be installed and running on your system. - The desired models must be pulled and available in your Ollama installation. - The Ollama API must be accessible from the host where Keep is running. ================================================ FILE: docs/providers/documentation/openai-provider.mdx ================================================ --- title: "OpenAI Provider" description: "The OpenAI Provider allows for integrating OpenAI's language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/openai-snippet-autogenerated.mdx'; The OpenAI Provider supports querying GPT language models for prompt-based interactions. ## Connecting with the Provider To connect to OpenAI, you'll need to obtain an API Key and (optionally) an Organization ID: 1.
Log in to your OpenAI account at [OpenAI Platform](https://platform.openai.com). 2. Go to the **API Keys** section. 3. Click on **Create new secret key** to generate a key for Keep. 4. (Optional) Retrieve your **Organization ID** under **Organization settings** if you’re part of multiple organizations. Use the generated API key in the `authentication` section of your OpenAI Provider configuration. ================================================ FILE: docs/providers/documentation/openobserve-provider.mdx ================================================ --- title: "OpenObserve" sidebarTitle: "OpenObserve Provider" description: "OpenObserve provider allows you to get OpenObserve `alerts/actions` via webhook installation" --- import AutoGeneratedSnippet from '/snippets/providers/openobserve-snippet-autogenerated.mdx'; ## Connecting with the Provider Obtain OpenObserve Username and Password: 1. To see how to install and set Credentials: [here](https://openobserve.ai/docs/quickstart/#self-hosted-installation) 2. Get the Organisation ID of the OpenObserve instance in which you wish to install the webhook. ## Webhook Integration Modifications The webhook integration adds Keep as an alert monitor within the OpenObserve instance. It can be found under the "Alerts & Respond" section. 
The integration automatically gains access to the following scopes within OpenObserve: - `authenticated` ## Useful Links - [OpenObserve Alert Templates](https://openobserve.ai/docs/user-guide/alerts/templates) - [OpenObserve API Spec](https://openobserve.ai/docs/api_specs/#?route=overview) - [OpenObserve Destinations](https://openobserve.ai/docs/user-guide/alerts/destinations/) - [OpenObserve Installation and Credentials](https://openobserve.ai/docs/quickstart/#self-hosted-installation) ================================================ FILE: docs/providers/documentation/opensearchserverless-provider.mdx ================================================ --- title: "OpenSearch Serverless" sidebarTitle: "OpenSearchServerless Provider" description: "OpenSearch Serverless provider enables seamless integration with AWS OpenSearch Serverless for document-level querying, alerting, and writing, directly into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/opensearchserverless-snippet-autogenerated.mdx'; ## Overview The OpenSearch Provider offers native integration with **Amazon OpenSearch Serverless**, allowing Keep users to query, monitor, and write documents in real-time. This supports observability and event-driven alerting for operational and security use cases. ### Key Features: - **Read & Write Support**: Enables both querying and writing documents to OpenSearch Serverless collections. - **AWS IAM Authentication**: Authenticates using AWS IAM credentials (access key/secret or instance role). ## Connecting with the Provider To connect OpenSearch with Keep, you’ll need: - An AWS account with permissions for OpenSearch Serverless (AOSS). - A configured collection and index in AOSS. - AWS IAM credentials (permanent or temporary). 
## Required AWS IAM Permissions (Scopes) To function properly, the OpenSearch provider requires the following IAM scopes: ### Mandatory Scopes - **`iam:SimulatePrincipalPolicy`** - **Description**: Required to check if the IAM identity has access to AOSS API. - **Alias**: Needed to test the access for next 3 scopes. - **Mandatory**: Yes - **`aoss:APIAccessAll`** - **Description**: Required to make API calls to OpenSearch Serverless. - **Alias**: Access to make API calls to serverless - **Mandatory**: Yes - **`aoss:ListAccessPolicies`** - **Description**: Needed to list all Data Access Policies. - **Alias**: Policy List access - **Mandatory**: Yes - **`aoss:GetAccessPolicy`** - **Description**: Required to inspect each policy for read/write scope. - **Alias**: Policy read access - **Mandatory**: Yes - **`aoss:CreateIndex`** - **Description**: Required to create an index. - **Documentation**: [AOSS API Docs](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations) - **Alias**: Create Index - **Mandatory**: Yes - **`aoss:ReadDocument`** - **Description**: Required to read documents from an OpenSearch collection. - **Documentation**: [AOSS API Docs](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations) - **Alias**: Read Documents - **Mandatory**: Yes - **`aoss:WriteDocument`** - **Description**: Required to index or update documents in an OpenSearch collection. - **Documentation**: [AOSS API Docs](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations) - **Alias**: Write Documents - **Mandatory**: Yes `iam:SimulatePrincipalPolicy`, `aoss:APIAccessAll`, `aoss:ListAccessPolicies`, and `aoss:GetAccessPolicy` need to be added from your IAM console to the IAM identity used by Keep. The remaining permissions (`aoss:CreateIndex`, `aoss:ReadDocument`, and `aoss:WriteDocument`) are granted through data access policies, which need to be added from the AWS OpenSearch Serverless dashboard.
Go through the README for a step-by-step setup guide: [README](https://github.com/keephq/keep/blob/main/keep/providers/opensearchserverless_provider/README.md) ## Authentication Configuration To authenticate with OpenSearch Serverless, provide the following: - **AWS Access Key** (Mandatory): Your AWS access key. - **AWS Access Key Secret** (Mandatory): Your AWS access key secret. - **Region** (Mandatory): The AWS region hosting your OpenSearch collection. - **Domain Endpoint** (Mandatory): The full domain URL of your AOSS collection endpoint. ## Setting Up the Integration ### Steps: 1. **Assign IAM Permissions**: Grant your IAM user/role `aoss:CreateIndex`, `aoss:ReadDocument` and `aoss:WriteDocument` on the target collection. 2. **Configure Keep Provider**: Provide access key, secret, region, and collection endpoint in the Keep platform. ## Querying OpenSearch Keep supports standard OpenSearch queries using the `_search` endpoint: - **index**: The name of the OpenSearch index to query. - **query**: A valid OpenSearch query DSL object. ### Example ```json { "query": { "match_all": {} }, "size": 1 } ``` ## Writing to OpenSearch You can use the `_notify` functionality to push documents into OpenSearch collections. - **index**: The index name where the document should be written. - **document**: A Python dictionary representing the document body.
- **id**: ID for the document ## Useful Links - [AWS OpenSearch Serverless Documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html) - [AOSS Data Access Control](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html) - [README](https://github.com/keephq/keep/blob/main/keep/providers/opensearchserverless_provider/README.md) ================================================ FILE: docs/providers/documentation/openshift-provider.mdx ================================================ --- title: "Openshift" description: "Openshift provider to perform rollout restart action on specific resources." --- import AutoGeneratedSnippet from '/snippets/providers/openshift-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Openshift, follow below steps: 1. Log in to your Openshift cluster and create a new service account with required roles. 2. Get the token of the service account. 3. Use the token to authenticate with Openshift. ## Notes - This provider allows you to interact with Openshift to perform rollout restart actions. ================================================ FILE: docs/providers/documentation/opsgenie-provider.mdx ================================================ --- title: "Opsgenie Provider" description: "OpsGenie Provider is a provider that allows to create alerts in OpsGenie." --- import AutoGeneratedSnippet from '/snippets/providers/opsgenie-snippet-autogenerated.mdx'; ## Connecting with the Provider To use the Opsgenie Provider, you'll need to provide the API Key and Integration Name from API Integration. You can create an API integration under Settings -> Integrations -> Add integration and search for API Integration. Select API and provide a name for the integration and click on continue.
You can create an integration key under Settings -> Integrations -> Add integration If you are in the free tier, the integration key can be created under Teams -> Your team -> Integrations -> Add Integration (API) Visit the [Opsgenie API Integration](https://app.opsgenie.com/settings/integrations/create/api) for creating an API integration quickly. Visit the [Opsgenie API Integration](https://support.atlassian.com/opsgenie/docs/create-a-default-api-integration/) documentation for latest information. ## Useful Links - How to create Opsgenie API Integration - https://support.atlassian.com/opsgenie/docs/create-a-default-api-integration/ ================================================ FILE: docs/providers/documentation/pagerduty-provider.mdx ================================================ --- title: "Pagerduty Provider" description: "Pagerduty Provider allows integration with PagerDuty to create, manage, and synchronize incidents and alerts within Keep." --- import AutoGeneratedSnippet from '/snippets/providers/pagerduty-snippet-autogenerated.mdx'; ## Description The Pagerduty Provider enables integration with PagerDuty to create, manage, and synchronize incidents and alerts within Keep. It supports both direct API key authentication and OAuth2, allowing greater flexibility for secure integration. ## Connecting with the Provider To connect Keep to PagerDuty: - **Routing Key**: Use for event posting via the PagerDuty Events API. In the PagerDuty UI, this is displayed as the integration key. - **API Key**: Use for incident creation and management through the PagerDuty Incidents API. - **Service Id** (Optional): If provided, keep operates within the service's scope. - **OAuth2**: Token management handled automatically by Keep. You can find your routing key in the PagerDuty (integration key in PagerDuty UI) web app under **Services** > **Service Directory** > **Your service** > **Integrations** > **Expand Events API**, and select the integration you want to use. 
You can find your API key in the PagerDuty web app under **Configuration** > **API Access**. The routing_key is used to post events to PagerDuty using the events API. The api_key is used to create incidents using the incidents API. ### Enabling OAuth in the open-source version If you would like to use OAuth in the open-source version, where you self-host Keep, you can do so by following these steps: 1. Create a PagerDuty account 2. In the account page, go to **Integrations** > **App Registration** 3. Click on **New App** blue button on the top right 4. Fill in the required fields 5. Select "OAuth 2.0" in the Functionality section and click **Next** 6. In the Redirect URL, you need to add Keep's PagerDuty OAuth2 redirect URL, which is based on your deployment's URL. For example, if Keep is deployed at http://localhost:3000, the redirect URL is http://localhost:3000/providers/oauth2/pagerduty 7. In the Authorization section, select **Scoped OAuth** and select the following scopes: - Abilities: Read Access - Incidents: Read/Write Access - Services: Read/Write Access - Webhook Subscriptions: Read/Write Access 8. Click on **Register App** blue button on the bottom right 9. Copy the **Client ID** and **Client Secret** from the OAuth 2.0 Client Information modal and set the `PAGERDUTY_CLIENT_ID` and `PAGERDUTY_CLIENT_SECRET` environment variables in your Keep backend deployment. ## PagerDuty Webhook Integration By default, when Keep installs itself as a webhook integration, it subscribes to all incident events ("Account Scope"). If you wish to limit Keep to some specific services, you can do so by selecting the **Service** scope and selecting the services you want to subscribe to. Find this page under **Integrations** > **Generic Webhooks (v3)** ## Notes The provider uses either the events API or the incidents API to create an alert or an incident. The choice of API to use is determined by the presence of either a routing_key or an api_key. 
An expired trial while using the free version of PagerDuty may result in the "pagerduty scopes are invalid" error at Keep. ## Webhook Integration Modifications The webhook integration adds Keep as a destination within the "Integrations" API within Pagerduty. This grants Keep access to the following scopes within Pagerduty: - `webhook_subscriptions_read` - `webhook_subscriptions_write` ## Useful Links - Pagerduty Events API documentation: https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2 - Pagerduty Incidents API documentation: https://v2.developer.pagerduty.com/docs/create-an-incident-incidents-api-v2 ================================================ FILE: docs/providers/documentation/pagertree-provider.mdx ================================================ --- title: "Pagertree Provider" description: "The Pagertree Provider facilitates interactions with the Pagertree API, allowing the retrieval and management of alerts." --- import AutoGeneratedSnippet from '/snippets/providers/pagertree-snippet-autogenerated.mdx'; ## Connecting with the Provider - To interact with the Pagertree API, you need to provide an api_token. - You can view and manage your API keys on your [User Settings](https://app.pagertree.com/user/settings) page. ## Notes _This provider uses the Pagertree API to send alerts or mark them as incidents based on the parameters provided. 
Depending on whether an incident is flagged as true, it either calls `__send_alert` or `__send_incident` method._ ## Useful Links - Pagertree API documentation: [Pagertree API](https://pagertree.com/docs) - Pagertree Authentication: [Authentication](https://pagertree.com/docs/api/authentication) - Pagertree Alerts: [Alerts & Incident](https://pagertree.com/docs/api/alerts) ================================================ FILE: docs/providers/documentation/parseable-provider.mdx ================================================ --- title: "Parseable" sidebarTitle: "Parseable Provider" description: "Parseable provider allows integration with Parseable, a tool for collecting and querying logs." --- import AutoGeneratedSnippet from '/snippets/providers/parseable-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Obtain an API key from your Parseable instance. 2. Configure your provider using the `api_key` and `parseable_url`. ## Useful Links - [Parseable API Documentation](https://www.parseable.com/docs/api) ================================================ FILE: docs/providers/documentation/pingdom-provider.mdx ================================================ --- title: "Pingdom" sidebarTitle: "Pingdom Provider" description: "Pingdom provider allows you to pull alerts from Pingdom or install Keep as webhook." --- import AutoGeneratedSnippet from '/snippets/providers/pingdom-snippet-autogenerated.mdx'; ## Connecting with the Provider ### API Key To obtain the Pingdom API key, follow these steps: 1. Log in to your Pingdom account. 2. Navigate to the "Settings" section. 3. Click on the "Pingdom API" tab. 4. Generate a new API Key. ## Fingerprinting Fingerprints in Pingdom are calculated based on the `check_id` of the incoming/pulled event. 
## Notes _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Useful Links - [Pingdom Webhook Documentation](https://www.pingdom.com/resources/webhooks) - [Pingdom Actions API](https://docs.pingdom.com/api/#tag/Actions) ================================================ FILE: docs/providers/documentation/planner-provider.mdx ================================================ --- title: "Microsoft Planner Provider" description: "Microsoft Planner Provider to create task in planner." --- import AutoGeneratedSnippet from '/snippets/providers/planner-snippet-autogenerated.mdx'; ## Connecting with the Provider To connect to Microsoft Planner, follow below steps: 1. Log in to your [Azure](https://azure.microsoft.com/) account. 2. Register an application [here](https://portal.azure.com/#view/Microsoft_AAD_RegisteredApps/CreateApplicationBlade/isMSAApp~/false). 3. After successfully registering the application, go to the **API permissions** page and add the below permissions: - `Tasks.Read.All` - `Tasks.ReadWrite.All` 4. Go to **Overview** page and note the `Application (client) ID` and `Directory (tenant) ID`. 5. Go to **Certificates & secrets** page, create a new client secret and note the client secret value. 6. Add the client id, client secret and tenant id to the `authentication` section in the Microsoft Planner Provider configuration. ## Notes - This provider allows you to interact with Microsoft Planner Provider to create tasks. 
## Useful Links - [Microsoft Planner Provider Documentation](https://learn.microsoft.com/en-us/graph/api/planner-post-tasks?view=graph-rest-1.0&tabs=http) - [Create an Azure Active Directory app](https://learn.microsoft.com/en-us/graph/toolkit/get-started/add-aad-app-registration) ================================================ FILE: docs/providers/documentation/postgresql-provider.mdx ================================================ --- title: "PostgreSQL" sidebarTitle: "PostgreSQL Provider" description: "PostgreSQL Provider is a provider used to query POSTGRES databases" --- import AutoGeneratedSnippet from '/snippets/providers/postgres-snippet-autogenerated.mdx'; ## Connecting with the Provider In order to connect to the Postgres database, you will need to create a new user with the required permissions. Here's how you can do this: 1. Connect to the Postgresql server as a user with sufficient privileges to create a new user. 2. Run the following command to create a new user: `CREATE USER <username> WITH ENCRYPTED PASSWORD '<password>';` 3. Run the following command to create a database: `CREATE DATABASE <database>;` 4. Grant the necessary permissions to the new user by running the following command: `GRANT ALL PRIVILEGES ON DATABASE <database> TO <username>;` ## Notes ## Useful Links - [Postgresql Documentation](https://www.postgresql.org/docs/) - [Creating user,database and adding access on psql](https://medium.com/coding-blocks/creating-user-database-and-adding-access-on-postgresql-8bfcd2f4a91e) ================================================ FILE: docs/providers/documentation/posthog-provider.mdx ================================================ --- title: "PostHog" sidebarTitle: "PostHog Provider" description: "PostHog provider allows you to query session recordings and analytics data from PostHog." --- import AutoGeneratedSnippet from '/snippets/providers/posthog-snippet-autogenerated.mdx'; ## Connecting with the Provider ### API Key To obtain the PostHog API key, follow these steps: 1. 
Log in to your PostHog account. 2. Navigate to "Project Settings" > "API Keys". 3. Create a new API key or use an existing one. 4. Copy the API key value. ### Project ID To find your PostHog project ID: 1. Log in to your PostHog account. 2. The project ID is visible in your project settings or in the URL when you're viewing your project. ## Available Methods The PostHog provider offers the following methods: ### Get Session Recording Domains Retrieve a list of domains from session recordings within a specified time period. ```yaml - name: get-posthog-domains provider: config: "{{ providers.posthog }}" type: posthog with: query_type: session_recording_domains hours: 24 # Number of hours to look back limit: 500 # Maximum number of recordings to fetch ``` ### Get Session Recordings Retrieve session recordings data within a specified time period. ```yaml - name: get-posthog-recordings provider: config: "{{ providers.posthog }}" type: posthog with: query_type: session_recordings hours: 24 # Number of hours to look back limit: 100 # Maximum number of recordings to fetch ``` ## Example Workflow Here's an example workflow that tracks domains from PostHog session recordings over the last 24 hours and sends a summary to Slack: ```yaml workflow: id: posthog-domain-tracker name: PostHog Domain Tracker description: Tracks domains from PostHog session recordings over the last 24 hours and sends a summary to Slack. 
triggers: - type: manual - type: interval value: 86400 # Run daily (in seconds) steps: - name: get-posthog-domains provider: config: "{{ providers.posthog }}" type: posthog with: query_type: session_recording_domains hours: 24 limit: 500 actions: - name: send-to-slack provider: config: "{{ providers.slack }}" type: slack with: blocks: - type: header text: type: plain_text text: "PostHog Session Recording Domains (Last 24 Hours)" emoji: true - type: section text: type: mrkdwn text: "Found *{{ steps.get-posthog-domains.results.unique_domains_count }}* unique domains across *{{ steps.get-posthog-domains.results.total_domains_found }}* occurrences" - type: divider - type: section text: type: mrkdwn text: "*Domains:*" - type: section text: type: mrkdwn text: "{{#steps.get-posthog-domains.results.unique_domains}} • *{{ . }}* {{/steps.get-posthog-domains.results.unique_domains}}" - type: divider ``` ## Notes The PostHog provider requires the following scopes: - `session_recording:read` - Allows reading session recordings data - `project:read` - Allows reading project data - `session_recording_playlist:read` - Optional access to recording playlists ## Useful Links - [PostHog API Documentation](https://posthog.com/docs/api/overview) - [PostHog Session Recordings API](https://posthog.com/docs/api/session-recordings) - [PostHog Projects API](https://posthog.com/docs/api/projects) ================================================ FILE: docs/providers/documentation/prometheus-provider.mdx ================================================ --- title: "Prometheus" sidebarTitle: "Prometheus Provider" description: "Prometheus provider allows integration with Prometheus for monitoring and alerting purposes." --- import AutoGeneratedSnippet from '/snippets/providers/prometheus-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Set up a Prometheus server and make sure it's running. 2. Get the `prometheus_url` where your Prometheus instance is accessible. 3. 
(Optional) Obtain the API token from your Prometheus configuration if it's protected. 4. Provide these values in the provider configuration. ## Useful Links - [Prometheus Querying API Documentation](https://prometheus.io/docs/prometheus/latest/querying/api/) - [Prometheus Official Documentation](https://prometheus.io/docs/introduction/overview/) ================================================ FILE: docs/providers/documentation/pushover-provider.mdx ================================================ --- title: "Pushover" sidebarTitle: "Pushover Provider" description: "Pushover docs" --- import AutoGeneratedSnippet from '/snippets/providers/pushover-snippet-autogenerated.mdx'; Token: ![Token](/images/token.jpeg) User key: ![User key](/images/user-key.jpeg) ## Useful Links - https://support.pushover.net/i44-example-code-and-pushover-libraries#python ================================================ FILE: docs/providers/documentation/python-provider.mdx ================================================ --- title: "Python" sidebarTitle: "Python Provider" description: "Python provider allows executing Python code snippets." --- import AutoGeneratedSnippet from '/snippets/providers/python-snippet-autogenerated.mdx'; ## Limitations - The Python provider is currently disabled for cloud execution. This means that Python scripts cannot be executed in a cloud environment. - Users must ensure that the scripts are compatible with the local execution environment. ## Useful Links - [Python Documentation](https://docs.python.org/3/) ================================================ FILE: docs/providers/documentation/quickchart-provider.mdx ================================================ --- title: "QuickChart Provider" sidebarTitle: "QuickChart Provider" description: "The QuickChart provider enables the generation of chart images through a simple and open API, allowing visualization of alert trends and counts. 
It supports both anonymous usage and authenticated access with an API key for enhanced functionality." --- import AutoGeneratedSnippet from '/snippets/providers/quickchart-snippet-autogenerated.mdx'; # QuickChart Provider ## Overview The QuickChart provider allows for the generation of two types of charts based on alert data within Keep's platform: 1. A line chart that shows the trend of a specific fingerprint alert over time. 2. A radial gauge chart displaying the total number of alerts Keep received for this fingerprint. These charts can be used in various reports, dashboards, or alert summaries to provide visual insights into alert activity and trends. ## Connecting with the Provider ### Using QuickChart without an API Key The QuickChart provider can generate charts without the need for an API key. However, this usage is limited to basic functionality and lower request limits. ### Using QuickChart with an API Key To unlock more advanced features and higher usage limits, you can use a QuickChart API key. Here's how to obtain one: 1. Visit [QuickChart](https://quickchart.io/). 2. Sign up for a free account to get started. 3. Navigate to your account settings to find your API key. Once you have your API key, add it to the provider configuration in Keep. ## Notes This provider is designed to offer flexible chart generation capabilities within Keep, enhancing how you visualize alert data and trends. It is ideal for users who want to quickly integrate visual representations of alert activity into their workflows. 
## Useful Links - [QuickChart API Documentation](https://quickchart.io/documentation/) - [QuickChart Website](https://quickchart.io/) ================================================ FILE: docs/providers/documentation/redmine-provider.mdx ================================================ --- title: "Redmine" sidebarTitle: "Redmine Provider" --- import AutoGeneratedSnippet from '/snippets/providers/redmine-snippet-autogenerated.mdx'; # Redmine Provider `RedmineProvider` is a class that integrates with Redmine to manage issue tracking through Keep. ## Connecting with the Provider To connect with the Redmine provider and manage issues through Keep, follow these steps: 1. Obtain a Redmine Personal Access Token: Visit the [Redmine API documentation](https://www.redmine.org/projects/redmine/wiki/rest_api#Authentication) to see the steps to get an API key. 2. Use the following YAML example to create an issue using the Redmine provider, all these are [valid arguments](https://www.redmine.org/projects/redmine/wiki/Rest_Issues#Creating-an-issue): ```yaml title=examples/issue_creation_example.yml # Create an issue using the Redmine provider. task: id: create-redmine-issue description: Create an issue in Redmine actions: - name: create-issue provider: type: redmine config: "{{ providers.redmine-provider }}" with: project_id: "example_project" subject: "Issue Subject" priority_id: "2" description: "This is the issue description." 
``` ## Useful Links - [Redmine REST API](https://www.redmine.org/projects/redmine/wiki/rest_api) - [Authentication Guide](https://www.redmine.org/projects/redmine/wiki/rest_api#Authentication) - [Valid arguments while creating issue](https://www.redmine.org/projects/redmine/wiki/Rest_Issues#Creating-an-issue) ================================================ FILE: docs/providers/documentation/resend-provider.mdx ================================================ --- title: "Resend" sidebarTitle: "Resend Provider" --- import AutoGeneratedSnippet from '/snippets/providers/resend-snippet-autogenerated.mdx'; # Resend Provider ResendProvider is a class that implements the Resend API and allows email sending through Keep. ## Connecting with the Provider To connect with the Resend provider and send emails through Keep, follow these steps: 1. Obtain a Resend API key: Visit [Resend API Keys](https://resend.com/api-keys) to obtain an API key if you don't have one already. 2. Configure the Resend provider in your system with the obtained API key. 3. Use the following YAML example to send an email notification using the Resend provider: ```yaml title=examples/alert_example.yml # Send an email notification using the Resend provider. alert: id: email-notification description: Send an email notification using Resend actions: - name: send-email provider: type: resend config: "{{ providers.resend-provider }}" with: _from: "sender@example.com" to: "recipient@example.com" subject: "Hello from Resend Provider" html: "

<p>This is the email body.</p>

" ``` ## Useful Links - [Resend API Keys](https://resend.com/api-keys) ================================================ FILE: docs/providers/documentation/rollbar-provider.mdx ================================================ --- title: "Rollbar" sidebarTitle: "Rollbar Provider" description: "Rollbar provides real-time error tracking and debugging tools for developers." --- import AutoGeneratedSnippet from '/snippets/providers/rollbar-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Create an account on [Rollbar](https://rollbar.com/). 2. After logging in, navigate to the project you want to connect with and go to the project settings. 3. Under Setup, go to Project Access Tokens and create new token with read and write scopes. 4. Copy the generated token. 5. This will be used as the `rollbarAccessToken` parameter in the provider configuration. ## Webhook Integration Modifications You can manage the permissions granted by the webhook integration by navigating to **Settings > Notifications > Webhook** within the Rollbar project. ## Useful Links - [Rollbar](https://rollbar.com/) ================================================ FILE: docs/providers/documentation/s3-provider.mdx ================================================ --- title: "AWS S3" sidebarTitle: "AWS S3 Provider" description: "AWS S3 provider to query S3 buckets" --- import AutoGeneratedSnippet from '/snippets/providers/s3-snippet-autogenerated.mdx'; ## Limitations Querying only yaml, yml, json, xml and csv files. ## Scopes Please note that during the installation, the provider performs `list_buckets` to validate the config. 
Here is an example IAM policy: ``` { "Version": "2012-10-17", "Statement": [ { "Sid": "VisualEditor0", "Effect": "Allow", "Action": [ "s3:ListBucket", "s3:GetObject", "s3:GetBucketLocation", "s3:ListAllMyBuckets" ], "Resource": "*" } ] } ``` ================================================ FILE: docs/providers/documentation/sendgrid-provider.mdx ================================================ --- title: "SendGrid" sidebarTitle: "SendGrid Provider" --- import AutoGeneratedSnippet from '/snippets/providers/sendgrid-snippet-autogenerated.mdx'; # SendGrid Provider SendGridProvider is a class that implements the SendGrid API and allows email sending through Keep. ## Connecting with the Provider To connect with the SendGrid provider and send emails through Keep, follow these steps: 1. Obtain a SendGrid API key: Visit [SendGrid API Keys](https://www.twilio.com/docs/sendgrid/api-reference/api-keys/) to obtain an API key if you don't have one already. 2. Configure the SendGrid provider in your system with the obtained API key and the `from_email` address. 3. Use the following YAML example to send an email notification using the SendGrid provider: ## Useful Links - [SendGrid API Keys](https://sendgrid.com/docs/ui/account-and-settings/api-keys/) - [SendGrid API Reference](https://www.twilio.com/docs/sendgrid/api-reference) ================================================ FILE: docs/providers/documentation/sentry-provider.mdx ================================================ --- title: "Sentry" sidebarTitle: "Sentry Provider" description: "Sentry provider allows you to query Sentry events and to pull/push alerts from Sentry" --- import AutoGeneratedSnippet from "/snippets/providers/sentry-snippet-autogenerated.mdx"; ## Connecting with the Provider To connect to a self-hosted Sentry instance, you need to set the `api_url` parameter. Default value is `https://sentry.io/api/0/`. 
### API Key To obtain the Sentry API key, follow these steps ([Docs](https://docs.sentry.io/product/integrations/integration-platform/?original_referrer=https%3A%2F%2Fwww.google.com%2F#internal-integrations)): 1. Log in to your Sentry account. 2. Navigate `Settings` -> `Developer Settings` section. 3. Click on `Custom integrations`. 4. Click on `Create New Integration` on the top right side of the screen. 5. Select `Internal Integration` and click `Next` 6. Give the integration an indicative name, e.g. `Keep Integration` 7. From the permission section, select the required scopes: Project: Read & Write Issue & Event: Read Organization: Read Alerts: Read & Write (Not Mandatory) 8. Click `Save Changes` 9. Scroll down to the bottom of the screen to the `TOKENS` section and copy the generated token -- This is the API key you will be using in Keep. ### Organization Slug You can find the Organization Slug in your Sentry URL. For example, this is our playground account: `https://keep-dr.sentry.io/` - The organization slug is `keep-dr`. To obtain the Organization Slug from the settings page: 1. Log in to your Sentry account. 2. Navigate `Settings` -> `General Settings`. 3. Copy the Organization Slug from the Organization Slug input. ## Notes When installing Sentry webhook integration, Keep enables built-in Webhook integration to all accessible projects and adds a new Alert that has an `Action` to send a notification via Webhooks to all accessible projects. You can achieve alerts pushing from Sentry to Keep using an `Internal Integration` which is not automated via the platform. [Contact us](mailto:founder@keephq.dev) to set it up. 
## Useful Links - [Sentry Integration Platform](https://docs.sentry.io/product/integrations/integration-platform/) - [Sentry API Reference](https://docs.sentry.io/api/) ================================================ FILE: docs/providers/documentation/service-now-provider.mdx ================================================ --- title: "Service Now" sidebarTitle: "Service Now Provider" description: "Service Now provider allows sending notifications, updates, and retrieving topology information from the ServiceNow CMDB." --- import AutoGeneratedSnippet from '/snippets/providers/servicenow-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Ensure that the ServiceNow instance is accessible via API. 2. Provide the necessary API credentials (`instance_url` and `api_token`) in the provider configuration. ## Additional - Setting the `KEEP_SERVICENOW_PROVIDER_SKIP_SCOPE_VALIDATION` environment variable in the backend allows you to bypass scope validation. ## Useful Links - [Service Now API documentation](https://docs.servicenow.com/bundle/xanadu-api-reference/page/build/applications/concept/api-rest.html) ================================================ FILE: docs/providers/documentation/signalfx-provider.mdx ================================================ --- title: "SignalFX" sidebarTitle: "SignalFX Provider" description: "SignalFX provider allows you to get alerts from SignalFX Alerting via webhooks." --- import AutoGeneratedSnippet from '/snippets/providers/signalfx-snippet-autogenerated.mdx'; ## Overview SignalFX Provider enriches your monitoring and alerting capabilities by seamlessly integrating with SignalFX Alerting via webhooks. This integration allows you to receive alerts directly from SignalFX, ensuring you're promptly informed about significant events and metrics within your infrastructure. 
Key Features: - Webhook Auto-Instrumentation: Automatically configures Keep as a Webhook Integration within SignalFX, subscribing to all available SignalFX Detectors and Rules for comprehensive monitoring. - Manual and Automated Subscription Management: Provides flexibility in adding Keep as a subscriber to new Detectors either manually or by re-running the "setup webhook" feature from the UI for effortless maintenance. For further information or assistance, feel free to reach out on our Slack Community. ## Connecting with the Provider There are three approaches to connect with SignalFX: - Push (Manually) - Install Keep as a Webhook Integration. - Push (Auto Instrumentation) - Let Keep instrument itself as a webhook integration and subscribe to your SignalFx detectors. - Pull - Keep will pull alerts from SignalFx. The recommended way to install SignalFx is through Push (Auto Instrumentation). With this approach, you benefit from the advantages of the Push approach, which include more context (since SignalFx sends more context on Webhooks) and more real-time alerts, combined with the convenience of Pull integration (just supply credentials, and Keep will do the rest). In the following sections, we will elaborate on each approach. ### Push (Manually) For more information about how SignalFx integrates with Webhooks, you can read https://docs.splunk.com/observability/en/admin/notif-services/webhook.html#webhook2 1. From your SignalFx console, click on "Data Management": 2. Click on "+ Add Integration" 3. Change the "By Use Case" select to "All" and filter "webhook": 4. Click on the Webhook tile and fill the following details: 5. Now, go to Detectors & SLOs page: 6. For every Detector and Rule, add Keep as Alert recipient: ### Push (Auto Instrumentation) With this approach: 1. Keep installs itself as Webhook Integration. 2. 
Keep iterates over all Detectors and Rules, and adds itself as a subscriber. The downside of this approach is that you'll need email/password of a user with admin role. This is due to SignalFx limitation on installing integrations: You can read more here - https://dev.splunk.com/observability/reference/api/integrations/latest#endpoint-create-integration To install Keep with Push (auto instrumentation): 1. SF token with read permissions - go to Settings -> Access Tokens -> New Token 2. email/password for a user with admin role - this will be used only for creating the Webhook Integration 3. orgid - this will be used only for creating the Webhook Integration Once you have everything you need, go to Keep and install the SignalFx provider: ### Pull With this approach, Keep will pull alerts from SignalFx every time you refresh the console page. 1. SF token with read permissions - go to Settings -> Access Tokens -> New Token 2. In Keep's UI, install SignalFx Provider: ## Fingerprinting Fingerprints in SignalFx are calculated based on (incidentId, detectorId). ## Webhook Integration Modifications The automatic webhook integration gains access to the `API` authScope, which gives Keep the ability to read and write to the SignalFx API. ## Useful Links - [SignalFx Webhook](https://docs.splunk.com/observability/en/admin/notif-services/webhook.html#webhook2) ================================================ FILE: docs/providers/documentation/signl4-provider.mdx ================================================ --- title: "SIGNL4 Provider" description: "SIGNL4 offers critical alerting, incident response and service dispatching for operating critical infrastructure. It alerts you persistently via app push, SMS text and voice calls including tracking, escalation, collaboration and duty planning. 
Find out more at [signl4.com](https://www.signl4.com/)" --- import AutoGeneratedSnippet from '/snippets/providers/signl4-snippet-autogenerated.mdx'; ## Connecting with the Provider To use the Signl4Provider, you'll need to provide your signl4_integration_secret. You can find your integration or team secret in the SIGNL4 web portal under **Teams** or **Integrations** -> **Distribution Rules**. The signl4_integration_secret is used to post events to SIGNL4 using the webhook API. ## Notes The provider uses either the events API or the incidents API to create an alert or an incident. The choice of API to use is determined by the presence of either a routing_key or an api_key. ## Useful Links - SIGNL4: https://signl4.com/ - SIGNL4 knowledge base: https://support.signl4.com/ - SIGNL4 getting-started videos: https://www.youtube.com/watch?v=bwYSYOjMJZ8&list=PL9FRxukdQyk9QRZPOEH3jhRX9WQCovCc6 - SIGNL4 videos: https://vimeo.com/showcase/signl4 ================================================ FILE: docs/providers/documentation/site24x7-provider.mdx ================================================ --- title: "Site24x7 Provider" description: "The Site24x7 Provider allows you to install webhooks and receive alerts in Site24x7. It manages authentication, setup of webhooks, and retrieval of alert logs from Site24x7." --- import AutoGeneratedSnippet from '/snippets/providers/site24x7-snippet-autogenerated.mdx'; ### Main Class Methods - **`setup_webhook(tenant_id, keep_api_url, api_key, setup_alerts)`** - `tenant_id (str)`: Tenant identifier. - `keep_api_url (str)`: URL to send alert data. - `api_key (str)`: API key for authentication. - `setup_alerts (bool)`: Whether to setup alerting capabilities (default is True). - **`_get_alerts()`** - Returns a list of `AlertDto` objects representing the alerts. ## Connecting with the Provider To use the Site24x7 Provider, initialize it with the necessary authentication credentials and provider configuration. 
Ensure that your Zoho account credentials (Client ID, Client Secret, and Refresh Token) are correctly set up in the `Site24x7ProviderAuthConfig`. ## Steps to Obtain a Refresh Token 1. **Registration and Client Credentials:** - Navigate to [Zoho API Console](https://api-console.zoho.com/). - Sign in or sign up using the email associated with your Site24x7 account. - Register your application using the "Self Client" option to get your Client ID and Client Secret. 2. **Generating Grant Token:** - Go to the Zoho Developer Console and access your registered Self Client. - In the "Generate Code" tab, input the required scopes (`Site24x7.Admin.Read, Site24x7.Admin.Create, Site24x7.Operations.Read`), description, and time duration. - Click "Generate" and copy the provided code. 3. **Generating Access and Refresh Tokens:** - Use the grant token to make a POST request to `https://accounts.zoho.com/oauth/v2/token` to obtain the access and refresh tokens. ```bash curl -X POST 'https://accounts.zoho.com/oauth/v2/token' \ -d 'client_id=your_client_id' \ -d 'client_secret=your_client_secret' \ -d 'code=your_grant_token' \ -d 'grant_type=authorization_code' ``` OR ```python import requests response = requests.post( 'https://accounts.zoho.com/oauth/v2/token', data={ 'client_id': 'your_client_id', 'client_secret': 'your_client_secret', 'code': 'your_grant_token', 'grant_type': 'authorization_code' } ) refresh_token = response.json().get('refresh_token') ``` --- ## Notes - You must use your domain-specific Zoho Accounts URL to generate refresh tokens, otherwise you will receive an `invalid_client` error. See [Data center for Zoho Account](https://help.zoho.com/portal/en/kb/accounts/manage-your-zoho-account/articles/data-center-for-zoho-account). - Ensure that the necessary scopes **Site24x7.Admin.Read, Site24x7.Admin.Create, Site24x7.Operations.Read** are included when generating the grant token, as they dictate the API functionalities accessible via the provider. 
- Zoho API Console [Link](https://api-console.zoho.com) ## Webhook Integration Modifications The webhook integration grants Keep access to the following scopes within Site24x7: - `authenticated` - `valid_tld` The webhook can be accessed via the "Alarms" section in the Site24x7 console. --- ## Useful Links - [Site24x7 API Documentation](https://www.site24x7.com/help/api/) - [Zoho OAuth Documentation](https://www.zoho.com/accounts/protocol/oauth/web-apps.html) - [Site 24x7 Authentication Guide](https://www.site24x7.com/help/api/#authentication) - [Third Party and Webhook Integrations](https://www.site24x7.com/help/api/#third-party-integrations) - [List of Zoho Account datacenters](https://help.zoho.com/portal/en/kb/accounts/manage-your-zoho-account/articles/data-center-for-zoho-account) ================================================ FILE: docs/providers/documentation/slack-provider.mdx ================================================ --- title: "Keep's integration for Slack" sidebarTitle: "Integration for Slack" description: "Enhance your Keep workflows with direct Slack notifications. Simplify communication with timely updates and alerts directly within Slack." --- import AutoGeneratedSnippet from '/snippets/providers/slack-snippet-autogenerated.mdx'; ## Overview Keep's integration for Slack enables seamless communication by allowing you to send notifications to Slack. This integration is designed to streamline your processes, ensuring your team remains informed with real-time updates. ### Key Features - **Direct Notifications**: Utilize Keep to send messages directly to your Slack channels. - **Flexible Configuration**: Easily configure alerts based on specific triggers within your Keep workflows. - **Interactive Messages**: Enhance your Slack messages with interactive components like buttons and inputs. 
- **Editable Messages**: Update existing Slack messages dynamically based on changes in alert status or other workflow outcomes, ensuring that your notifications reflect the most current information. ## Getting Started ## Authentication Methods Keep's integration for Slack supports two primary authentication methods: - **Webhook URL**: For simple notifications, use the webhook URL associated with your Slack channel. - **OAuth 2.0**: For a more integrated experience, authorize Keep using Slack's OAuth 2.0 flow. This method is particularly useful for applications requiring access to more Slack features. ### Installation 1. **Add to Slack**: Begin by clicking the "Add to Slack" button on this page. You'll be guided through the OAuth authorization process to connect Keep with your Slack workspace. Add to Slack 2. **Installation Confirmation**: After adding Keep to Slack, you'll be redirected to a confirmation page. This page will confirm the successful installation and provide the next steps to fully leverage Slack notifications within your Keep workflows. ### OAuth Flow The OAuth flow simplifies the connection between Keep and Slack, providing a secure method to authenticate and authorize. 1. **Initiate OAuth**: Click the "Slack" Provider in the [Platform](https://platform.keephq.dev). ![OAuth Authorization](/images/slack/slack-oauth.png) 2. **Authorize Keep**: Follow the prompts to authorize Keep to access your Slack workspace. ### Setup 1. **Create a Slack App**: If you haven't already, create a Slack app in the [Slack API Dashboard](https://api.slack.com/apps). 2. **Enable Incoming Webhooks**: In your Slack app settings, enable Incoming Webhooks and create a webhook for the channel you wish to post messages to. 3. **Use Your Webhook URL**: Within Keep, use the webhook URL to send notifications to your chosen Slack channel. 
## Using Keep's integration for Slack With Keep's integration for Slack installed, you're ready to enhance your workflows with Slack notifications. Here's how to get started: 1. **Workflow Integration**: In Keep, select the workflow you wish to add Slack notifications to. Add a Slack notification block and configure it with your message or alert criteria. ![Workflow Configuration](/images/slack/slack-workflow.png) 2. **Send a Test Notification**: Ensure your setup is correct by sending a test notification through your configured workflow; use the "Run Manually" link for that. ## Useful Links - [Slack API Documentation](https://api.slack.com/messaging/webhooks) - [Keep Privacy Policy](https://www.keephq.dev/privacy-policy) - [Keep Pricing Information](https://www.keephq.dev/pricing) For support and further assistance, shoot us a message over [Slack](https://slack.keephq.dev) (pun intended ;)) ================================================ FILE: docs/providers/documentation/smtp-provider.mdx ================================================ --- title: 'SMTP' sidebarTitle: 'SMTP Provider' description: 'SMTP Provider allows you to send emails.' --- import AutoGeneratedSnippet from '/snippets/providers/smtp-snippet-autogenerated.mdx'; ## Overview SMTP Provider allows you to send emails from Keep. Most of the email services like Gmail, Yahoo, Mailgun, etc. provide SMTP servers to send emails. You can use these SMTP servers to send emails from Keep. The SMTP provider supports both plain text and HTML-formatted emails, allowing you to create rich, styled email notifications. ## Connecting with SMTP Provider 1. Obtain the SMTP credentials from your email service provider. Example: Gmail, Yahoo, Mailgun, etc. 2. Add SMTP Provider in Keep with the obtained credentials. 3. Connect the SMTP Provider with Keep.
## Email Format Support The SMTP provider supports two email formats: ### Plain Text Emails Use the `body` parameter to send plain text emails: ```yaml with: from_email: "sender@example.com" from_name: "Keep Alerts" to_email: "recipient@example.com" subject: "Alert Notification" body: "This is a plain text email notification." ``` ### HTML Emails Use the `html` parameter to send HTML-formatted emails: ```yaml with: from_email: "sender@example.com" from_name: "Keep Alerts" to_email: "recipient@example.com" subject: "Alert Notification" html: "

Alert

This is an HTML email notification.

" ``` When both `body` and `html` are provided, the HTML content takes precedence. ## Multiple Recipients You can send emails to multiple recipients by providing a list of email addresses: ```yaml with: to_email: - "recipient1@example.com" - "recipient2@example.com" - "recipient3@example.com" ``` ================================================ FILE: docs/providers/documentation/snowflake-provider.mdx ================================================ --- title: "Snowflake" sidebarTitle: "Snowflake Provider" description: "Template Provider is a template for newly added provider's documentation" --- import AutoGeneratedSnippet from '/snippets/providers/snowflake-snippet-autogenerated.mdx'; ================================================ FILE: docs/providers/documentation/splunk-provider.mdx ================================================ --- title: "Splunk" sidebarTitle: "Splunk Provider" description: "Splunk provider allows you to get Splunk `saved searches` via webhook installation" --- import AutoGeneratedSnippet from '/snippets/providers/splunk-snippet-autogenerated.mdx'; ## Connecting with the Provider Obtain Splunk API Token: 1. Ensure you have a Splunk account with the necessary [permissions](https://docs.splunk.com/Documentation/Splunk/9.2.0/Security/Rolesandcapabilities). The basic permissions required are `list_all_objects` & `edit_own_objects`. 2. Get an API token for authenticating API requests. [Read More](https://docs.splunk.com/Documentation/Splunk/9.2.0/Security/Setupauthenticationwithtokens) on how to set up and get API Keys. Identify Your Splunk Instance Details: 1. Determine the Host (IP address or hostname) and Port (default is 8089 for Splunk's management API) of the Splunk instance you wish to connect to. --- **NOTE** Make sure to follow this [Guide](https://docs.splunk.com/Documentation/Splunk/9.2.0/Alert/ConfigureWebhookAllowList) to configure your webhook allow list to allow your `keep` deployment. 
--- ## Useful Links - [Splunk Python SDK](https://dev.splunk.com/view/python-sdk/SP-CAAAEBB) - [Splunk Webhook](https://docs.splunk.com/Documentation/Splunk/9.2.0/Alert/Webhooks) - [Splunk Webhook Allow List](https://docs.splunk.com/Documentation/Splunk/9.2.0/Alert/ConfigureWebhookAllowList) - [Splunk Permissions and Roles](https://docs.splunk.com/Documentation/Splunk/9.2.0/Security/Rolesandcapabilities) - [Splunk API tokens](https://docs.splunk.com/Documentation/Splunk/9.2.0/Security/Setupauthenticationwithtokens) ================================================ FILE: docs/providers/documentation/squadcast-provider.mdx ================================================ --- title: "Squadcast Provider" sidebarTitle: "Squadcast Provider" description: "Squadcast provider is a provider used for creating issues in Squadcast" --- import AutoGeneratedSnippet from '/snippets/providers/squadcast-snippet-autogenerated.mdx'; ## Inputs The `notify` function takes the following parameters as inputs: - `notify_type` (required): Takes either `incident` or `notes`, depending on whether you want to create an incident or a note. 1. ##### parameters for `incident` - `message` (required): This will be the incident message. - `description` (required): This will be the incident description. - `tags` (optional): Tags for the incident. It should be in dict format. - `priority` (optional): Priority of the incident. - `status` (optional): Status of the event. - `event_id` (optional): event_id is used to resolve an incident - `additional_json` (optional): Additional JSON data to be sent with the incident. 2. ##### parameters for `notes` - `message` (required): The message of the note. - `incident_id` (required): Id of the incident where the Note has to be created. - `attachments` (optional): List of attachments for the notes. See [documentation](https://support.squadcast.com/integrations/incident-webhook-incident-webhook-api) for more details. ## Connecting with the Provider 1.
Go to [Refresh Tokens](https://support.squadcast.com/terraform-and-api-documentation/public-api-refresh-token#from-your-profile-page) to see how to create a `refresh_token`. 2. Visit the [Documentation](https://support.squadcast.com/integrations/incident-webhook-incident-webhook-api) to learn how to set up `incident_webhooks` & get the `webhook_url` ## Useful Links - [Squadcast Incident API](https://support.squadcast.com/integrations/incident-webhook-incident-webhook-api) - [Squadcast Refresh Tokens](https://support.squadcast.com/terraform-and-api-documentation/public-api-refresh-token#from-your-profile-page) - [Incident Notes](https://support.squadcast.com/incidents-page/incident-notes) ================================================ FILE: docs/providers/documentation/ssh-provider.mdx ================================================ --- title: "SSH" sidebarTitle: "SSH Provider" description: "The `SSH Provider` is a provider that provides a way to execute SSH commands and get their output." --- import AutoGeneratedSnippet from '/snippets/providers/ssh-snippet-autogenerated.mdx'; ## Connecting with the Provider The `SshProvider` class provides a way to execute SSH commands and get their output. The class uses the `paramiko` library to establish an SSH connection to a server and execute commands. ## Notes _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Useful Links - https://www.ssh.com/academy/ssh/keygen ================================================ FILE: docs/providers/documentation/statuscake-provider.mdx ================================================ --- title: "StatusCake" sidebarTitle: "StatusCake Provider" description: "StatusCake allows you to monitor your website and APIs. Keep allows you to read alerts and install a webhook in StatusCake" --- import AutoGeneratedSnippet from '/snippets/providers/statuscake-snippet-autogenerated.mdx'; ## Connecting with the Provider Obtain StatusCake API Key 1.
Create an account on [StatusCake](https://www.statuscake.com/). 2. After logging in, go to My Account under [Account Settings](https://app.statuscake.com/User.php) 3. Under Manage API Keys, generate a new API key or use the default key. 4. Copy the API Key. This will be used as the `Statuscake API Key` in the provider settings. ## Useful Links - [StatusCake](https://www.statuscake.com/) ================================================ FILE: docs/providers/documentation/sumologic-provider.mdx ================================================ --- title: "SumoLogic Provider" sidebarTitle: "SumoLogic Provider" description: "The SumoLogic provider enables webhook installations for receiving alerts in Keep" --- import AutoGeneratedSnippet from '/snippets/providers/sumologic-snippet-autogenerated.mdx'; ## Overview The SumoLogic provider facilitates receiving alerts from Monitors in SumoLogic using a Webhook Connection. ## Connecting with the Provider 1. Follow the instructions [here](https://help.sumologic.com/docs/manage/security/access-keys/) to get your Access Key & Access ID 2. Make sure the user has roles with the following capabilities: - `manageScheduledViews` - `manageConnections` - `manageUsersAndRoles` 3. Find your `deployment` from [here](https://api.sumologic.com/docs/#section/Getting-Started/API-Endpoints); Keep will automatically figure out your endpoint.
## Useful Links - [SumoLogic API Documentation](https://api.sumologic.com/docs/#section/Getting-Started) - [SumoLogic Access_Keys](https://help.sumologic.com/docs/manage/security/access-keys/) - [SumoLogic Roles Management](https://help.sumologic.com/docs/manage/users-roles/roles/create-manage-roles/) - [SumoLogic Deployments](https://api.sumologic.com/docs/#section/Getting-Started/API-Endpoints) ================================================ FILE: docs/providers/documentation/teams-provider.mdx ================================================ --- title: "Microsoft Teams Provider" sidebarTitle: "Microsoft Teams Provider" description: "Microsoft Teams Provider is a provider that allows to notify alerts to Microsoft Teams chats." --- import AutoGeneratedSnippet from '/snippets/providers/teams-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. In the New Teams client, select Teams and navigate to the channel where you want to add an Incoming Webhook. 2. Select More options ••• on the right side of the channel name. 3. Select Manage Channel For members who aren't admins of the channel, the Manage channel option is available under the Open channel details option in the upper-right corner of a channel. 4. Select Edit 5. Search for Incoming Webhook and select Add. 6. Select Add 7. Provide a name for the webhook and upload an image if necessary. 8. Select Create. 9. Copy and save the unique webhook URL present in the dialog. The URL maps to the channel and you can use it to send information to Teams. 10. Select Done. The webhook is now available in the Teams channel. 1. In the Classic Teams client, select Teams and navigate to the channel where you want to add an Incoming Webhook. 2. Select More options ••• from the upper-right corner. 3. Select Connectors from the dropdown menu. 4. Search for Incoming Webhook and select Add. 5. Select Add. 6. Provide a name for the webhook and upload an image if necessary. 7. Select Create. 8. 
Copy and save the unique webhook URL present in the dialog. The URL maps to the channel and you can use it to send information to Teams. 9. Select Done. ## Notes When using Adaptive Cards (`typeCard="message"`): - The `sections` parameter should follow the [Adaptive Cards schema](https://adaptivecards.io/explorer/) - `themeColor` is ignored for Adaptive Cards - If no sections are provided, the message will be displayed as a simple text block - Both `sections` and `attachments` can be provided as JSON strings or arrays - You can mention users in your Adaptive Cards using the `mentions` parameter ### Workflow Example You can also find this example in our [examples](https://github.com/keephq/keep/tree/main/examples/workflows/keep-teams-adaptive-cards.yaml) folder in the Keep GitHub repository. ```yaml id: 6bc7c72e-ab3d-4913-84dd-08b9323195ae description: Teams Adaptive Cards Example disabled: false triggers: - type: manual - filters: - key: source value: r".*" type: alert consts: {} name: Keep Teams Adaptive Cards owners: [] services: [] steps: [] actions: - name: teams-action provider: config: "{{ providers.teams }}" type: teams with: message: "" sections: '[{"type": "TextBlock", "text": "{{alert.name}}"}, {"type": "TextBlock", "text": "Tal from Keep"}]' typeCard: message # Optional: Add mentions to notify specific users # mentions: '[{"id": "user@example.com", "name": "User Name"}]' ``` You can also find an example with user mentions in our [examples](https://github.com/keephq/keep/tree/main/examples/workflows/keep-teams-adaptive-cards-with-mentions.yaml) folder. The sections parameter is a JSON string that follows the Adaptive Cards schema, but can also be an object. If it's a string, it will be parsed as a JSON string. ### Using Sections ```python provider.notify( message="Fallback text", typeCard="message", sections=[ { "type": "TextBlock", "text": "Hello from Adaptive Card!" 
}, { "type": "Image", "url": "https://example.com/image.jpg" } ] ) ``` ### Using Custom Attachments ```python provider.notify( typeCard="message", attachments=[{ "contentType": "application/vnd.microsoft.card.adaptive", "content": { "type": "AdaptiveCard", "version": "1.2", "body": [ { "type": "TextBlock", "text": "Custom Attachment Example" } ] } }] ) ``` ### Using User Mentions in Adaptive Cards You can mention users in your Adaptive Cards using the `mentions` parameter. The text in your card should include the mention in the format `User Name`, and you need to provide the user's ID and name in the `mentions` parameter. Teams supports three types of user IDs for mentions: - Teams User ID (format: `29:1234...`) - Microsoft Entra Object ID (format: `49c4641c-ab91-4248-aebb-6a7de286397b`) - User Principal Name (UPN) (format: `user@example.com`) ```python provider.notify( typeCard="message", sections=[ { "type": "TextBlock", "text": "Hello John Doe, please review this alert!" } ], mentions=[ { "id": "john.doe@example.com", # Can be UPN, Microsoft Entra Object ID, or Teams User ID "name": "John Doe" } ] ) ``` You can also mention multiple users in a single card: ```python provider.notify( typeCard="message", sections=[ { "type": "TextBlock", "text": "Hello John Doe and Jane Smith, please review this alert!" 
} ], mentions=[ { "id": "john.doe@example.com", "name": "John Doe" }, { "id": "49c4641c-ab91-4248-aebb-6a7de286397b", # Microsoft Entra Object ID "name": "Jane Smith" } ] ) ``` In YAML workflows, you can provide the mentions as a JSON string: ```yaml actions: - name: teams-action provider: config: "{{ providers.teams }}" type: teams with: typeCard: message sections: '[{"type": "TextBlock", "text": "Hello John Doe, please review this alert!"}]' mentions: '[{"id": "john.doe@example.com", "name": "John Doe"}]' ``` ## Useful Links - https://learn.microsoft.com/pt-br/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook - https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/connectors-using - https://adaptivecards.io/explorer/ - https://adaptivecards.io/schemas/adaptive-card.json ================================================ FILE: docs/providers/documentation/telegram-provider.mdx ================================================ --- title: "Telegram Provider" description: "Telegram Provider is a provider that allows you to send alert notifications to Telegram chats." --- import AutoGeneratedSnippet from '/snippets/providers/telegram-snippet-autogenerated.mdx'; Telegram only supports limited formatting options. Refer to the [Telegram Bot API documentation](https://core.telegram.org/bots/api#formatting-options) for more information. ## Authentication Parameters The TelegramProviderAuthConfig class takes the following parameters: - bot_token (str): The token of the bot. **Required** ## Connecting with the Provider To use the Telegram Provider you'll need a bot token. How to create a Telegram bot - https://core.telegram.org/bots#how-do-i-create-a-bot ## Useful Links - Telegram Bot docs - https://core.telegram.org/bots - Telegram how to get chat id - https://stackoverflow.com/questions/32423837/telegram-bot-how-to-get-a-group-chat-id ## Example See `examples/alerts/db_disk_space_telegram.yml` for a full working example.
================================================ FILE: docs/providers/documentation/template.mdx ================================================ --- title: "Template" description: "Template Provider is a template for newly added provider's documentation" --- {/* import AutoGeneratedSnippet from '/snippets/providers/template-snippet-autogenerated.mdx'; */} {/* */} ## Inputs _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Outputs _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Authentication Parameters _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Connecting with the Provider _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Notes _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ## Useful Links _No information yet, feel free to contribute it using the "Edit this page" link at the bottom of the page_ ================================================ FILE: docs/providers/documentation/thousandeyes-provider.mdx ================================================ --- title: 'ThousandEyes' sidebarTitle: 'ThousandEyes Provider' description: 'ThousandEyes allows you to receive alerts from ThousandEyes using API endpoints as well as webhooks' --- import AutoGeneratedSnippet from '/snippets/providers/thousandeyes-snippet-autogenerated.mdx'; ## Connecting ThousandEyes to Keep 1. Go to [ThousandEyes Dashboard](https://app.thousandeyes.com/dashboard) 2. Click on `Manage` in the left sidebar and select `Account Settings`. 3. Select `Users and Roles` in the Account Settings 4. Under `User API Tokens`, you can create an OAuth Bearer Token 5. Copy the generated token. This will be used as the `OAuth2 Bearer Token` in the provider settings. ## Webhooks Integration 1.
Open [ThousandEyes Dashboard](https://app.thousandeyes.com/dashboard) and click on `Network & App Synthetics` in the left sidebar and select `Agent Settings`. 2. Go to `Notifications` under `Enterprise Agents` and click on `Notifications`. 3. Go to `Notifications` and create a new webhook notification. 4. Give it a name and set the URL as [https://api.keephq.dev/alerts/event/thousandeyes?api_key=your-api-key](https://api.keephq.dev/alerts/event/thousandeyes?api_key=your-api-key) 5. Select `Auth Type` as None and `Add New Webhook`. 6. Go to the Keep dashboard, click on the profile icon in the bottom left corner, and click `Settings`. 7. Select the `Users and Access` tab, then select the `API Keys` tab and create a new API key. 8. Give it a name, select the role as `webhook`, and click on `Create API Key`. 9. Copy the API key and paste it in the webhook URL. ## Useful Links - [ThousandEyes](https://www.thousandeyes.com/) ================================================ FILE: docs/providers/documentation/trello-provider.mdx ================================================ --- title: "Trello" sidebarTitle: "Trello Provider" description: "Trello provider is a provider used to query data from Trello" --- import AutoGeneratedSnippet from '/snippets/providers/trello-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Go to https://trello.com/power-ups/admin to create a custom power-up. 2. Create a new power-up and add basic details like name, email address, etc. 3. Once it is created, navigate inside the power-up and go to the API Key section. 4. There, click on `Generate a new API key`; it will generate an API Key that will be used as `api_key`. 5. For generating `api_token`, there is an option to generate a Token manually; click on that and authorize the application.
## Notes ## Useful Links - https://developer.atlassian.com/cloud/trello/guides/power-ups/your-first-power-up/ - https://trello.com/power-ups/admin ================================================ FILE: docs/providers/documentation/twilio-provider.mdx ================================================ --- title: "Twilio Provider" description: "Twilio Provider is a provider that allows you to send alert notifications via SMS using Twilio." --- import AutoGeneratedSnippet from '/snippets/providers/twilio-snippet-autogenerated.mdx'; ## Connecting with the Provider To use the Twilio Provider you'll need an API token. How to create a Twilio API token - https://support.twilio.com/hc/en-us/articles/223136027-Auth-Tokens-and-How-to-Change-Them ## Useful Links - Twilio API token - https://support.twilio.com/hc/en-us/articles/223136027-Auth-Tokens-and-How-to-Change-Them - Twilio phone number - https://www.twilio.com/en-us/guidelines/regulatory ================================================ FILE: docs/providers/documentation/uptimekuma-provider.mdx ================================================ --- title: "UptimeKuma" sidebarTitle: "UptimeKuma Provider" description: "UptimeKuma allows you to monitor your website and APIs and send alerts to Keep" --- import AutoGeneratedSnippet from '/snippets/providers/uptimekuma-snippet-autogenerated.mdx'; ## Connecting with the Provider Obtain UptimeKuma Host URL, Username and Password 1. UptimeKuma can only be self-hosted. You need to have an instance of UptimeKuma running. 2. After setting up UptimeKuma, you can obtain the Host URL, Username and Password. 3. Use the obtained Host URL, Username and Password in the provider settings. ## Webhooks Integration 1. Connect to UptimeKuma provider with the required parameters. 2. Use the Keep Backend API URL as the Host URL in UptimeKuma. [https://api.keephq.dev](https://api.keephq.dev) (Default) 3. Navigate to Account Settings in Keep, proceed to API Keys, and generate an API Key for Webhook.
## Useful Links - [UptimeKuma](https://uptime.kuma.pet/) ================================================ FILE: docs/providers/documentation/victorialogs-provider.mdx ================================================ --- title: 'VictoriaLogs' sidebarTitle: 'VictoriaLogs Provider' description: 'VictoriaLogs provider allows you to query logs from VictoriaLogs.' --- import AutoGeneratedSnippet from '/snippets/providers/victorialogs-snippet-autogenerated.mdx'; ## Overview VictoriaLogs is an open source, user-friendly database for logs from VictoriaMetrics. It is optimized for high performance and low memory usage. It can handle high cardinality and a high volume of logs. Note: To add authentication, VMAuth should be configured. For more information, refer to the [VMauth documentation](https://docs.victoriametrics.com/vmauth/). ### NoAuth - No additional parameters are required, only the `Grafana Loki Host URL` is required. ### HTTP basic authentication - `HTTP basic authentication - Username`: The username to use for HTTP basic authentication. - `HTTP basic authentication - Password`: The password to use for HTTP basic authentication. ### Bearer - `Bearer Token` : The bearer token to use for authentication. - `X-Scope-OrgID Header`: The organization ID to use for VictoriaLogs Multi-tenancy support. (Optional) ## Querying VictoriaLogs The VictoriaLogs provider allows you to query logs from VictoriaLogs through the `query`, `hits`, `stats_query` and `stats_query_range` types. The following are the parameters available for querying: 1. `query` type: - `query`: This is the query to perform. - `limit`: The max number of matching entries to return. - `timeout`: The query timeout in seconds. - `AccountID`: The account ID to use for VictoriaLogs. - `ProjectID`: The project ID to use for VictoriaLogs. 2. `hits` type: - `query`: This is the query to perform. - `start`: The start time for the query. - `end`: The end time for the query. - `step`: The step for the query.
- `AccountID`: The account ID to use for VictoriaLogs. - `ProjectID`: The project ID to use for VictoriaLogs. 3. `stats_query` type: - `query`: This is the query to perform. - `time`: The evaluation time for the query. 4. `stats_query_range` type: - `query`: This is the query to perform. - `start`: The start time for the query. - `end`: The end time for the query. - `step`: The step for the query. ## Useful Links - [VictoriaLogs](https://docs.victoriametrics.com/victorialogs/) - [VMauth documentation](https://docs.victoriametrics.com/vmauth/) ================================================ FILE: docs/providers/documentation/victoriametrics-provider.mdx ================================================ --- title: "Victoriametrics Provider" sidebarTitle: "Victoriametrics Provider" description: "The VictoriametricsProvider allows you to fetch alerts in Victoriametrics." --- import AutoGeneratedSnippet from '/snippets/providers/victoriametrics-snippet-autogenerated.mdx'; ## Connecting with the Provider 1. Ensure you have a running instance of VMAlert accessible by the host and port specified. 2. Include the host and port information in your Victoriametrics provider configuration when initializing the provider. ## Querying Victoriametrics The Victoriametrics provider allows you to query from Victoriametrics through `query` and `query_range` types. The following are the parameters available for querying: 1. `query` type: - `query`: The query to execute on Victoriametrics. Example: `sum(rate(http_requests_total{job="api-server"}[5m]))`. - `start`: The time to query the data for. Example: `2024-01-01T00:00:00Z` 2. `query_range` type: - `query`: The query to execute on Victoriametrics. Example: `sum(rate(http_requests_total{job="api-server"}[5m]))`. - `start`: The start time to query the data for. Example: `2024-01-01T00:00:00Z` - `end`: The end time to query the data for. Example: `2024-01-01T00:00:00Z` - `step`: The step size to use for the query. 
Example: `15s` ## Useful Links - [Victoriametrics](https://victoriametrics.com/docs/) - [VMAlert](https://victoriametrics.github.io/vmalert.html) ================================================ FILE: docs/providers/documentation/vllm-provider.mdx ================================================ --- title: "vLLM Provider" description: "The vLLM Provider enables integration with vLLM-deployed language models into Keep." --- import AutoGeneratedSnippet from '/snippets/providers/vllm-snippet-autogenerated.mdx'; The vLLM Provider supports querying language models deployed with vLLM for prompt-based interactions. ## Connecting with the Provider To connect to a vLLM deployment: 1. Deploy your vLLM instance or obtain the API endpoint of an existing deployment 2. Configure the API URL in your provider configuration 3. If your deployment requires authentication, configure the API key ================================================ FILE: docs/providers/documentation/wazuh-provider.mdx ================================================ --- title: 'Wazuh' sidebarTitle: 'Wazuh Provider' description: 'Wazuh provider allows you to get alerts from Wazuh via custom integration.' --- import AutoGeneratedSnippet from '/snippets/providers/wazuh-snippet-autogenerated.mdx'; ## Overview The Wazuh provider enables seamless integration between Keep and Wazuh. It allows you to get alerts from Wazuh to Keep via custom integration, making it easier to track security-related activities in one place. Please refer to the [Wazuh Docs](https://documentation.wazuh.com/current/user-manual/manager/integration-with-external-apis.html#custom-integration) if you want to learn more about Wazuh Custom Integrations. ## Connecting Wazuh to Keep To connect Wazuh to Keep, you need to configure it as a custom integration in Wazuh. Follow the steps below to set up the integration: 1. Keep webhook scripts need to be installed on the Wazuh server. 2.
You can download the Keep webhook scripts using the following commands: ```bash wget -O custom-keep.py https://github.com/keephq/keep/blob/main/keep/providers/wazuh_provider/custom-keep.py?raw=true wget -O custom-keep https://github.com/keephq/keep/blob/main/keep/providers/wazuh_provider/custom-keep?raw=true ``` 3. Copy the downloaded scripts to the following path on the Wazuh server: `/var/ossec/integrations/` and set the correct permissions ```bash cp custom-keep.py /var/ossec/integrations/custom-keep.py cp custom-keep /var/ossec/integrations/custom-keep chown root:wazuh custom-keep* chmod 750 /var/ossec/integrations/custom-keep* ``` 4. Get the Webhook URL of Keep which is `https://api.keephq.dev/alerts/event/wazuh`. 5. Get the API Key of Keep which you can generate in the [Keep settings](https://platform.keephq.dev/settings?selectedTab=users&userSubTab=api-keys). 6. In the config file `/var/ossec/etc/ossec.conf`, add a new integration block ```xml <integration> <name>custom-keep</name> <level>10</level> <hook_url>PLACE_YOUR_KEEP_WEBHOOK_URL_HERE</hook_url> <api_key>PLACE_HERE_YOUR_API_KEY</api_key> <alert_format>json</alert_format> </integration> ``` Please refer to the [Wazuh Documentation](https://documentation.wazuh.com/current/user-manual/manager/integration-with-external-apis.html#custom-integration) for more information and set the `level` you are interested in. 7.
Restart the `wazuh-manager` ```bash $ systemctl restart wazuh-manager ``` ## Useful Links - [Wazuh](https://documentation.wazuh.com/) ================================================ FILE: docs/providers/documentation/webhook-provider.mdx ================================================ --- title: 'Webhook' sidebarTitle: 'Webhook Provider' description: 'A webhook is a method used to send real-time data from one application to another whenever a specific event occurs' --- import AutoGeneratedSnippet from '/snippets/providers/webhook-snippet-autogenerated.mdx'; ================================================ FILE: docs/providers/documentation/websocket-provider.mdx ================================================ --- title: "Websocket" --- import AutoGeneratedSnippet from '/snippets/providers/websocket-snippet-autogenerated.mdx'; ## Outputs The `query` function of `WebsocketProvider` outputs the following format: ```json { "connection": true, "data": "Received data from the websocket" } ``` The `connection` field indicates whether the websocket connection was successful (`true`) or not (`false`). The `data` field contains the received data from the websocket. If the `connection` field indicates unsuccessful connection (`false`) then the object will also include an `error` field with details about the failed connection. ## Authentication Parameters The Websocket provider does not require any specific authentication parameters. ## Connecting with the Provider To connect with the Websocket provider and perform queries, follow these steps: Initialize the provider and provider configuration in your system. Use the query function of the WebsocketProvider to interact with the websocket. See [documentation](https://websocket-client.readthedocs.io/en/latest/api.html#websocket.WebSocket.send) for more information. 
================================================ FILE: docs/providers/documentation/youtrack-provider.mdx ================================================ --- title: 'YouTrack' sidebarTitle: 'YouTrack Provider' description: 'YouTrack provider allows you to create new issues in YouTrack.' --- import AutoGeneratedSnippet from '/snippets/providers/youtrack-snippet-autogenerated.mdx'; ## Overview YouTrack is a project management tool packed with features that streamline your work and increase productivity on any team project. From software development and DevOps to HR and marketing, all kinds of teams can use YouTrack's functionality to easily track and collaborate on projects of any size. ### How to get Project ID and Permanent Token? 1. **Project ID**: The project ID can be found in the URL of the project. For example, in the URL `https://<your-youtrack-host>/projects/<project-id>`, the project ID is `<project-id>`. 2. **Permanent Token**: Check out the [YouTrack - Generate Permanent Token](https://www.jetbrains.com/help/youtrack/server/manage-permanent-token.html) documentation to generate a permanent token.
## Useful Links - [YouTrack](https://www.jetbrains.com/youtrack/) - [YouTrack - Generate Permanent Token](https://www.jetbrains.com/help/youtrack/server/manage-permanent-token.html) ================================================ FILE: docs/providers/documentation/zabbix-provider.mdx ================================================ --- title: "Zabbix" sidebarTitle: "Zabbix Provider" description: "Zabbix provider allows you to pull/push alerts from Zabbix" --- import AutoGeneratedSnippet from '/snippets/providers/zabbix-snippet-autogenerated.mdx'; Please note that we currently only support Zabbix version 6 and above (6.0^) ## Connecting with the Provider ### API Key To obtain a Zabbix authentication token, follow the steps below, divided into 3 categories ([Docs](https://www.zabbix.com/documentation/current/en/manual/web_interface/frontend_sections/users/api_tokens)): First, log in to your Zabbix account (the provided `zabbix_frontend_url`) with a privileged user. #### Create a User Role 1. Navigate to `Users` -> `User Roles` section. 2. In the top right corner of the screen, click `Create user role` 3. Give the role an indicative name (e.g. Keep Role) 4. In the `User type` selectbox, select `Super Admin` - This is because some of the scopes we need are available to `Super Admin` user type only. [See here](https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/create) 5. Remove all the checkboxes from everything, except 1 random `Access to UI elements` which is required for any role. 6. In the `API methods` section, select `Allow list` and fill with these scopes: - `action.create` - `action.get` - `event.acknowledge` - `mediatype.create` - `mediatype.get` - `mediatype.update` - `problem.get` - `script.create` - `script.get` - `script.update` - `user.get` - `user.update` #### Create a user 1. Navigate to `Users` -> `Users` section. 2. Follow the instructions to add a new user. Give it an indicative username (e.g. KeepUser) 3.
In the `Permissions` tab, select the Role you have just created. 4. Click `Add` #### Create API token 1. Navigate to `Users` -> `API tokens` section. 2. In the top right corner of the screen, click `Create API token` 3. Give the API token an indicative name (e.g. Keep Token) 4. Select the user you have just created 5. Unselect the `Set expiration date and time` checkbox and click `Add` 6. Copy the generated API token and keep it for further use in Keep. ## Notes When installing Zabbix webhook, Keep automatically adds a new media type of type Keep to your media types. After the new media type is added, Keep automatically adds this mediatype as a media to all existing users, in order to get all alerts incoming from Zabbix. ## Webhook Integration Modifications The automatic webhook integration grants Keep access to the following scopes within the Zabbix instance: - `mediatype.get` - `mediatype.update` - `mediatype.create` - `user.get` - `user.update` You can view the webhook settings under **Alerts > Media Types** ## Useful Links - [Zabbix API](https://www.zabbix.com/documentation/current/en/manual/api) ================================================ FILE: docs/providers/documentation/zenduty-provider.mdx ================================================ --- title: "Zenduty" sidebarTitle: "Zenduty Provider" description: "Zenduty docs" --- import AutoGeneratedSnippet from '/snippets/providers/zenduty-snippet-autogenerated.mdx'; ![User key](/images/zenduty.jpeg) ## Authentication configuration example: ``` zenduty: authentication: api_key: XXXXXXXXXXXXXXXX ``` ## Useful Links - https://docs.zenduty.com/docs/api ================================================ FILE: docs/providers/documentation/zoom-provider.mdx ================================================ --- title: "Zoom" sidebarTitle: "Zoom Provider" description: "Zoom provider allows you to create meetings with Zoom." 
--- import AutoGeneratedSnippet from '/snippets/providers/zoom-snippet-autogenerated.mdx'; For this integration, you'll need to create a Zoom Application - for more details read https://developers.zoom.us/docs/internal-apps The `record_meeting` parameter won't work with Zoom's basic plan. With the basic plan, you'll be able to connect to the meeting and enable the "recording" manually. ## Connecting with the Provider ### Create an Application Keep the credentials: ### Grant Scopes ### Activate the app ### (Optional) Make sure cloud recording is set on your account ================================================ FILE: docs/providers/documentation/zoom_chat-provider.mdx ================================================ --- title: "Zoom Chat" sidebarTitle: "Zoom Chat Provider" description: "Zoom Chat provider allows you to send Zoom Chats using the Incoming Webhook Zoom application." --- import AutoGeneratedSnippet from '/snippets/providers/zoom_chat-snippet-autogenerated.mdx'; For this integration, you will need to add and configure the Incoming Webhook application from the Zoom App Marketplace: https://marketplace.zoom.us/apps/eH_dLuquRd-VYcOsNGy-hQ ## Connecting with the Provider ### Enable the Incoming Webhook Application The Incoming Webhook application is available in the Zoom App Marketplace. ### Create Team Chat Channel: This channel will be the recipient of the Keep notifications. ### Enable the Incoming Webhook Application Send `/inc connect <connectionName>` to the channel to enable a webhook with authorization code. The app will respond with the webhook url and authorization code. You should use the "Full Format" Incoming Webhook Url, which ends in `?format=full`. ## (Optional) Enabling User JID Lookup Messages can optionally include Zoom user JIDs, which are used to tag a particular Zoom user in a message. This is useful, for example, if a team subscribes to a chat channel but members only wish to be notified when they are explicitly tagged.
### Create a Zoom Application User lookup requires authorization. Create an internal only, Zoom Server to Server OAuth application. ### Assign Required Scopes ================================================ FILE: docs/providers/linked-providers.mdx ================================================ --- title: "Linked providers" description: "Understanding linked vs connected providers in Keep" --- # Linked providers In Keep, providers can be either "connected" or "linked." Understanding the difference is important for proper alert routing and management. ## Connected vs linked providers - **Connected Providers**: These are providers that have been explicitly configured in Keep through the UI or API. They have full provider configuration and authentication details. - **Linked Providers**: These are providers that send alerts to Keep without being explicitly connected. They appear automatically when Keep receives alerts from them through webhooks or push mechanisms. ## How linking works When Keep receives alerts from an unconnected provider (like Prometheus pushing alerts), it automatically creates a "linked" provider entry. This allows you to: - Track which systems are sending alerts - See when Keep last received an alert - Apply deduplication rules specific to that provider ## Attaching alerts to connected providers If you have a connected provider and want to associate incoming alerts with it instead of creating a linked provider, add the `provider_id` query parameter to the webhook URL. 
For example, with Prometheus AlertManager: ```yaml alertmanager: config: receivers: - name: "keep" webhook_configs: - url: "https://api.keephq.dev/alerts/event/prometheus?provider_id=your_provider_id" ``` Or with other webhook-based integrations: ```bash # Grafana webhook https://api.keephq.dev/alerts/event/grafana?provider_id=grafana-prod # Datadog webhook https://api.keephq.dev/alerts/event/datadog?provider_id=datadog-main # Generic webhook https://api.keephq.dev/alerts/event/webhook?provider_id=custom-webhook ``` ## Best practices 1. **For Production Systems**: It's recommended to use connected providers when possible, as they provide: - Better authentication and security - Access to provider-specific features - Clearer audit trail 2. **For Testing/Development**: Linked providers can be useful for: - Quick prototyping - Testing alert flows - Temporary integrations 3. **Converting Linked to Connected**: If you regularly receive alerts from a linked provider, consider: - Setting up a proper provider connection - Using the `provider_id` parameter to attach alerts to the connected provider ## Limitations Linked providers: - Can't be used to pull alerts or data - Don't have authentication details - Can't be used for provider-specific actions - May have limited deduplication capabilities For full capabilities, consider converting linked providers to connected providers when they become part of your permanent alerting infrastructure. ================================================ FILE: docs/providers/overview.md ================================================ # Providers Overview Providers are core components of Keep that allow Keep to either query data, send notifications, get alerts from or manage third-party tools. These third-party tools include, among others, Datadog, Cloudwatch, and Sentry for data querying and/or alert management, and Slack, Resend, Twilio, and PagerDuty for notifications/incidents.
By leveraging Keep Providers, users are able to deeply integrate Keep with the tools they use and trust, providing them with a flexible and powerful way to manage these tools with ease and from a single pane. ## Available Providers - [Airflow](/providers/documentation/airflow-provider) - [Azure AKS](/providers/documentation/aks-provider) - [AmazonSQS](/providers/documentation/amazonsqs-provider) - [Anthropic](/providers/documentation/anthropic-provider) - [AppDynamics](/providers/documentation/appdynamics-provider) - [ArgoCD](/providers/documentation/argocd-provider) - [Flux CD](/providers/documentation/fluxcd-provider) - [Asana](/providers/documentation/asana-provider) - [Auth0](/providers/documentation/auth0-provider) - [Axiom](/providers/documentation/axiom-provider) - [Azure Monitor](/providers/documentation/azuremonitoring-provider) - [Bash](/providers/documentation/bash-provider) - [BigQuery](/providers/documentation/bigquery-provider) - [Centreon](/providers/documentation/centreon-provider) - [Checkmk](/providers/documentation/checkmk-provider) - [Checkly](/providers/documentation/checkly-provider) - [Cilium](/providers/documentation/cilium-provider) - [ClickHouse](/providers/documentation/clickhouse-provider) - [CloudWatch](/providers/documentation/cloudwatch-provider) - [Console](/providers/documentation/console-provider) - [Coralogix](/providers/documentation/coralogix-provider) - [Dash0](/providers/documentation/dash0-provider) - [Datadog](/providers/documentation/datadog-provider) - [Databend](/providers/documentation/databend-provider) - [DeepSeek](/providers/documentation/deepseek-provider) - [Discord](/providers/documentation/discord-provider) - [Dynatrace](/providers/documentation/dynatrace-provider) - [EKS](/providers/documentation/eks-provider) - [Elastic](/providers/documentation/elastic-provider) - [Flashduty](/providers/documentation/flashduty-provider) - [GCP Monitoring](/providers/documentation/gcpmonitoring-provider) - 
[Gemini](/providers/documentation/gemini-provider) - [GitHub](/providers/documentation/github-provider) - [Github Workflows](/providers/documentation/github_workflows_provider) - [GitLab](/providers/documentation/gitlab-provider) - [GitLab Pipelines](/providers/documentation/gitlabpipelines-provider) - [Google Kubernetes Engine](/providers/documentation/gke-provider) - [Google Chat](/providers/documentation/google_chat-provider) - [Grafana](/providers/documentation/grafana-provider) - [Grafana Incident](/providers/documentation/grafana_incident-provider) - [Grafana Loki](/providers/documentation/grafana_loki-provider) - [Grafana OnCall](/providers/documentation/grafana_oncall-provider) - [Graylog](/providers/documentation/graylog-provider) - [Grok](/providers/documentation/grok-provider) - [HTTP](/providers/documentation/http-provider) - [Icinga2](/providers/documentation/icinga2-provider) - [ilert](/providers/documentation/ilert-provider) - [Incident.io](/providers/documentation/incidentio-provider) - [Incident Manager](/providers/documentation/incidentmanager-provider) - [Jira On-Prem](/providers/documentation/jira-on-prem-provider) - [Jira Cloud](/providers/documentation/jira-provider) - [Kafka](/providers/documentation/kafka-provider) - [Keep](/providers/documentation/keep-provider) - [Kibana](/providers/documentation/kibana-provider) - [Kubernetes](/providers/documentation/kubernetes-provider) - [LibreNMS](/providers/documentation/libre_nms-provider) - [Linear](/providers/documentation/linear_provider) - [LinearB](/providers/documentation/linearb-provider) - [LiteLLM](/providers/documentation/litellm-provider) - [Llama.cpp](/providers/documentation/llamacpp-provider) - [Mailgun](/providers/documentation/mailgun-provider) - [Mattermost](/providers/documentation/mattermost-provider) - [Microsoft Planner](/providers/documentation/planner-provider) - [Monday](/providers/documentation/monday-provider) - [MongoDB](/providers/documentation/mongodb-provider) - 
[MySQL](/providers/documentation/mysql-provider) - [NetBox](/providers/documentation/netbox-provider) - [Netdata](/providers/documentation/netdata-provider) - [New Relic](/providers/documentation/new-relic-provider) - [Ntfy.sh](/providers/documentation/ntfy-provider) - [Ollama](/providers/documentation/ollama-provider) - [OpenAI](/providers/documentation/openai-provider) - [OpenObserve](/providers/documentation/openobserve-provider) - [OpenSearch Serverless](/providers/documentation/opensearchserverless-provider) - [Openshift](/providers/documentation/openshift-provider) - [Opsgenie](/providers/documentation/opsgenie-provider) - [Pagerduty](/providers/documentation/pagerduty-provider) - [Pagertree](/providers/documentation/pagertree-provider) - [Parseable](/providers/documentation/parseable-provider) - [Pingdom](/providers/documentation/pingdom-provider) - [PostgreSQL](/providers/documentation/postgresql-provider) - [PostHog](/providers/documentation/posthog-provider) - [Prometheus](/providers/documentation/prometheus-provider) - [Pushover](/providers/documentation/pushover-provider) - [Python](/providers/documentation/python-provider) - [QuickChart](/providers/documentation/quickchart-provider) - [Redmine](/providers/documentation/redmine-provider) - [Resend](/providers/documentation/resend-provider) - [Rollbar](/providers/documentation/rollbar-provider) - [AWS S3](/providers/documentation/s3-provider) - [SendGrid](/providers/documentation/sendgrid-provider) - [Sentry](/providers/documentation/sentry-provider) - [Service Now](/providers/documentation/service-now-provider) - [SignalFX](/providers/documentation/signalfx-provider) - [SIGNL4](/providers/documentation/signl4-provider) - [Site24x7](/providers/documentation/site24x7-provider) - [Slack](/providers/documentation/slack-provider) - [SMTP](/providers/documentation/smtp-provider) - [Snowflake](/providers/documentation/snowflake-provider) - [Splunk](/providers/documentation/splunk-provider) - 
[Squadcast](/providers/documentation/squadcast-provider) - [SSH](/providers/documentation/ssh-provider) - [StatusCake](/providers/documentation/statuscake-provider) - [SumoLogic](/providers/documentation/sumologic-provider) - [Microsoft Teams](/providers/documentation/teams-provider) - [Telegram](/providers/documentation/telegram-provider) - [Template](/providers/documentation/template) - [ThousandEyes](/providers/documentation/thousandeyes-provider) - [Trello](/providers/documentation/trello-provider) - [Twilio](/providers/documentation/twilio-provider) - [UptimeKuma](/providers/documentation/uptimekuma-provider) - [VictoriaLogs](/providers/documentation/victorialogs-provider) - [Victoriametrics](/providers/documentation/victoriametrics-provider) - [vLLM](/providers/documentation/vllm-provider) - [Wazuh](/providers/documentation/wazuh-provider) - [Webhook](/providers/documentation/webhook-provider) - [Websocket](/providers/documentation/websocket-provider) - [YouTrack](/providers/documentation/youtrack-provider) - [Zabbix](/providers/documentation/zabbix-provider) - [Zenduty](/providers/documentation/zenduty-provider) - [Zoom](/providers/documentation/zoom-provider) - [Zoom Chat](/providers/documentation/zoom_chat-provider) ================================================ FILE: docs/providers/overview.mdx ================================================ --- title: "Overview" sidebarTitle: "Overview" description: "A Provider is a component of Keep that enables it to interact with third-party products. It is implemented as extensible Python code, making it easy to enhance and customize." --- Providers are core components of Keep that allow Keep to either query data, send notifications, get alerts from or manage third-party tools. These third-party tools include, among others, Datadog, Cloudwatch, and Sentry for data querying and/or alert management, and Slack, Resend, Twilio, and PagerDuty for notifications/incidents.
By leveraging Keep Providers, users are able to deeply integrate Keep with the tools they use and trust, providing them with a flexible and powerful way to manage these tools with ease and from a single pane. } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > } > ================================================ FILE: docs/providers/provider-methods.mdx ================================================ --- title: "Provider methods" sidebarTitle: "Provider Methods" --- Provider methods are additional capabilities that providers expose beyond the basic `query` and `notify` capabilities ([read more here](/providers/adding-a-new-provider#basics)). These methods allow you to interact with the provider's API in more specific ways, enabling richer integrations and automation capabilities. ## What are provider methods? Developers define provider methods using the `PROVIDER_METHODS` list in each provider class. They represent specific actions or queries that you can perform through the provider's API. These methods extend the basic capabilities of providers beyond simple notifications and queries. 
For example, a monitoring service provider might expose methods to: - Mute/unmute alerts - Get detailed traces - Search for specific metrics - Modify monitoring configurations ## Using provider methods You can access provider methods through: - Keep's platform interface via the alert action menu - Keep's smart AI assistant (for example, "get traces for this alert") - Keep's API - Keep's workflows ### Via UI Methods appear in the alert action menu when available for the alert's source provider: The form is automatically populated with the parameters required by the method, if they're available in the alert. ### Via AI assistant Keep's AI assistant can automatically discover and invoke provider methods based on natural language requests by understanding multiple contexts: 1. **Alert Context**: The AI understands: - The alert's source provider - Alert metadata and attributes - Related services and applications - Current alert status and severity 2. **Provider Context**: The AI knows: - Which providers you have connected to your account - Available methods for each provider - Required parameters and their types - Method descriptions and capabilities 3. **Historical Context**: The AI learns from: - Similar past incidents - Previously successful method invocations - Common patterns in alert resolution For example: ```text User: Can you get the traces for this alert? Assistant: I see this alert came from Datadog. I'll use the Datadog provider's get_traces method to fetch the traces. I'll use the trace_id from the alert's metadata: abc-123... User: This alert seems related to high latency. Can you help investigate? Assistant: I'll help investigate the latency issue. Since this is a Datadog alert, I can: 1. Get recent traces using search_traces() to look for slow requests 2. Fetch metrics using get_metrics() to check system performance 3. Look for related logs using search_logs() Would you like me to start with any of these? ``` The AI assistant automatically: 1. 
Identifies relevant provider methods 2. Extracts required parameters from context 3. Suggests appropriate actions based on the alert type 4. Chains multiple methods for comprehensive investigation ### Via API ```python # Example using a Datadog provider method to mute a monitor response = await api.post( f"/providers/{provider_id}/invoke/mute_monitor", {"monitor_id": "abc123", "duration": 3600} ) ``` ## Adding new provider methods To add a new method to your provider: 1. Define the method in your provider class (must be an instance method): ```python def get_traces(self, trace_id: str) -> dict: """Get trace details from the provider. Args: trace_id (str): The ID of the trace to retrieve Returns: dict: The trace details """ # Implementation pass ``` 2. Add method metadata to `PROVIDER_METHODS`: ```python from keep.providers.models.provider_method import ProviderMethod PROVIDER_METHODS = [ ProviderMethod( name="Get Traces", description="Retrieve trace details", func_name="get_traces", type="view", # 'view' or 'action' scopes=["traces:read"], # Required provider scopes category="Observability", # Optional category for grouping methods ) ] ``` Note: The `func_params` field is automatically populated by Keep through reflection of the method signature, so you don't need to define it manually. Provider methods must be instance methods (not static or class methods) of the provider class. The method signature is automatically inspected to generate UI forms and parameter validation. 
### Complete example Here's a complete example of a provider with custom methods: ```python class MonitoringProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Monitoring Service" PROVIDER_METHODS = [ ProviderMethod( name="Mute Alert", description="Mute an alert for a specified duration", func_name="mute_alert", type="action", scopes=["alerts:write"], category="Alert Management", ), ProviderMethod( name="Get Metrics", description="Retrieve metrics for a service", func_name="get_metrics", type="view", scopes=["metrics:read"], category="Observability", ), ] def mute_alert(self, alert_id: str, duration_minutes: int = 60) -> dict: """ Mute an alert for the specified duration. Args: alert_id: The ID of the alert to mute duration_minutes: Duration to mute in minutes (default: 60) Returns: dict: Confirmation of the mute action """ # Implementation here response = self._api_call(f"/alerts/{alert_id}/mute", {"duration": duration_minutes}) return {"success": True, "muted_until": response["muted_until"]} def get_metrics(self, service_name: str, metric_type: str, time_range: str = "1h") -> list: """ Get metrics for a specific service. Args: service_name: Name of the service metric_type: Type of metric (cpu, memory, latency, etc.) 
time_range: Time range for metrics (default: "1h") Returns: list: List of metric data points """ # Implementation here return self._query(f"metrics.{metric_type}", service=service_name, range=time_range) ``` ### Method types - **view**: Returns data for display (for example, getting traces, metrics) - **action**: Performs an action (for example, muting an alert, creating a ticket) ### Parameter types Supported parameter types for provider methods: - `str`: String input field - `int`: Numeric input field - `float`: Decimal number input field - `bool`: Boolean checkbox - `datetime`: Date/time picker - `dict`: JSON object input - `list`: Array/list input - `Literal`: Dropdown with predefined values - `Optional[type]`: Optional parameter of the specified type Example with different parameter types: ```python from typing import Optional, Literal from datetime import datetime def advanced_query( self, metric_name: str, # Required string time_range: Literal["1h", "6h", "24h", "7d"] = "1h", # Dropdown with options include_metadata: bool = False, # Boolean checkbox limit: Optional[int] = None, # Optional integer start_time: Optional[datetime] = None, # Optional datetime picker ) -> dict: """Query metrics with advanced filtering options.""" # Implementation pass ``` ### Auto-discovery Keep automatically inspects provider classes to: 1. Discover available methods 2. Extract parameter information 3. Generate UI components 4. Enable AI understanding of method capabilities ## Best practices 1. **Clear Documentation**: Provide detailed docstrings for methods 2. **Type Hints**: Use Python type hints for parameters 3. **Error Handling**: Return clear error messages 4. **Scopes**: Define minimum required scopes 5. 
**Validation**: Validate parameters before execution ## Limitations - Currently supports only synchronous methods - The supported parameter types are limited to basic types - Methods must be instance methods of the provider class - Methods are automatically discovered through reflection - Keep validates parameter types based on type hints ================================================ FILE: docs/snippets/providers/airflow-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/aks-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **subscription_id**: The azure subscription id (required: True, sensitive: True) - **client_id**: The azure client id (required: True, sensitive: True) - **client_secret**: The azure client secret (required: True, sensitive: True) - **tenant_id**: The azure tenant id (required: True, sensitive: True) - **resource_group_name**: The azure aks resource group name (required: True, sensitive: True) - **resource_name**: The azure aks cluster name (required: True, sensitive: True) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query aks provider: aks config: "{{ provider.my_provider_name }}" with: command_type: {value} # The command type to operate on the k8s cluster (`get_pods`, `get_pvc`, `get_node_pressure`). ``` Check the following workflow example: - [aks_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/aks_basic.yml) ================================================ FILE: docs/snippets/providers/amazonsqs-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **region_name**: Region name (required: True, sensitive: False) - **sqs_queue_url**: SQS Queue URL (required: True, sensitive: False) - **access_key_id**: Access Key Id (Leave empty if using IAM role at EC2) (required: False, sensitive: False) - **secret_access_key**: Secret access key (Leave empty if using IAM role at EC2) (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: Key-Id pair is valid and working (mandatory) - **sqs::read**: Required privileges to receive alerts from SQS. If you only want to give read scope to your key-secret pair, use the permission policy AmazonSQSReadOnlyAccess. (mandatory) - **sqs::write**: Required privileges to push messages to SQS. If you only want to give read & write scope to your key-secret pair, use the permission policy AmazonSQSFullAccess. ## In workflows This provider can be used in workflows.
As "action" to make changes or update data, example: ```yaml actions: - name: Query amazonsqs provider: amazonsqs config: "{{ provider.my_provider_name }}" with: message: {value} group_id: {value} dedup_id: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/anthropic-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Anthropic API Key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query anthropic provider: anthropic config: "{{ provider.my_provider_name }}" with: prompt: {value} # The prompt to query the model with. model: {value} # The model to query. max_tokens: {value} # The maximum number of tokens to generate. structured_output_format: {value} # The structured output format to use. ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/appdynamics-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **appDynamicsAccountName**: AppDynamics Account Name (required: True, sensitive: False) - **appId**: AppDynamics appId (required: True, sensitive: False) - **host**: AppDynamics host (required: True, sensitive: False) - **appDynamicsAccessToken**: AppDynamics Access Token (required: False, sensitive: False) - **appDynamicsUsername**: Username (required: False, sensitive: False) - **appDynamicsPassword**: Password (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authorized (mandatory) - **administrator**: Administrator privileges (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/argocd-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **argocd_access_token**: Argocd Access Token (required: True, sensitive: True) - **deployment_url**: Deployment Url (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authorized (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. 
It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ================================================ FILE: docs/snippets/providers/asana-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **pat_token**: Personal Access Token for Asana. (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is authenticated to Asana. (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query asana provider: asana config: "{{ provider.my_provider_name }}" with: task_id: {value} # Task ID. # Apart from the above parameters, you can also provide a few other parameters. Refer to the [Asana API documentation](https://developers.asana.com/docs/update-a-task) for more details. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query asana provider: asana config: "{{ provider.my_provider_name }}" with: name: {value} # Task Name. projects: {value} # List of Project IDs. # Apart from the above parameters, you can also provide a few other parameters. Refer to the [Asana API documentation](https://developers.asana.com/docs/update-a-task) for more details.
``` Check the following workflow examples: - [create-task-in-asana.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/create-task-in-asana.yaml) - [update-task-in-asana.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/update-task-in-asana.yaml) ================================================ FILE: docs/snippets/providers/auth0-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **domain**: Auth0 Domain (required: True, sensitive: False) - **token**: Auth0 API Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query auth0 provider: auth0 config: "{{ provider.my_provider_name }}" with: log_type: {value} previous_users: {value} ``` Check the following workflow example: - [new-auth0-users-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new-auth0-users-monitor.yml) ================================================ FILE: docs/snippets/providers/axiom-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_token**: Axiom API Token (required: True, sensitive: True) - **organization_id**: Axiom Organization ID (required: False, sensitive: False) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query axiom provider: axiom config: "{{ provider.my_provider_name }}" with: dataset: {value} datasets_api_url: {value} organization_id: {value} startTime: {value} endTime: {value} query: {value} # command to execute ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/azuremonitoring-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Connecting via Webhook (omnidirectional) This provider supports webhooks. To send alerts from Azure Monitor to Keep, use the following webhook URL to configure Azure Monitor to send alerts to Keep: 1. In Azure Monitor, create a new Action Group. 2. In the Action Group, add a new action of type "Webhook". 3. In the Webhook action, configure the webhook with the following settings. - **Name**: keep-azuremonitoring-webhook-integration - **URL**: Your Keep Backend URL 4. Save the Action Group. 5. In the Alert Rule, set the Action Group to the one created in step 1. 6. Save the Alert Rule. 7. Test the Alert Rule to ensure that the alerts are being sent to Keep. ================================================ FILE: docs/snippets/providers/base-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows.
As "step" to query data, example: ```yaml steps: - name: Query base provider: base config: "{{ provider.my_provider_name }}" with: kwargs: {value} # The provider context (with statement) ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query base provider: base config: "{{ provider.my_provider_name }}" with: # The provider context (with statement) ``` Check the following workflow examples: - [change.yml](https://github.com/keephq/keep/blob/main/examples/workflows/change.yml) - [conditionally_run_if_ai_says_so.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/conditionally_run_if_ai_says_so.yaml) - [consts_and_vars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/consts_and_vars.yml) - [create_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alert_from_vm_metric.yml) - [create_alerts_from_mysql.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alerts_from_mysql.yml) - [create_multi_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_multi_alert_from_vm_metric.yml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [disk_grown_defects_rule.yml](https://github.com/keephq/keep/blob/main/examples/workflows/disk_grown_defects_rule.yml) - [elastic_enrich_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/elastic_enrich_example.yml) - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [incident-tier-escalation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-tier-escalation.yml) - [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) - [query_victoriametrics.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victoriametrics.yml) - 
[raw_sql_query_datetime.yml](https://github.com/keephq/keep/blob/main/examples/workflows/raw_sql_query_datetime.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) - [workflow_start_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/workflow_start_example.yml) ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ================================================ FILE: docs/snippets/providers/bash-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query bash provider: bash config: "{{ provider.my_provider_name }}" with: timeout: {value} command: {value} shell: {value} ``` Check the following workflow example: - [bash_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/bash_example.yml) ================================================ FILE: docs/snippets/providers/bigquery-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **service_account_json**: The service account JSON with container.viewer role (required: True, sensitive: True) - **project_id**: Google Cloud project ID. 
If not provided, it will try to fetch it from the environment variable 'GOOGLE_CLOUD_PROJECT' (required: False, sensitive: False) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query bigquery provider: bigquery config: "{{ provider.my_provider_name }}" with: query: {value} ``` Check the following workflow examples: - [bigquery.yml](https://github.com/keephq/keep/blob/main/examples/workflows/bigquery.yml) - [failed-to-login-workflow.yml](https://github.com/keephq/keep/blob/main/examples/workflows/failed-to-login-workflow.yml) ================================================ FILE: docs/snippets/providers/centreon-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: Centreon Host URL (required: True, sensitive: False) - **api_token**: Centreon API Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is authenticated ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/checkly-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **checklyApiKey**: Checkly API Key (required: True, sensitive: True) - **accountId**: Checkly Account ID (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **read_alerts**: Read alerts from Checkly ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/checkmk-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/cilium-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **cilium_base_endpoint**: The base endpoint of the cilium hubble relay (required: True, sensitive: False) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. 
It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ================================================ FILE: docs/snippets/providers/clickhouse-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **username**: Clickhouse username (required: True, sensitive: False) - **password**: Clickhouse password (required: True, sensitive: True) - **host**: Clickhouse hostname (required: True, sensitive: False) - **port**: Clickhouse port (required: True, sensitive: False) - **database**: Clickhouse database name (required: False, sensitive: False) - **protocol**: Protocol ('clickhouses' for SSL, 'clickhouse' for no SSL, 'http' or 'https') (required: True, sensitive: False) - **verify**: Enable SSL verification (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query clickhouse provider: clickhouse config: "{{ provider.my_provider_name }}" with: query: {value} single_row: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query clickhouse provider: clickhouse config: "{{ provider.my_provider_name }}" with: query: {value} single_row: {value} ``` Check the following workflow examples: - [clickhouse_multiquery.yml](https://github.com/keephq/keep/blob/main/examples/workflows/clickhouse_multiquery.yml) - [query_clickhouse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_clickhouse.yml) ================================================ FILE: docs/snippets/providers/cloudwatch-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **region**: AWS region (required: True, sensitive: False) - **access_key**: AWS access key (Leave empty if using IAM role at EC2) (required: False, sensitive: True) - **access_key_secret**: AWS access key secret (Leave empty if using IAM role at EC2) (required: False, sensitive: True) - **session_token**: AWS Session Token (required: False, sensitive: True) - **cloudwatch_sns_topic**: AWS Cloudwatch SNS Topic [ARN or name] (required: False, sensitive: False) - **protocol**: Protocol to use for the webhook (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **cloudwatch:DescribeAlarms**: Required to retrieve information about alarms. (mandatory) ([Documentation](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_DescribeAlarms.html)) - **cloudwatch:PutMetricAlarm**: Required to update information about alarms. 
This is mainly used to add Keep as an SNS action to the alarm. ([Documentation](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricAlarm.html)) - **sns:ListSubscriptionsByTopic**: Required to list all subscriptions of a topic, so Keep will be able to add itself as a subscription. ([Documentation](https://docs.aws.amazon.com/sns/latest/dg/sns-access-policy-language-api-permissions-reference.html)) - **logs:GetQueryResults**: Part of CloudWatchLogsReadOnlyAccess role. Required to retrieve the results of CloudWatch Logs Insights queries. ([Documentation](https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_GetQueryResults.html)) - **logs:DescribeQueries**: Part of CloudWatchLogsReadOnlyAccess role. Required to describe the results of CloudWatch Logs Insights queries. ([Documentation](https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_DescribeQueries.html)) - **logs:StartQuery**: Part of CloudWatchLogsReadOnlyAccess role. Required to start CloudWatch Logs Insights queries. ([Documentation](https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_StartQuery.html)) - **iam:SimulatePrincipalPolicy**: Allows Keep to test the scopes of the current user/role without modifying any resource. ([Documentation](https://docs.aws.amazon.com/IAM/latest/APIReference/API_SimulatePrincipalPolicy.html)) ## In workflows This provider can be used in workflows.
As "step" to query data, example: ```yaml steps: - name: Query cloudwatch provider: cloudwatch config: "{{ provider.my_provider_name }}" with: log_group: {value} log_groups: {value} remove_ptr_from_results: {value} query: {value} hours: {value} ``` Check the following workflow examples: - [retrieve_cloudwatch_logs.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/retrieve_cloudwatch_logs.yaml) - [slack_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic.yml) - [slack_basic_cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic_cel.yml) ================================================ FILE: docs/snippets/providers/console-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query console provider: console config: "{{ provider.my_provider_name }}" with: message: {value} logger: {value} severity: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query console provider: console config: "{{ provider.my_provider_name }}" with: message: {value} # The message to be printed in to the console logger: {value} # Whether to use the logger or not severity: {value} # The severity of the message if logger is True ``` Check the following workflow examples: - [aks_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/aks_basic.yml) - [change.yml](https://github.com/keephq/keep/blob/main/examples/workflows/change.yml) - [complex-conditions-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/complex-conditions-cel.yml) - [console_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/console_example.yml) - 
[consts_and_dict.yml](https://github.com/keephq/keep/blob/main/examples/workflows/consts_and_dict.yml) - [eks_advanced.yml](https://github.com/keephq/keep/blob/main/examples/workflows/eks_advanced.yml) - [eks_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/eks_basic.yml) - [fluxcd_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/fluxcd_example.yml) - [gke.yml](https://github.com/keephq/keep/blob/main/examples/workflows/gke.yml) - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [incident-enrich.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-enrich.yaml) - [incident_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/incident_example.yml) - [inputs_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/inputs_example.yml) - [multi-condition-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/multi-condition-cel.yml) - [mustache-paths-example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/mustache-paths-example.yml) - [openshift_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_basic.yml) - [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) - [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) - [pattern-matching-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/pattern-matching-cel.yml) - [severity_changed.yml](https://github.com/keephq/keep/blob/main/examples/workflows/severity_changed.yml) - [webhook_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) ================================================ FILE: 
docs/snippets/providers/coralogix-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/dash0-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/databend-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: Databend host_url (required: True, sensitive: False) - **username**: Databend username (required: True, sensitive: False) - **password**: Databend password (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query databend provider: databend config: "{{ provider.my_provider_name }}" with: query: {value} ``` Check the following workflow example: - [query-databend.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query-databend.yml) ================================================ FILE: docs/snippets/providers/datadog-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Datadog Api Key (required: True, sensitive: True) - **app_key**: Datadog App Key (required: True, sensitive: True) - **domain**: Datadog API domain (required: False, sensitive: False) - **environment**: Topology environment name (required: False, sensitive: False) - **oauth_token**: For OAuth flow (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **events_read**: Read events data. (mandatory) - **monitors_read**: Read monitors (mandatory) ([Documentation](https://docs.datadoghq.com/account_management/rbac/permissions/#monitors)) - **monitors_write**: Write monitors ([Documentation](https://docs.datadoghq.com/account_management/rbac/permissions/#monitors)) - **create_webhooks**: Create webhooks integrations - **metrics_read**: View custom metrics. - **logs_read**: Read log data. - **apm_read**: Read APM data for Topology creation. - **apm_service_catalog_read**: Read APM service catalog for Topology creation. ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query datadog provider: datadog config: "{{ provider.my_provider_name }}" with: query: {value} timeframe: {value} query_type: {value} ``` Check the following workflow examples: - [complex-conditions-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/complex-conditions-cel.yml) - [datadog-log-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/datadog-log-monitor.yml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [service-error-rate-monitor-datadog.yml](https://github.com/keephq/keep/blob/main/examples/workflows/service-error-rate-monitor-datadog.yml) ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). 
- **mute_monitor** Mute a monitor (action, scopes: monitors_write) - **unmute_monitor** Unmute a monitor (action, scopes: monitors_write) - **get_monitor_events** Get all events related to this monitor (view, scopes: events_read) - **get_trace** Get trace by ID (view, scopes: apm_read) - **create_incident** Create an incident (action, scopes: incidents_write) - **resolve_incident** Resolve an active incident (action, scopes: incidents_write) - **add_incident_timeline_note** Add a note to an incident timeline (action, scopes: incidents_write) ================================================ FILE: docs/snippets/providers/deepseek-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: DeepSeek API Key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query deepseek provider: deepseek config: "{{ provider.my_provider_name }}" with: prompt: {value} # The user query. model: {value} # The model to use for the query. max_tokens: {value} # The maximum number of tokens to generate. system_prompt: {value} # The system prompt to use. structured_output_format: {value} # The structured output format. 
``` Check the following workflow example: - [enrich_using_structured_output_from_deepseek.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_deepseek.yaml) ================================================ FILE: docs/snippets/providers/discord-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **webhook_url**: Discord Webhook Url (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query discord provider: discord config: "{{ provider.my_provider_name }}" with: content: {value} # The content of the message. components: {value} # The components of the message. ``` Check the following workflow example: - [discord_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/discord_basic.yml) ================================================ FILE: docs/snippets/providers/dynatrace-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **environment_id**: Dynatrace's environment ID (required: True, sensitive: False) - **api_token**: Dynatrace's API token (required: True, sensitive: True) - **alerting_profile**: Dynatrace's alerting profile for the webhook integration. Defaults to 'Default' (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **problems.read**: Read access to Dynatrace problems (mandatory) - **settings.read**: Read access to Dynatrace settings [for webhook installation] - **settings.write**: Write access to Dynatrace settings [for webhook installation] ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/eks-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **region**: AWS region where the EKS cluster is located (required: True, sensitive: False) - **cluster_name**: Name of the EKS cluster (required: True, sensitive: False) - **access_key**: AWS access key (Leave empty if using IAM role at EC2) (required: False, sensitive: True) - **secret_access_key**: AWS secret access key (Leave empty if using IAM role at EC2) (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **eks:DescribeCluster**: Required to get cluster information (mandatory) ([Documentation](https://docs.aws.amazon.com/eks/latest/APIReference/API_DescribeCluster.html)) - **eks:ListClusters**: Required to list available clusters (mandatory) ([Documentation](https://docs.aws.amazon.com/eks/latest/APIReference/API_ListClusters.html)) - **pods:delete**: Required to delete/restart pods ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **deployments:scale**: Required to scale deployments ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:list**: Required to list pods ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:get**: Required to get pod details ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:logs**: Required to get pod logs ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query eks provider: eks config: "{{ provider.my_provider_name }}" with: command_type: {value} # Type of query to execute # Additional arguments for the query ``` Check the following workflow examples: - [eks_advanced.yml](https://github.com/keephq/keep/blob/main/examples/workflows/eks_advanced.yml) - [eks_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/eks_basic.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **get_pods** List all pods in a namespace or across all namespaces (view, scopes: pods:list, pods:get) - `namespace`: The namespace to list pods from. If None, lists pods from all namespaces. 
- **get_pvc** List all PVCs in a namespace or across all namespaces (view, scopes: pods:list) - `namespace`: The namespace to list PVCs from. If None, lists PVCs from all namespaces. - **get_node_pressure** Get pressure metrics for all nodes (view, scopes: pods:list) - **exec_command** Execute a command in a pod (action, scopes: pods:exec) - `namespace`: Namespace of the pod - `pod_name`: Name of the pod - `command`: Command to execute (string or array) - `container`: Name of the container (optional, defaults to first container) - **restart_pod** Restart a pod by deleting it (action, scopes: pods:delete) - `namespace`: Namespace of the pod - `pod_name`: Name of the pod - **get_deployment** Get deployment information (view, scopes: pods:list) - `deployment_name`: Name of the deployment to get - `namespace`: Target namespace (defaults to “default”) - **scale_deployment** Scale a deployment to specified replicas (action, scopes: deployments:scale) - `deployment_name`: Name of the deployment to scale - `namespace`: Target namespace (defaults to “default”) - `replicas`: Number of replicas to scale to - **get_pod_logs** Get logs from a pod (view, scopes: pods:logs) - `namespace`: Namespace of the pod - `pod_name`: Name of the pod - `container`: Name of the container (optional) - `tail_lines`: Number of lines to fetch from the end of logs (default: 100) ================================================ FILE: docs/snippets/providers/elastic-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **host**: Elasticsearch host (required: False, sensitive: False) - **cloud_id**: Elasticsearch cloud id (required: False, sensitive: False) - **verify**: Enable SSL verification (required: False, sensitive: False) - **api_key**: Elasticsearch API Key (required: False, sensitive: True) - **username**: Elasticsearch username (required: False, sensitive: False) - **password**: Elasticsearch password (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query elastic provider: elastic config: "{{ provider.my_provider_name }}" with: query: {value} # The body of the query index: {value} # The index to search in ``` Check the following workflow examples: - [create_alerts_from_elastic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alerts_from_elastic.yml) - [elastic_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/elastic_basic.yml) - [elastic_enrich_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/elastic_enrich_example.yml) ================================================ FILE: docs/snippets/providers/flashduty-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **integration_key**: Flashduty integration key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query flashduty provider: flashduty config: "{{ provider.my_provider_name }}" with: title: {value} # The title of the incident event_status: {value} # The status of the incident, one of: Info, Warning, Critical, Ok description: {value} # The description of the incident alert_key: {value} # Alert identifier, used to update or automatically recover existing alerts. If you're reporting a recovery event, this value must exist. labels: {value} # The labels of the incident ``` Check the following workflow example: - [flashduty_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/flashduty_example.yml) ================================================ FILE: docs/snippets/providers/fluxcd-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **kubeconfig**: Kubeconfig file content (required: False, sensitive: True) - **context**: Kubernetes context to use (required: False, sensitive: False) - **namespace**: Namespace where Flux CD is installed (required: False, sensitive: False) - **api_server**: Kubernetes API server URL (required: False, sensitive: False) - **token**: Kubernetes API token (required: False, sensitive: True) - **insecure**: Skip TLS verification (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authorized (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query fluxcd provider: fluxcd config: "{{ provider.my_provider_name }}" with: **_: {value} # Additional arguments (ignored) ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query fluxcd provider: fluxcd config: "{{ provider.my_provider_name }}" with: action: {value} # The action to perform. Supported actions are: - reconcile: Trigger a reconciliation for a FluxCD resource. # Additional arguments for the action. ``` Check the following workflow example: - [fluxcd_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/fluxcd_example.yml) ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **get_fluxcd_resources** Get resources from Flux CD (, scopes: no additional scopes) ================================================ FILE: docs/snippets/providers/gcpmonitoring-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **service_account_json**: A service account JSON with logging viewer role (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **roles/logs.viewer**: Read access to GCP logging (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query gcpmonitoring provider: gcpmonitoring config: "{{ provider.my_provider_name }}" with: filter: {value} timedelta_in_days: {value} page_size: {value} raw: {value} project: {value} ``` Check the following workflow examples: - [gcp_logging_open_ai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/gcp_logging_open_ai.yaml) - [slack-message-reaction.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack-message-reaction.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **execute_query** Query the GCP logs (view, scopes: no additional scopes) ================================================ FILE: docs/snippets/providers/gemini-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Google AI API Key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query gemini provider: gemini config: "{{ provider.my_provider_name }}" with: prompt: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/github-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **access_token**: GitHub Access Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query github provider: github config: "{{ provider.my_provider_name }}" with: repository: {value} previous_stars_count: {value} last_stargazer: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query github provider: github config: "{{ provider.my_provider_name }}" with: run_action: {value} # The action to run. workflow: {value} # The workflow to run. repo_name: {value} # The repository name. repo_owner: {value} # The repository owner. ref: {value} # The ref to use. inputs: {value} # The inputs to use. 
``` Check the following workflow examples: - [datadog-log-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/datadog-log-monitor.yml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [new_github_stars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new_github_stars.yml) - [run-github-workflow.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/run-github-workflow.yaml) - [service-error-rate-monitor-datadog.yml](https://github.com/keephq/keep/blob/main/examples/workflows/service-error-rate-monitor-datadog.yml) - [update_workflows_from_http.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_workflows_from_http.yml) - [zoom_chat_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_chat_example.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **get_last_commits** Get the N last commits from a GitHub repository (view, scopes: no additional scopes) - `repository`: The GitHub repository to get the commits from. - `n`: The number of commits to get. - **get_last_releases** Get the N last releases and their changelog from a GitHub repository (view, scopes: no additional scopes) - `repository`: The GitHub repository to get the releases from. - `n`: The number of releases to get. ================================================ FILE: docs/snippets/providers/github_workflows-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **personal_access_token**: Github Personal Access Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query github_workflows provider: github_workflows config: "{{ provider.my_provider_name }}" with: url: {value} method: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query github_workflows provider: github_workflows config: "{{ provider.my_provider_name }}" with: github_url: {value} github_method: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/gitlab-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host**: GitLab Host (required: True, sensitive: False) - **personal_access_token**: GitLab Personal Access Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **api**: Authenticated with api scope (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query gitlab provider: gitlab config: "{{ provider.my_provider_name }}" with: id: {value} title: {value} description: {value} labels: {value} issue_type: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/gitlabpipelines-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **access_token**: GitLab Access Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query gitlabpipelines provider: gitlabpipelines config: "{{ provider.my_provider_name }}" with: url: {value} method: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query gitlabpipelines provider: gitlabpipelines config: "{{ provider.my_provider_name }}" with: gitlab_url: {value} gitlab_method: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/gke-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **service_account_json**: The service account JSON with container.viewer role (required: True, sensitive: True) - **cluster_name**: The name of the cluster (required: True, sensitive: False) - **region**: The GKE cluster region (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **roles/container.viewer**: Read access to GKE resources (mandatory) - **pods:delete**: Required to delete/restart pods ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **deployments:scale**: Required to scale deployments ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:list**: Required to list pods ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:get**: Required to get pod details ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) - **pods:logs**: Required to get pod logs ([Documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query gke provider: gke config: "{{ provider.my_provider_name }}" with: command_type: {value} # Type of query to execute # Additional arguments will be passed to the query method ``` Check the following workflow example: - [gke.yml](https://github.com/keephq/keep/blob/main/examples/workflows/gke.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). 
- **get_pods** List all pods in a namespace or across all namespaces (view, scopes: pods:list, pods:get) - **get_pvc** List all PVCs in a namespace or across all namespaces (view, scopes: pods:list) - **get_node_pressure** Get pressure metrics for all nodes (view, scopes: pods:list) - **exec_command** Execute a command in a pod (action, scopes: pods:exec) - **restart_pod** Restart a pod by deleting it (action, scopes: pods:delete) - **get_deployment** Get deployment information (view, scopes: pods:list) - **scale_deployment** Scale a deployment to specified replicas (action, scopes: deployments:scale) - **get_pod_logs** Get logs from a pod (view, scopes: pods:logs) ================================================ FILE: docs/snippets/providers/google_chat-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **webhook_url**: Google Chat Webhook Url (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query google_chat provider: google_chat config: "{{ provider.my_provider_name }}" with: message: {value} # The text message to send. ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/grafana-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **token**: Token (required: True, sensitive: True) - **host**: Grafana host (required: True, sensitive: False) - **datasource_uid**: Datasource UID (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **alert.rules:read**: Read Grafana alert rules in a folder and its subfolders. (mandatory) ([Documentation](https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/)) - **alert.provisioning:read**: Read all Grafana alert rules, notification policies, etc via provisioning API. ([Documentation](https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/)) - **alert.provisioning:write**: Update all Grafana alert rules, notification policies, etc via provisioning API. ([Documentation](https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/)) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ## Connecting via Webhook (omnidirectional) This provider supports webhooks. If your Grafana is unreachable from Keep, you can use the following webhook url to configure Grafana to send alerts to Keep: 1. In Grafana, go to the Alerting tab in the Grafana dashboard. 2. Click on Contact points in the left sidebar and create a new one. 3. 
Give it a name and select Webhook as kind of contact point with webhook url as KEEP_BACKEND_URL/alerts/event/grafana. 4. Add 'X-API-KEY' as a request header with the value {api_key}. 5. Save the webhook. 6. Click on Notification policies in the left sidebar 7. Click on "New child policy" under the "Default policy" 8. Remove all matchers until you see the following: "If no matchers are specified, this notification policy will handle all alert instances." 9. Choose the webhook contact point you have just created under Contact point and click "Save Policy" ================================================ FILE: docs/snippets/providers/grafana_incident-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: Grafana Host URL (required: True, sensitive: False) - **service_account_token**: Service Account Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authenticated ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query grafana_incident provider: grafana_incident config: "{{ provider.my_provider_name }}" with: operationType: {value} updateType: {value} ``` Check the following workflow examples: - [create-new-incident-grafana-incident.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/create-new-incident-grafana-incident.yaml) - [update-incident-grafana-incident.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/update-incident-grafana-incident.yaml) ================================================ FILE: docs/snippets/providers/grafana_loki-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: Grafana Loki Host URL (required: True, sensitive: False) - **verify**: Enable SSL verification (required: False, sensitive: False) - **authentication_type**: Authentication Type (required: True, sensitive: False) - **username**: HTTP basic authentication - Username (required: False, sensitive: False) - **password**: HTTP basic authentication - Password (required: False, sensitive: True) - **x_scope_orgid**: X-Scope-OrgID Header Authentication (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: Instance is valid and user is authenticated ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query grafana_loki provider: grafana_loki config: "{{ provider.my_provider_name }}" with: query: {value} limit: {value} time: {value} direction: {value} start: {value} end: {value} since: {value} step: {value} interval: {value} queryType: {value} ``` Check the following workflow example: - [query_grafana_loki.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/query_grafana_loki.yaml) ================================================ FILE: docs/snippets/providers/grafana_oncall-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **token**: Token (required: True, sensitive: False) - **host**: Grafana OnCall Host (required: True, sensitive: False) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query grafana_oncall provider: grafana_oncall config: "{{ provider.my_provider_name }}" with: title: {value} alert_uid: {value} message: {value} image_url: {value} state: {value} link_to_upstream_details: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/graylog-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **graylog_user_name**: Username (required: True, sensitive: False) - **graylog_access_token**: Graylog Access Token (required: True, sensitive: True) - **deployment_url**: Deployment Url (required: True, sensitive: False) - **verify**: Verify SSL certificates (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: Mandatory for all operations, ensures the user is authenticated. (mandatory) - **authorized**: Mandatory for querying incidents and managing resources, ensures the user has `Admin` privileges. (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query graylog provider: graylog config: "{{ provider.my_provider_name }}" with: events_search_parameters: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **search** Search using elastic query language in Graylog (action, scopes: authorized) - `query`: The query string to search for. - `query_type`: The type of query to use. Default is "elastic". - `timerange_seconds`: The time range in seconds. Default is 300 seconds. - `timerange_type`: The type of time range. Default is "relative". - `page`: Page number, starting from 0. - `per_page`: Number of results per page. ## Connecting via Webhook (omnidirectional) This provider supports webhooks. To send alerts from Graylog to Keep, use the following webhook URL to configure Graylog to send alerts to Keep: 1. In Graylog, from the Topbar, go to `Alerts` > `Notifications`. 2. Click "Create Notification". 3.
In the New Notification form, configure: **Note**: For Graylog v4.x please set the **URL** to `KEEP_BACKEND_URL/alerts/event/graylog?api_key={api_key}`. - **Display Name**: keep-graylog-webhook-integration - **Title**: keep-graylog-webhook-integration - **Notification Type**: Custom HTTP Notification - **URL**: KEEP_BACKEND_URL/alerts/event/graylog # Whitelist this URL - **Headers**: X-API-KEY:{api_key} 4. Erase the Body Template. 5. Click on "Create Notification". 6. Go to the `Event Definitions` tab, and select the Event Definition that will trigger the alert you want to send to Keep and click on More > Edit. 7. Go to the "Notifications" tab. 8. Click on "Add Notification" and select the "keep-graylog-webhook-integration" that you created in step 3. 9. Click on "Add Notification". 10. Click `Next` > `Update` event definition ================================================ FILE: docs/snippets/providers/grok-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: X.AI Grok API Key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query grok provider: grok config: "{{ provider.my_provider_name }}" with: prompt: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues).
================================================ FILE: docs/snippets/providers/http-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query http provider: http config: "{{ provider.my_provider_name }}" with: url: {value} method: {value} headers: {value} body: {value} params: {value} proxies: {value} fail_on_error: {value} verify: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query http provider: http config: "{{ provider.my_provider_name }}" with: url: {value} method: {value} headers: {value} body: {value} params: {value} proxies: {value} verify: {value} ``` Check the following workflow examples: - [create-new-incident-grafana-incident.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/create-new-incident-grafana-incident.yaml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [http_enrich.yml](https://github.com/keephq/keep/blob/main/examples/workflows/http_enrich.yml) - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [incident-enrich.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-enrich.yaml) - [pagerduty.yml](https://github.com/keephq/keep/blob/main/examples/workflows/pagerduty.yml) - [permissions_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/permissions_example.yml) - [send-message-telegram-with-htmlmd.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/send-message-telegram-with-htmlmd.yaml) - [simple_http_request_ntfy.yml](https://github.com/keephq/keep/blob/main/examples/workflows/simple_http_request_ntfy.yml) - 
[slack-workflow-trigger.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack-workflow-trigger.yml) - [telegram_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/telegram_basic.yml) - [update-incident-grafana-incident.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/update-incident-grafana-incident.yaml) - [update_workflows_from_http.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_workflows_from_http.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) - [zoom_chat_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_chat_example.yml) ================================================ FILE: docs/snippets/providers/icinga2-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: Icinga2 Host URL (required: True, sensitive: False) - **api_user**: Icinga2 API User (required: True, sensitive: False) - **api_password**: Icinga2 API Password (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **read_alerts**: Read alerts from Icinga2 ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/ilert-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **ilert_token**: ILert API token (required: True, sensitive: True) - **ilert_host**: ILert API host (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **read_permission**: Read permission (mandatory) - **write_permission**: Write permission ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query ilert provider: ilert config: "{{ provider.my_provider_name }}" with: incident_id: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query ilert provider: ilert config: "{{ provider.my_provider_name }}" with: _type: {value} # Type of notification ('incident' or 'event') - determines which endpoint is used summary: {value} # A brief summary of the incident (required for new incidents) status: {value} # Current status of the incident (INVESTIGATING, RESOLVED, MONITORING, IDENTIFIED) message: {value} # Detailed message describing the incident (default: empty string) affectedServices: {value} # JSON string of affected services and their statuses (default: "[]") id: {value} # ID of incident to update (use "0" to create a new incident) event_type: {value} # Type of event to post (ALERT, ACCEPT, RESOLVE) details: {value} # Detailed information about the event alert_key: {value} # Unique key for event deduplication priority: {value} # Priority level of the event (HIGH, LOW) images: {value} # List of image URLs to include with the event links: {value} # List of related links to include with 
the event custom_details: {value} # Custom key-value pairs for additional context ``` Check the following workflow example: - [ilert-incident-upon-alert.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/ilert-incident-upon-alert.yaml) ================================================ FILE: docs/snippets/providers/incidentio-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **incidentIoApiKey**: IncidentIO's API_KEY (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authenticated (mandatory) - **read_access**: User has read access (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query incidentio provider: incidentio config: "{{ provider.my_provider_name }}" with: incident_id: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/incidentmanager-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **region**: AWS region (required: True, sensitive: False) - **response_plan_arn**: AWS Response Plan's arn (required: True, sensitive: False) - **sns_topic_arn**: AWS SNS Topic arn you want to be used/using in response plan (required: True, sensitive: False) - **access_key**: AWS access key (Leave empty if using IAM role at EC2) (required: False, sensitive: True) - **access_key_secret**: AWS access key secret (Leave empty if using IAM role at EC2) (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **ssm-incidents:ListIncidentRecords**: Required to retrieve incidents. (mandatory) ([Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html)) - **ssm-incidents:GetResponsePlan**: Required to get response plan and register keep as webhook ([Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html)) - **ssm-incidents:UpdateResponsePlan**: Required to update response plan and register keep as webhook ([Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html)) - **iam:SimulatePrincipalPolicy**: Allow Keep to test the scopes of the current user/role without modifying any resource. ([Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html)) - **sns:ListSubscriptionsByTopic**: Required to list all subscriptions of a topic, so Keep will be able to add itself as a subscription. ([Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html)) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query incidentmanager provider: incidentmanager config: "{{ provider.my_provider_name }}" ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/jira-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **email**: Atlassian Jira Email (required: True, sensitive: False) - **api_token**: Atlassian Jira API Token (required: True, sensitive: True) - **host**: Atlassian Jira Host (required: True, sensitive: False) - **ticket_creation_url**: URL for creating new tickets (optional, will use default if not provided) (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **BROWSE_PROJECTS**: Browse Jira Projects (mandatory) - **CREATE_ISSUES**: Create Jira Issues (mandatory) - **CLOSE_ISSUES**: Close Jira Issues - **EDIT_ISSUES**: Edit Jira Issues - **DELETE_ISSUES**: Delete Jira Issues - **MODIFY_REPORTER**: Modify Jira Issue Reporter - **TRANSITION_ISSUES**: Transition Jira Issues ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query jira provider: jira config: "{{ provider.my_provider_name }}" with: ticket_id: {value} # The ticket id of the issue, optional. board_id: {value} # The board id of the issue. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query jira provider: jira config: "{{ provider.my_provider_name }}" with: summary: {value} # The summary of the issue. description: {value} # The description of the issue. 
issue_type: {value} # The type of the issue. project_key: {value} # The project key of the issue. board_name: {value} # The board name of the issue. issue_id: {value} # The issue id of the issue. labels: {value} # The labels of the issue. components: {value} # The components of the issue. custom_fields: {value} # The custom fields of the issue. transition_to: {value} # Optional transition name (e.g., "Done", "Resolved") to apply after update/create. ``` Check the following workflow examples: - [create_jira_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_jira_ticket_upon_alerts.yml) - [incident-enrich.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-enrich.yaml) - [jira-create-ticket-on-alert.yml](https://github.com/keephq/keep/blob/main/examples/workflows/jira-create-ticket-on-alert.yml) - [jira-transition-on-resolved.yml](https://github.com/keephq/keep/blob/main/examples/workflows/jira-transition-on-resolved.yml) - [jira_on_prem.yml](https://github.com/keephq/keep/blob/main/examples/workflows/jira_on_prem.yml) - [test_jira_create_with_custom_fields.yml](https://github.com/keephq/keep/blob/main/examples/workflows/test_jira_create_with_custom_fields.yml) - [test_jira_custom_fields_fix.yml](https://github.com/keephq/keep/blob/main/examples/workflows/test_jira_custom_fields_fix.yml) - [update_jira_ticket.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_jira_ticket.yml) ================================================ FILE: docs/snippets/providers/jiraonprem-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **host**: Jira Host (required: True, sensitive: False) - **personal_access_token**: Jira PAT (required: True, sensitive: True) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **BROWSE_PROJECTS**: Browse Jira Projects (mandatory) - **CREATE_ISSUES**: Create Jira Issues (mandatory) - **CLOSE_ISSUES**: Close Jira Issues - **EDIT_ISSUES**: Edit Jira Issues - **DELETE_ISSUES**: Delete Jira Issues - **MODIFY_REPORTER**: Modify Jira Issue Reporter ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query jiraonprem provider: jiraonprem config: "{{ provider.my_provider_name }}" with: ticket_id: {value} # The ticket id. board_id: {value} # The board id. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query jiraonprem provider: jiraonprem config: "{{ provider.my_provider_name }}" with: summary: {value} description: {value} issue_type: {value} project_key: {value} board_name: {value} issue_id: {value} labels: {value} components: {value} custom_fields: {value} priority: {value} ``` Check the following workflow example: - [jira_on_prem.yml](https://github.com/keephq/keep/blob/main/examples/workflows/jira_on_prem.yml) ================================================ FILE: docs/snippets/providers/kafka-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **host**: Kafka host (required: True, sensitive: False) - **topic**: The topic to subscribe to (required: True, sensitive: False) - **username**: Username (required: False, sensitive: True) - **password**: Password (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **topic_read**: The Kafka user that has permission to read the topic. (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/keep-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows.
As "step" to query data, example: ```yaml steps: - name: Query keep provider: keep config: "{{ provider.my_provider_name }}" with: filters: {value} # filters to query Keep (only for version 1) version: {value} # version of Keep API distinct: {value} # if True, return only distinct alerts time_delta: {value} # time delta in days to query Keep timerange: {value} # timerange dict to calculate time delta filter: {value} # filter to query Keep (only for version 2) limit: {value} # limit number of results (only for version 2) ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query keep provider: keep config: "{{ provider.my_provider_name }}" with: delete_all_other_workflows: {value} # if True, delete all other workflows workflow_full_sync: {value} # if True, sync all workflows workflow_to_update_yaml: {value} # workflow yaml to update alert: {value} # alert data to create fingerprint_fields: {value} # fields to use for alert fingerprinting override_source_with: {value} # override alert source read_only: {value} # if True, don't modify existing alerts fingerprint: {value} # alert fingerprint if: {value} # condition to evaluate for alert creation for: {value} # duration for state alerts ``` Check the following workflow examples: - [create_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alert_from_vm_metric.yml) - [create_alert_in_keep.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alert_in_keep.yml) - [create_alerts_from_elastic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alerts_from_elastic.yml) - [create_alerts_from_mysql.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alerts_from_mysql.yml) - [create_multi_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_multi_alert_from_vm_metric.yml) - 
[fluxcd_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/fluxcd_example.yml) - [resolve_old_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/resolve_old_alerts.yml) - [retrieve_cloudwatch_logs.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/retrieve_cloudwatch_logs.yaml) - [update_service_now_tickets_status.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_service_now_tickets_status.yml) - [update_workflows_from_http.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_workflows_from_http.yml) - [update_workflows_from_s3.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_workflows_from_s3.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) ================================================ FILE: docs/snippets/providers/kibana-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Kibana API Key (required: True, sensitive: True) - **kibana_host**: Kibana Host (required: True, sensitive: False) - **kibana_port**: Kibana Port (defaults to 9243) (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **rulesSettings:read**: Read alerts (mandatory) - **rulesSettings:write**: Modify alerts (mandatory) - **actions:read**: Read connectors (mandatory) - **actions:write**: Write connectors (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/kubernetes-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_server**: The kubernetes api server url (required: False, sensitive: False) - **token**: Bearer token to access kubernetes (leave empty for in-cluster auth) (required: False, sensitive: True) - **insecure**: Skip TLS verification (required: False, sensitive: False) - **use_in_cluster_config**: Use in-cluster configuration (ServiceAccount) (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_kubernetes**: Check if the provided token can connect to the kubernetes server (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query kubernetes provider: kubernetes config: "{{ provider.my_provider_name }}" with: command_type: {value} # The type of query to perform. Supported queries are: - get_logs: Get logs from a pod - get_deployment_logs: Get logs from all pods in a deployment - get_events: Get events for a namespace or pod - get_nodes: List nodes - get_pods: List pods - get_node_pressure: Get node pressure conditions - get_pvc: List persistent volume claims - get_deployments: List deployments - get_statefulsets: List statefulsets - get_daemonsets: List daemonsets - get_services: List services - get_namespaces: List namespaces - get_ingresses: List ingresses for a namespace or all namespaces - get_jobs: List jobs # Additional arguments for the query. 
``` As "action" to make changes or update data, example: ```yaml actions: - name: Query kubernetes provider: kubernetes config: "{{ provider.my_provider_name }}" with: action: {value} # The action to perform. Supported actions are: - rollout_restart: Restart a deployment/statefulset/daemonset - restart_pod: Restart a specific pod - cordon_node: Mark node as unschedulable - uncordon_node: Mark node as schedulable - drain_node: Safely evict pods from node - scale_deployment: Scale deployment up/down - scale_statefulset: Scale statefulset up/down - exec_pod_command: Execute command in pod # Additional arguments for the action. ``` Check the following workflow example: - [gke.yml](https://github.com/keephq/keep/blob/main/examples/workflows/gke.yml) ================================================ FILE: docs/snippets/providers/libre_nms-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: LibreNMS Host URL (required: True, sensitive: False) - **api_key**: LibreNMS API Key (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **read_alerts**: Read alerts from LibreNMS ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/linear-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_token**: Linear API Token (required: True, sensitive: True) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query linear provider: linear config: "{{ provider.my_provider_name }}" with: team_name: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query linear provider: linear config: "{{ provider.my_provider_name }}" with: team_name: {value} project_name: {value} title: {value} description: {value} priority: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/linearb-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_token**: LinearB API Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **any**: A way to validate the provider (mandatory) ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query linearb provider: linearb config: "{{ provider.my_provider_name }}" with: incident_id: {value} http_url: {value} title: {value} teams: {value} repository_urls: {value} services: {value} started_at: {value} ended_at: {value} git_ref: {value} should_delete: {value} issued_at: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/litellm-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_url**: LiteLLM API endpoint URL (required: True, sensitive: False) - **api_key**: Optional API key if your LiteLLM deployment requires authentication (required: False, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query litellm provider: litellm config: "{{ provider.my_provider_name }}" with: prompt: {value} temperature: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` Check the following workflow example: - [enrich_using_structured_output_from_openai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_openai.yaml) ================================================ FILE: docs/snippets/providers/llamacpp-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **host**: Llama.cpp Server Host URL (required: True, sensitive: False) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query llamacpp provider: llamacpp config: "{{ provider.my_provider_name }}" with: prompt: {value} max_tokens: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/mailgun-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **email**: Email address to send alerts to (required: False, sensitive: False) - **sender**: Sender email address to validate (required: False, sensitive: False) - **email_domain**: Custom email domain for receiving alerts (required: False, sensitive: False) - **extraction**: Extraction Rules (required: False, sensitive: False) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/mattermost-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **webhook_url**: Mattermost Webhook Url (required: True, sensitive: True) ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query mattermost provider: mattermost config: "{{ provider.my_provider_name }}" with: message: {value} # The content of the message. attachments: {value} # The attachments of the message. channel: {value} # The channel to send the message to. ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/mock-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query mock provider: mock config: "{{ provider.my_provider_name }}" with: # Simply returns all parameters passed to it. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query mock provider: mock config: "{{ provider.my_provider_name }}" with: # Simply returns all parameters passed to it.
``` Check the following workflow examples: - [autosupress.yml](https://github.com/keephq/keep/blob/main/examples/workflows/autosupress.yml) - [businesshours.yml](https://github.com/keephq/keep/blob/main/examples/workflows/businesshours.yml) - [datadog-log-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/datadog-log-monitor.yml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [enrich_using_structured_output_from_deepseek.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_deepseek.yaml) - [enrich_using_structured_output_from_openai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_openai.yaml) - [enrich_using_structured_output_from_vllm_qwen.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_vllm_qwen.yaml) - [ilert-incident-upon-alert.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/ilert-incident-upon-alert.yaml) - [resolve_old_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/resolve_old_alerts.yml) ================================================ FILE: docs/snippets/providers/monday-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_token**: Personal API Token (required: False, sensitive: True) - **access_token**: For access token installation flow, use Keep UI (required: False, sensitive: True) - **scopes**: Scopes from OAuth logic, comma separated (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **create_pulse**: Create a new pulse ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query monday provider: monday config: "{{ provider.my_provider_name }}" with: board_id: {value} group_id: {value} item_name: {value} column_values: {value} ``` Check the following workflow example: - [monday_create_pulse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/monday_create_pulse.yml) ================================================ FILE: docs/snippets/providers/mongodb-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host**: Mongo host_uri (required: True, sensitive: False) - **username**: MongoDB username (required: False, sensitive: False) - **password**: MongoDB password (required: False, sensitive: True) - **database**: MongoDB database name (required: False, sensitive: False) - **auth_source**: Mongo authSource database name (required: False, sensitive: False) - **additional_options**: Mongo kwargs, these will be passed to MongoClient (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query mongodb provider: mongodb config: "{{ provider.my_provider_name }}" with: query: {value} as_dict: {value} single_row: {value} ``` Check the following workflow example: - [query_mongodb.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/query_mongodb.yaml) ================================================ FILE: docs/snippets/providers/mysql-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **username**: MySQL username (required: True, sensitive: False) - **password**: MySQL password (required: True, sensitive: True) - **host**: MySQL hostname (required: True, sensitive: False) - **database**: MySQL database name (required: False, sensitive: False) - **port**: MySQL port (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query mysql provider: mysql config: "{{ provider.my_provider_name }}" with: query: {value} # Query to execute as_dict: {value} # If True, returns the results as a list of dictionaries single_row: {value} # If True, returns only the first row of the results # Arguments will be passed to the query.format(**kwargs) ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query mysql provider: mysql config: "{{ provider.my_provider_name }}" with: query: {value} # Query to execute as_dict: {value} # If True, returns the results as a list of dictionaries single_row: {value} # If True, returns only the first row of the results # Arguments will be passed to the query.format(**kwargs) ``` Check the following workflow examples: - [blogpost.yml](https://github.com/keephq/keep/blob/main/examples/workflows/blogpost.yml) - [conditionally_run_if_ai_says_so.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/conditionally_run_if_ai_says_so.yaml) - [create_alerts_from_mysql.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alerts_from_mysql.yml) - [raw_sql_query_datetime.yml](https://github.com/keephq/keep/blob/main/examples/workflows/raw_sql_query_datetime.yml) - [simple_http_request_ntfy.yml](https://github.com/keephq/keep/blob/main/examples/workflows/simple_http_request_ntfy.yml) - [slack-message-reaction.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack-message-reaction.yml) ================================================ FILE: docs/snippets/providers/netbox-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows.
If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/netdata-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Connecting via Webhook (omnidirectional) This provider supports webhooks. To send alerts from Netdata to Keep, use the following webhook URL to configure Netdata to send alerts to Keep: 1. In Netdata, go to Space settings. 2. Go to "Alerts & Notifications". 3. Click on "Add configuration". 4. Add "Webhook" as the notification method. 5. Add a name to the configuration. 6. Select Room(s) to apply the configuration. 7. Select Notification(s) to apply the configuration. 8. In the "Webhook URL" field, add KEEP_BACKEND_URL/alerts/event/netdata. 9. Add a request header with the key "x-api-key" and the value as {api_key}. 10. Leave the Authentication as "No Authentication". 11. Add the "Challenge secret" as "keep-netdata-webhook-integration". 12. Save the configuration. ================================================ FILE: docs/snippets/providers/netxms-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: NetXMS API key (required: True, sensitive: True) ## In workflows This provider can't be used as a "step" or "action" in workflows.
If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/newrelic-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: New Relic User key. To receive webhooks, use `User key` of an admin account (required: True, sensitive: True) - **account_id**: New Relic account ID (required: True, sensitive: False) - **new_relic_api_url**: New Relic API URL (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **ai.issues:read**: Required to read issues and related information (mandatory) ([Documentation](https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/)) - **ai.destinations:read**: Required to read whether keep webhooks are registered ([Documentation](https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/)) - **ai.destinations:write**: Required to register keep webhooks ([Documentation](https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/)) - **ai.channels:read**: Required to read information about notification channels.
([Documentation](https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/)) - **ai.channels:write**: Required to create notification channel ([Documentation](https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/)) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query newrelic provider: newrelic config: "{{ provider.my_provider_name }}" with: nrql: {value} query: {value} # query to execute ``` Check the following workflow example: - [complex-conditions-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/complex-conditions-cel.yml) ================================================ FILE: docs/snippets/providers/ntfy-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **access_token**: Ntfy Access Token (required: False, sensitive: True) - **host**: Ntfy Host URL (For self-hosted Ntfy only) (required: False, sensitive: False) - **username**: Ntfy Username (For self-hosted Ntfy only) (required: False, sensitive: False) - **password**: Ntfy Password (For self-hosted Ntfy only) (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **send_alert**: (mandatory) ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query ntfy provider: ntfy config: "{{ provider.my_provider_name }}" with: message: {value} topic: {value} ``` Check the following workflow examples: - [ntfy_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ntfy_basic.yml) - [query_clickhouse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_clickhouse.yml) - [query_victoriametrics.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victoriametrics.yml) - [simple_http_request_ntfy.yml](https://github.com/keephq/keep/blob/main/examples/workflows/simple_http_request_ntfy.yml) ================================================ FILE: docs/snippets/providers/ollama-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host**: Ollama API Host URL (required: True, sensitive: False) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query ollama provider: ollama config: "{{ provider.my_provider_name }}" with: prompt: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/openai-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **api_key**: OpenAI Platform API Key (required: True, sensitive: True) - **organization_id**: OpenAI Platform Organization ID (required: False, sensitive: False) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query openai provider: openai config: "{{ provider.my_provider_name }}" with: prompt: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` Check the following workflow examples: - [conditionally_run_if_ai_says_so.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/conditionally_run_if_ai_says_so.yaml) - [enrich_using_structured_output_from_openai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_openai.yaml) - [gcp_logging_open_ai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/gcp_logging_open_ai.yaml) - [send_slack_message_on_failure.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/send_slack_message_on_failure.yaml) - [update-incident-grafana-incident.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/update-incident-grafana-incident.yaml) ================================================ FILE: docs/snippets/providers/openobserve-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **openObserveUsername**: OpenObserve Username (required: True, sensitive: False) - **openObservePassword**: Password (required: True, sensitive: True) - **openObserveHost**: OpenObserve host url (required: True, sensitive: False) - **openObservePort**: OpenObserve Port (required: True, sensitive: False) - **organisationID**: OpenObserve organisationID (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authorized (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/opensearchserverless-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **domain_endpoint**: Domain endpoint (required: True, sensitive: False) - **region**: AWS region (required: True, sensitive: False) - **access_key**: AWS access key (required: False, sensitive: True) - **access_key_secret**: AWS access key secret (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **iam:SimulatePrincipalPolicy**: Required to check if we have access to AOSS API. (mandatory) - **aoss:APIAccessAll**: Required to make API calls to OpenSearch Serverless. (Add from IAM console) (mandatory) - **aoss:ListAccessPolicies**: Required to access all Data Access Policies. 
(Add from IAM console) (mandatory) - **aoss:GetAccessPolicy**: Required to check each policy for read and write scope. (Add from IAM console) (mandatory) - **aoss:CreateIndex**: Required to create indexes while saving a doc. (mandatory) ([Documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations)) - **aoss:ReadDocument**: Required to query. (mandatory) ([Documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations)) - **aoss:WriteDocument**: Required to save documents. (mandatory) ([Documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations)) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query opensearchserverless provider: opensearchserverless config: "{{ provider.my_provider_name }}" with: query: {value} index: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query opensearchserverless provider: opensearchserverless config: "{{ provider.my_provider_name }}" with: index: {value} document: {value} doc_id: {value} ``` Check the following workflow example: - [opensearchserverless_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opensearchserverless_basic.yml) ================================================ FILE: docs/snippets/providers/openshift-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **api_server**: The openshift api server url (required: True, sensitive: False) - **token**: The openshift token (required: True, sensitive: True) - **insecure**: Skip TLS verification (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_openshift**: Check if the provided token can connect to the openshift server (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query openshift provider: openshift config: "{{ provider.my_provider_name }}" with: command_type: {value} # The type of query to perform. Supported queries are: - get_logs: Get logs from a pod - get_events: Get events for a namespace or pod - get_pods: List pods in a namespace or across all namespaces - get_node_pressure: Get node pressure conditions - get_pvc: List persistent volume claims - get_routes: List OpenShift routes - get_deploymentconfigs: List OpenShift deployment configs - get_projects: List OpenShift projects # Additional arguments for the query. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query openshift provider: openshift config: "{{ provider.my_provider_name }}" with: action: {value} # The action to perform. Supported actions are: - rollout_restart: Restart a deployment, statefulset, or daemonset - restart_pod: Restart a pod by deleting it - scale_deployment: Scale a deployment to specified replicas - scale_deploymentconfig: Scale a deployment config to specified replicas # Additional arguments for the action. 
``` Check the following workflow examples: - [openshift_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_basic.yml) - [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) - [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) ================================================ FILE: docs/snippets/providers/opsgenie-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: OpsGenie api key (required: True, sensitive: True) - **integration_name**: OpsGenie integration name (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **opsgenie:create**: Create OpsGenie alerts (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query opsgenie provider: opsgenie config: "{{ provider.my_provider_name }}" with: query_type: {value} query: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query opsgenie provider: opsgenie config: "{{ provider.my_provider_name }}" with: user: {value} # Display name of the request owner note: {value} # Additional note that will be added while creating the alert source: {value} # Source field of the alert. 
Default value is the IP address of the incoming request message: {value} # Message of the alert alias: {value} # Client-defined identifier of the alert, that is also the key element of alert deduplication description: {value} # Description field of the alert that is generally used to provide detailed information responders: {value} # Responders that the alert will be routed to send notifications visible_to: {value} # Teams and users that the alert will become visible to without sending any notification actions: {value} # Custom actions that will be available for the alert tags: {value} # Tags of the alert details: {value} # Map of key-value pairs to use as custom properties of the alert entity: {value} # Entity field of the alert that is generally used to specify which domain alert is related to priority: {value} # Priority level of the alert type: {value} # Type of the request, e.g. create_alert, close_alert # Additional arguments ``` Check the following workflow examples: - [failed-to-login-workflow.yml](https://github.com/keephq/keep/blob/main/examples/workflows/failed-to-login-workflow.yml) - [opsgenie-close-alert.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie-close-alert.yml) - [opsgenie-create-alert-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie-create-alert-cel.yml) - [opsgenie-create-alert.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie-create-alert.yml) - [opsgenie_open_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie_open_alerts.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant).
- **close_alert** Close an alert (action, scopes: opsgenie:create) - **comment_alert** Comment an alert (action, scopes: opsgenie:create) ================================================ FILE: docs/snippets/providers/pagerduty-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **routing_key**: Routing Key (an integration or ruleset key) (required: False, sensitive: False) - **api_key**: Api Key (a user or team API key) (required: False, sensitive: True) - **oauth_data**: For oauth flow (required: False, sensitive: True) - **service_id**: Service Id (if provided, keep will only operate on this service) (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **incidents_read**: Read incidents data. (mandatory) - **incidents_write**: Write incidents. - **webhook_subscriptions_read**: Read webhook data. - **webhook_subscriptions_write**: Write webhooks. ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query pagerduty provider: pagerduty config: "{{ provider.my_provider_name }}" with: incident_id: {value} incident_key: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query pagerduty provider: pagerduty config: "{{ provider.my_provider_name }}" with: title: {value} # Title of the alert or incident dedup: {value} # String used to deduplicate alerts for events API, max 255 chars service_id: {value} # ID of the service for incidents routing_key: {value} # API routing_key (optional), if not specified, falls back to the one provided in the provider requester: {value} # Email of the user requesting the incident creation incident_id: {value} # Key to identify the incident. UUID generated if not provided event_type: {value} # Event type for events API (trigger/acknowledge/resolve) severity: {value} # Severity for events API (critical/error/warning/info) source: {value} # Source field for events API priority: {value} # Priority reference ID for incidents status: {value} # Status for incident updates (resolved/acknowledged) resolution: {value} # Resolution note for resolved incidents body: {value} # Body of the incident as per https://developer.pagerduty.com/api-reference/a7d81b0e9200f-create-an-incident#request-body kwargs: {value} # Additional event/incident fields ``` Check the following workflow examples: - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [pagerduty.yml](https://github.com/keephq/keep/blob/main/examples/workflows/pagerduty.yml) ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology).
================================================ FILE: docs/snippets/providers/pagertree-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_token**: Your pagertree APIToken (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: The user can connect to the server and is authenticated using their API_Key (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query pagertree provider: pagertree config: "{{ provider.my_provider_name }}" with: title: {value} # Title of the alert. urgency: {value} # low|medium|high|critical incident: {value} # True if the alert is an incident severities: {value} # SEV-1|SEV-2|SEV-3|SEV-4|SEV-5|SEV_UNKNOWN incident_message: {value} # Message to be displayed in the incident description: {value} # UTF-8 string of custom message for alert. Shown in incident description status: {value} # alert status to send destination_team_ids: {value} # destination team_ids to send alert to destination_router_ids: {value} # destination router_ids to send alert to destination_account_user_ids: {value} # destination account_users_ids to send alert to # Additional parameters to be passed ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/parseable-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **parseable_server**: Parseable Frontend URL (required: True, sensitive: False) - **username**: Parseable username (required: True, sensitive: False) - **password**: Parseable password (required: True, sensitive: True) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Connecting via Webhook (omnidirectional) This is an example of how to configure an alert to be sent to Keep using Parseable's webhook feature. Post this to https://YOUR_PARSEABLE_SERVER/api/v1/logstream/YOUR_STREAM_NAME/alert ``` {{ "version": "v1", "alerts": [ {{ "name": "Alert: Server side error", "message": "server reporting status as 500", "rule": {{ "type": "column", "config": {{ "column": "status", "operator": "=", "value": 500, "repeats": 2 }} }}, "targets": [ {{ "type": "webhook", "endpoint": "KEEP_BACKEND_URL/alerts/event/parseable", "skip_tls_check": true, "repeat": {{ "interval": "10s", "times": 5 }}, "headers": {{"X-API-KEY": "{api_key}"}} }} ] }} ] }} ``` ================================================ FILE: docs/snippets/providers/pingdom-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Pingdom API Key (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **read**: Read alerts from Pingdom. (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Connecting via Webhook (omnidirectional) Install Keep as Pingdom webhook 1. Go to Settings > Integrations. 2. Click Add Integration. 3. Enter: Type = Webhook Name = Keep URL = Your Keep Backend URL 4. Click Save Integration. ================================================ FILE: docs/snippets/providers/planner-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **tenant_id**: Planner Tenant ID (required: True, sensitive: True) - **client_id**: Planner Client ID (required: True, sensitive: True) - **client_secret**: Planner Client Secret (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query planner provider: planner config: "{{ provider.my_provider_name }}" with: plan_id: {value} title: {value} bucket_id: {value} ``` Check the following workflow example: - [planner_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/planner_basic.yml) ================================================ FILE: docs/snippets/providers/postgres-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **username**: Postgres username (required: True, sensitive: False) - **password**: Postgres password (required: True, sensitive: True) - **host**: Postgres hostname (required: True, sensitive: False) - **database**: Postgres database name (required: False, sensitive: False) - **port**: Postgres port (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connect_to_server**: The user can connect to the server (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query postgres provider: postgres config: "{{ provider.my_provider_name }}" with: query: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query postgres provider: postgres config: "{{ provider.my_provider_name }}" with: query: {value} ``` Check the following workflow example: - [disk_grown_defects_rule.yml](https://github.com/keephq/keep/blob/main/examples/workflows/disk_grown_defects_rule.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **execute_query** Query the Postgres database (view, scopes: no additional scopes) ================================================ FILE: docs/snippets/providers/posthog-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: PostHog API key (required: True, sensitive: True) - **project_id**: PostHog project ID (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **session_recording:read**: Read PostHog session recordings (mandatory) - **session_recording_playlist:read**: Read PostHog session recording playlists - **project:read**: Read PostHog project data (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query posthog provider: posthog config: "{{ provider.my_provider_name }}" with: query_type: {value} # Type of query (e.g., "session_recording_domains", "session_recordings") hours: {value} # Number of hours to look back limit: {value} # Maximum number of items to fetch # Additional arguments ``` Check the following workflow example: - [posthog_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/posthog_example.yml) ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **get_session_recording_domains** Get a list of domains from session recordings within a time period (action, scopes: session_recording:read, project:read) - `hours`: Number of hours to look back (default: 24) - `limit`: Maximum number of recordings to fetch (default: 100) - **get_session_recordings** Get session recordings within a time period (action, scopes: session_recording:read, project:read) - `hours`: Number of hours to look back (default: 24) - `limit`: Maximum number of recordings to fetch (default: 100) ================================================ FILE: docs/snippets/providers/prometheus-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **url**: Prometheus server URL (required: True, sensitive: False) - **username**: Prometheus username (required: False, sensitive: False) - **password**: Prometheus password (required: False, sensitive: True) - **verify**: Verify SSL certificates (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connectivity**: Connectivity Test (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query prometheus provider: prometheus config: "{{ provider.my_provider_name }}" with: query: {value} ``` Check the following workflow examples: - [create_service_now_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_service_now_ticket_upon_alerts.yml) - [enrich_using_structured_output_from_deepseek.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_deepseek.yaml) - [enrich_using_structured_output_from_openai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_openai.yaml) - [enrich_using_structured_output_from_vllm_qwen.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_vllm_qwen.yaml) - [http_enrich.yml](https://github.com/keephq/keep/blob/main/examples/workflows/http_enrich.yml) - [multi-condition-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/multi-condition-cel.yml) ## Connecting via Webhook (omnidirectional) This provider takes advantage of configurable webhooks available with Prometheus Alertmanager. 
Use the following template to configure AlertManager: ``` route: receiver: "keep" group_by: ['alertname'] group_wait: 15s group_interval: 15s repeat_interval: 1m continue: true receivers: - name: "keep" webhook_configs: - url: 'KEEP_BACKEND_URL/alerts/event/prometheus' send_resolved: true http_config: basic_auth: username: api_key password: {api_key} ``` ================================================ FILE: docs/snippets/providers/pushover-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **token**: Pushover app token (required: True, sensitive: True) - **user_key**: Pushover user key (required: True, sensitive: False) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query pushover provider: pushover config: "{{ provider.my_provider_name }}" with: message: {value} # The content of the message. ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/python-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query python provider: python config: "{{ provider.my_provider_name }}" with: code: {value} imports: {value} ``` Check the following workflow examples: - [bash_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/bash_example.yml) - [mustache-paths-example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/mustache-paths-example.yml) ================================================ FILE: docs/snippets/providers/quickchart-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Quickchart API Key (required: False, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query quickchart provider: quickchart config: "{{ provider.my_provider_name }}" with: fingerprint: {value} status: {value} chartConfig: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/redmine-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host**: Redmine Host (required: True, sensitive: False) - **api_access_key**: Redmine API Access key (required: True, sensitive: True) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **authenticated**: Authenticated with Redmine API (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query redmine provider: redmine config: "{{ provider.my_provider_name }}" with: project_id: {value} subject: {value} priority_id: {value} description: {value} ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/resend-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Resend API key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query resend provider: resend config: "{{ provider.my_provider_name }}" with: _from: {value} # From email address to: {value} # To email address subject: {value} # Email subject html: {value} # Email body ``` Check the following workflow example: - [bash_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/bash_example.yml) ================================================ FILE: docs/snippets/providers/rollbar-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **rollbarAccessToken**: Project Access Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authenticated ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/s3-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **access_key**: S3 Access Token (Leave empty if using IAM role at EC2) (required: False, sensitive: True) - **secret_access_key**: S3 Secret Access Token (Leave empty if using IAM role at EC2) (required: False, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query s3 provider: s3 config: "{{ provider.my_provider_name }}" with: bucket: {value} ``` Check the following workflow examples: - [consts_and_dict.yml](https://github.com/keephq/keep/blob/main/examples/workflows/consts_and_dict.yml) - [update_workflows_from_s3.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_workflows_from_s3.yml) ================================================ FILE: docs/snippets/providers/salesforce-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Salesforce API key (required: True, sensitive: True) ## In workflows This provider can't be used as a "step" or "action" in workflows. 
If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/sendgrid-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: SendGrid API key (required: True, sensitive: True) - **from_email**: From email address (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **email.send**: Send emails using SendGrid (mandatory) ([Documentation](https://sendgrid.com/docs/API_Reference/api_v3.html)) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query sendgrid provider: sendgrid config: "{{ provider.my_provider_name }}" with: to: {value} # To email address or list of email addresses subject: {value} # Email subject html: {value} # Email body ``` Check the following workflow examples: - [consts_and_vars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/consts_and_vars.yml) - [sendgrid_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/sendgrid_basic.yml) ================================================ FILE: docs/snippets/providers/sentry-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **api_key**: Sentry API key (required: True, sensitive: True) - **organization_slug**: Sentry organization slug (required: True, sensitive: False) - **api_url**: Sentry API URL (required: False, sensitive: False) - **project_slug**: Sentry project slug within the organization (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - Write permission for projects in the organization ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query sentry provider: sentry config: "{{ provider.my_provider_name }}" with: project: {value} # project name time: {value} # time range, for example: 14d ``` Check the following workflow example: - [create_jira_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_jira_ticket_upon_alerts.yml) ================================================ FILE: docs/snippets/providers/servicenow-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **service_now_base_url**: The base URL of the ServiceNow instance (required: True, sensitive: False) - **username**: The username of the ServiceNow user (required: True, sensitive: False) - **password**: The password of the ServiceNow user (required: True, sensitive: True) - **client_id**: The client ID to use OAuth 2.0 based authentication (required: False, sensitive: False) - **client_secret**: The client secret to use OAuth 2.0 based authentication (required: False, sensitive: True) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **itil**: The user can read/write tickets from the table (mandatory) ([Documentation](https://docs.servicenow.com/bundle/sandiego-platform-administration/page/administer/roles/reference/r_BaseSystemRoles.html)) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query servicenow provider: servicenow config: "{{ provider.my_provider_name }}" with: table_name: {value} # The name of the table to query. incident_id: {value} # The incident ID to query. sysparm_limit: {value} # The maximum number of records to return. sysparm_offset: {value} # The offset to start from. ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query servicenow provider: servicenow config: "{{ provider.my_provider_name }}" with: table_name: {value} # The name of the table to create the ticket in. payload: {value} # The ticket payload. ticket_id: {value} # The ticket ID (optional to update a ticket). fingerprint: {value} # The fingerprint of the ticket (optional to update a ticket). 
``` Check the following workflow examples: - [blogpost.yml](https://github.com/keephq/keep/blob/main/examples/workflows/blogpost.yml) - [clickhouse_multiquery.yml](https://github.com/keephq/keep/blob/main/examples/workflows/clickhouse_multiquery.yml) - [create_service_now_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_service_now_ticket_upon_alerts.yml) - [update_service_now_tickets_status.yml](https://github.com/keephq/keep/blob/main/examples/workflows/update_service_now_tickets_status.yml) ## Topology This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **get_incidents** Fetch all incidents from ServiceNow (view, scopes: itil) - **get_incident_activities** Get work notes and comments from a ServiceNow incident (view, scopes: itil) - `incident_id`: The incident number (e.g. INC0010001) or sys_id. - `limit`: Maximum number of activity records to return. - **add_incident_activity** Add a work note or comment to a ServiceNow incident (action, scopes: itil) - `incident_id`: The incident number (e.g. INC0010001) or sys_id. - `content`: The text content to add. - `activity_type`: Either 'work_notes' or 'comments'. Defaults to 'work_notes'. 
================================================ FILE: docs/snippets/providers/signalfx-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **sf_token**: SignalFX token (required: True, sensitive: True) - **realm**: SignalFX Realm (required: False, sensitive: False) - **email**: SignalFX email. Required for setup webhook. (required: False, sensitive: True) - **password**: SignalFX password. Required for setup webhook. (required: False, sensitive: True) - **org_id**: SignalFX organization ID. Required for setup webhook. (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **API**: API authScope - read permission for SignalFx API (mandatory) ([Documentation](https://dev.splunk.com/observability/reference/api/org_tokens/latest#endpoint-create-single-token)) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/signl4-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **signl4_integration_secret**: SIGNL4 integration or team secret (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **signl4:create**: Create SIGNL4 alerts (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query signl4 provider: signl4 config: "{{ provider.my_provider_name }}" with: title: {value} # Alert title. message: {value} # Alert message. user: {value} # User name. s4_external_id: {value} # External ID. s4_status: {value} # Alert status. s4_service: {value} # Service name. s4_location: {value} # Location. s4_alerting_scenario: {value} # Alerting scenario. s4_filtering: {value} # Filtering. # Additional alert data. ``` Check the following workflow example: - [signl4-alerting-workflow.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/signl4-alerting-workflow.yaml) ================================================ FILE: docs/snippets/providers/site24x7-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **zohoRefreshToken**: Zoho Refresh Token (required: True, sensitive: True) - **zohoClientId**: Zoho Client ID (required: True, sensitive: True) - **zohoClientSecret**: Zoho Client Secret (required: True, sensitive: True) - **zohoAccountTLD**: Zoho Account's TLD (.com | .eu | .com.cn | .in | .com.au | .jp) (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is authenticated (mandatory) - **valid_tld**: TLD is amongst the list [.com | .eu | .com.cn | .in | .com.au | .jp] (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. 
If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/slack-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **webhook_url**: Slack Webhook Url (required: True, sensitive: True) - **access_token**: For access token installation flow, use Keep UI (required: False, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query slack provider: slack config: "{{ provider.my_provider_name }}" with: message: {value} # The content of the message. blocks: {value} # The blocks of the message. channel: {value} # The channel to send the message slack_timestamp: {value} # The timestamp of the message to update thread_timestamp: {value} # The timestamp of the thread to send the message attachments: {value} # The attachments of the message. username: {value} # The username of the message. notification_type: {value} # The type of notification. 
``` Check the following workflow examples: - [consts_and_vars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/consts_and_vars.yml) - [create_jira_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_jira_ticket_upon_alerts.yml) - [datadog-log-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/datadog-log-monitor.yml) - [db_disk_space_monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/db_disk_space_monitor.yml) - [elastic_enrich_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/elastic_enrich_example.yml) - [failed-to-login-workflow.yml](https://github.com/keephq/keep/blob/main/examples/workflows/failed-to-login-workflow.yml) - [gcp_logging_open_ai.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/gcp_logging_open_ai.yaml) - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [incident-tier-escalation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-tier-escalation.yml) - [new-auth0-users-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new-auth0-users-monitor.yml) - [new_github_stars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new_github_stars.yml) - [notify-new-trello-card.yml](https://github.com/keephq/keep/blob/main/examples/workflows/notify-new-trello-card.yml) - [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) - [opsgenie_open_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie_open_alerts.yml) - [permissions_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/permissions_example.yml) - [posthog_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/posthog_example.yml) - 
[query_clickhouse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_clickhouse.yml) - [query_victoriametrics.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victoriametrics.yml) - [raw_sql_query_datetime.yml](https://github.com/keephq/keep/blob/main/examples/workflows/raw_sql_query_datetime.yml) - [send_slack_message_on_failure.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/send_slack_message_on_failure.yaml) - [service-error-rate-monitor-datadog.yml](https://github.com/keephq/keep/blob/main/examples/workflows/service-error-rate-monitor-datadog.yml) - [slack-message-reaction.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack-message-reaction.yml) - [slack-workflow-trigger.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack-workflow-trigger.yml) - [slack_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic.yml) - [slack_basic_cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic_cel.yml) - [slack_basic_interval.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic_interval.yml) - [slack_message_update.yml](https://github.com/keephq/keep/blob/main/examples/workflows/slack_message_update.yml) - [workflow_only_first_time_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/workflow_only_first_time_example.yml) - [workflow_start_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/workflow_start_example.yml) - [zoom_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_example.yml) ================================================ FILE: docs/snippets/providers/smtp-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires 
authentication. - **smtp_server**: SMTP Server Address (required: True, sensitive: False) - **smtp_port**: SMTP port (required: True, sensitive: False) - **encryption**: SMTP encryption (required: True, sensitive: False) - **smtp_username**: SMTP username (required: False, sensitive: False) - **smtp_password**: SMTP password (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **send_email**: Send email using SMTP protocol (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query smtp provider: smtp config: "{{ provider.my_provider_name }}" with: from_email: {value} from_name: {value} to_email: {value} subject: {value} body: {value} html: {value} ``` Check the following workflow examples: - [send_smtp_email.yml](https://github.com/keephq/keep/blob/main/examples/workflows/send_smtp_email.yml) - [send_smtp_html_email.yml](https://github.com/keephq/keep/blob/main/examples/workflows/send_smtp_html_email.yml) ================================================ FILE: docs/snippets/providers/snowflake-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **user**: Snowflake user (required: True, sensitive: False) - **account**: Snowflake account (required: True, sensitive: False) - **pkey**: Snowflake private key (required: True, sensitive: True) - **pkey_passphrase**: Snowflake private key passphrase (required: False, sensitive: True) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query snowflake provider: snowflake config: "{{ provider.my_provider_name }}" with: query: {value} # query to execute ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/splunk-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Splunk API Key (required: True, sensitive: True) - **host**: Splunk Host (default is localhost) (required: False, sensitive: False) - **port**: Splunk Port (default is 8089) (required: False, sensitive: False) - **verify**: Enable SSL verification (required: False, sensitive: False) - **username**: The username connected with the API key/token provided. (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **list_all_objects**: The user can get all the alerts (mandatory) - **edit_own_objects**: The user can edit and add webhook to saved_searches (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/squadcast-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **service_region**: Service region: EU/US (required: True, sensitive: False) - **refresh_token**: Squadcast Refresh Token (required: False, sensitive: True) - **webhook_url**: Incident webhook url (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: The user can connect to the client ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query squadcast provider: squadcast config: "{{ provider.my_provider_name }}" with: notify_type: {value} message: {value} description: {value} incident_id: {value} priority: {value} tags: {value} status: {value} event_id: {value} attachments: {value} additional_json: {value} ``` Check the following workflow example: - [squadcast_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/squadcast_example.yml) ================================================ FILE: docs/snippets/providers/ssh-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host**: SSH hostname (required: True, sensitive: False) - **user**: SSH user (required: True, sensitive: False) - **port**: SSH port (required: False, sensitive: False) - **pkey**: SSH private key (required: False, sensitive: True) - **password**: SSH password (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **ssh_access**: The provided credentials grant access to the SSH server ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query ssh provider: ssh config: "{{ provider.my_provider_name }}" with: command: {value} query: {value} # command to execute ``` Check the following workflow example: - [businesshours.yml](https://github.com/keephq/keep/blob/main/examples/workflows/businesshours.yml) ================================================ FILE: docs/snippets/providers/statuscake-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Statuscake API Key (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **alerts**: Read alerts from Statuscake ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/sumologic-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **sumoAccessId**: SumoLogic Access ID (required: True, sensitive: False) - **sumoAccessKey**: SumoLogic Access Key (required: True, sensitive: True) - **deployment**: Deployment Region (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authorized (mandatory) - **authorized**: Required privileges (mandatory) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/teams-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **webhook_url**: Teams Webhook Url (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query teams provider: teams config: "{{ provider.my_provider_name }}" with: message: {value} # The message to send typeCard: {value} # The card type. Can be "MessageCard" (legacy) or "message" (for Adaptive Cards). Default is "message" themeColor: {value} # Hexadecimal color (only used with MessageCard type) sections: {value} # For MessageCard: Array of custom information sections. For Adaptive Cards: Array of card elements following the Adaptive Card schema. Can be provided as a JSON string or array. schema: {value} # Schema URL for Adaptive Cards. Default is "http://adaptivecards.io/schemas/adaptive-card.json" attachments: {value} # Custom attachments array for Adaptive Cards (overrides default attachment structure). Can be provided as a JSON string or array. mentions: {value} # List of user mentions to include in the Adaptive Card. Each mention should be a dict with 'id' (user ID, Microsoft Entra Object ID, or UPN) and 'name' (display name) keys. 
Example: [{"id": "user-id-123", "name": "John Doe"}, {"id": "john.doe@example.com", "name": "John Doe"}] ``` Check the following workflow examples: - [create_jira_ticket_upon_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_jira_ticket_upon_alerts.yml) - [teams-adaptive-card-notifier.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/teams-adaptive-card-notifier.yaml) - [teams-adaptive-cards-with-mentions.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/teams-adaptive-cards-with-mentions.yaml) ================================================ FILE: docs/snippets/providers/telegram-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **bot_token**: Telegram Bot Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. 
As "action" to make changes or update data, example: ```yaml actions: - name: Query telegram provider: telegram config: "{{ provider.my_provider_name }}" with: chat_id: {value} # Unique identifier for the target chat or username of the target channel topic_id: {value} # Unique identifier for the target message thread (topic) message: {value} # Message to be sent reply_markup: {value} # Inline keyboard markup to be attached to the message reply_markup_layout: {value} # Direction of the reply markup, could be "horizontal" or "vertical" parse_mode: {value} # Mode for parsing entities in the message text, could be "markdown" or "html" image_url: {value} # URL of the image to be attached to the message caption_on_image: {value} # Whether to use the message as a caption for the image ``` Check the following workflow examples: - [send-message-telegram-with-htmlmd.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/send-message-telegram-with-htmlmd.yaml) - [telegram_advanced.yml](https://github.com/keephq/keep/blob/main/examples/workflows/telegram_advanced.yml) - [telegram_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/telegram_basic.yml) ================================================ FILE: docs/snippets/providers/test_fluxcd-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/thousandeyes-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **oauth2_token**: OAuth2 Bearer Token (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: User is Authenticated ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/trello-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Trello API Key (required: True, sensitive: True) - **api_token**: Trello API Token (required: True, sensitive: True) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query trello provider: trello config: "{{ provider.my_provider_name }}" with: board_id: {value} # Trello board ID filter: {value} # Trello action filter ``` Check the following workflow example: - [notify-new-trello-card.yml](https://github.com/keephq/keep/blob/main/examples/workflows/notify-new-trello-card.yml) ================================================ FILE: docs/snippets/providers/twilio-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **account_sid**: Twilio Account SID (required: True, sensitive: False) - **api_token**: Twilio API Token (required: True, sensitive: True) - **from_phone_number**: Twilio Phone Number (required: True, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **send_sms**: The API token has permission to send the SMS (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query twilio provider: twilio config: "{{ provider.my_provider_name }}" with: message_body: {value} # The content of the SMS message to be sent. Defaults to "". to_phone_number: {value} # The recipient's phone number. Defaults to "". ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/uptimekuma-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: UptimeKuma Host URL (required: True, sensitive: False) - **username**: UptimeKuma Username (required: True, sensitive: False) - **password**: UptimeKuma Password (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **alerts**: Read alerts from UptimeKuma ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/vectordev-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: API key (required: True, sensitive: True) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/victorialogs-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **host_url**: VictoriaLogs Host URL (required: True, sensitive: False) - **authentication_type**: Authentication Type (required: True, sensitive: False) - **username**: HTTP basic authentication - Username (required: False, sensitive: False) - **password**: HTTP basic authentication - Password (required: False, sensitive: True) - **bearer_token**: Bearer Token (required: False, sensitive: True) - **x_scope_orgid**: X-Scope-OrgID Header (required: False, sensitive: False) - **insecure**: Skip TLS verification (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **authenticated**: The instance is valid and the user is authenticated ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query victorialogs provider: victorialogs config: "{{ provider.my_provider_name }}" with: queryType: {value} query: {value} time: {value} start: {value} end: {value} step: {value} account_id: {value} project_id: {value} limit: {value} timeout: {value} ``` Check the following workflow example: - [query_victorialogs.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victorialogs.yaml) ================================================ FILE: docs/snippets/providers/victoriametrics-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **VMAlertHost**: The hostname or IP address where VMAlert is running (required: False, sensitive: False) - **VMAlertPort**: The port number on which VMAlert is listening (required: False, sensitive: False) - **VMAlertURL**: The full URL to the VMAlert instance. 
Alternative to Host/Port (required: False, sensitive: False) - **VMBackendHost**: The hostname or IP address where VictoriaMetrics backend is running (required: False, sensitive: False) - **VMBackendPort**: The port number on which VictoriaMetrics backend is listening (required: False, sensitive: False) - **VMBackendURL**: The full URL to the VictoriaMetrics backend. Alternative to Host/Port (required: False, sensitive: False) - **BasicAuthUsername**: Username for basic authentication (required: False, sensitive: False) - **BasicAuthPassword**: Password for basic authentication (required: False, sensitive: True) - **SkipValidation**: Enter 'true' to skip validation of authentication (required: False, sensitive: False) - **insecure**: Skip TLS verification (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **connected**: The user can connect to the client (mandatory) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query victoriametrics provider: victoriametrics config: "{{ provider.my_provider_name }}" with: query: {value} start: {value} end: {value} step: {value} queryType: {value} ``` Check the following workflow examples: - [create_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_alert_from_vm_metric.yml) - [create_multi_alert_from_vm_metric.yml](https://github.com/keephq/keep/blob/main/examples/workflows/create_multi_alert_from_vm_metric.yml) - [query_victoriametrics.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victoriametrics.yml) ## Connecting via Webhook (omnidirectional) This provider takes advantage of configurable webhooks available with Prometheus Alertmanager. 
Use the following template to configure Alertmanager: ``` route: receiver: "keep" group_by: ['alertname'] group_wait: 15s group_interval: 15s repeat_interval: 1m continue: true receivers: - name: "keep" webhook_configs: - url: 'KEEP_BACKEND_URL/alerts/event/victoriametrics' send_resolved: true http_config: basic_auth: username: api_key password: {api_key} ``` ================================================ FILE: docs/snippets/providers/vllm-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_url**: vLLM API endpoint URL (required: True, sensitive: False) - **api_key**: Optional API key if your vLLM deployment requires authentication (required: False, sensitive: True) ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query vllm provider: vllm config: "{{ provider.my_provider_name }}" with: prompt: {value} temperature: {value} model: {value} max_tokens: {value} structured_output_format: {value} ``` Check the following workflow example: - [enrich_using_structured_output_from_vllm_qwen.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/enrich_using_structured_output_from_vllm_qwen.yaml) ================================================ FILE: docs/snippets/providers/wazuh-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/webhook-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **url**: Webhook URL (required: True, sensitive: False) - **verify**: Enable SSL verification (required: False, sensitive: False) - **method**: HTTP method (required: True, sensitive: False) - **http_basic_authentication_username**: HTTP basic authentication - Username (required: False, sensitive: False) - **http_basic_authentication_password**: HTTP basic authentication - Password (required: False, sensitive: True) - **api_key**: API key (required: False, sensitive: True) - **headers**: Headers (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **send_webhook**: (mandatory) ## In workflows This provider can be used in workflows. 
As "step" to query data, example: ```yaml steps: - name: Query webhook provider: webhook config: "{{ provider.my_provider_name }}" with: url: {value} method: {value} http_basic_authentication_username: {value} http_basic_authentication_password: {value} api_key: {value} headers: {value} body: {value} params: {value} fail_on_error: {value} ``` As "action" to make changes or update data, example: ```yaml actions: - name: Query webhook provider: webhook config: "{{ provider.my_provider_name }}" with: body: {value} params: {value} ``` Check the following workflow examples: - [webhook_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) - [zoom_chat_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_chat_example.yml) ================================================ FILE: docs/snippets/providers/websocket-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## In workflows This provider can be used in workflows. As "step" to query data, example: ```yaml steps: - name: Query websocket provider: websocket config: "{{ provider.my_provider_name }}" with: socket_url: {value} # The websocket URL to query. timeout: {value} # Connection Timeout. Defaults to None. data: {value} # Data to send through the websocket. Defaults to None. ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/youtrack-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **host_url**: YouTrack Host URL (required: True, sensitive: False) - **project_id**: YouTrack Project ID (required: True, sensitive: False) - **permanent_token**: YouTrack Permanent Token (required: True, sensitive: True) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **create_issue**: (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query youtrack provider: youtrack config: "{{ provider.my_provider_name }}" with: summary: {value} description: {value} ``` Check the following workflow example: - [create-issue-youtrack.yaml](https://github.com/keephq/keep/blob/main/examples/workflows/create-issue-youtrack.yaml) ================================================ FILE: docs/snippets/providers/zabbix-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **zabbix_frontend_url**: Zabbix Frontend URL (required: True, sensitive: False) - **auth_token**: Zabbix Auth Token (required: True, sensitive: True) - **verify**: Verify SSL certificates (required: False, sensitive: False) Certain scopes may be required to perform specific actions or queries via the provider. 
Below is a summary of relevant scopes and their use cases: - **action.create**: This method allows creating new actions. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/action/create)) - **action.get**: This method allows retrieving actions. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/action/get)) - **event.acknowledge**: This method allows updating events. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/event/acknowledge)) - **mediatype.create**: This method allows creating new media types. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/create)) - **mediatype.get**: This method allows retrieving media types. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/get)) - **mediatype.update**: This method allows updating media types. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/update)) - **problem.get**: This method allows retrieving problems. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/problem/get)) - **script.create**: This method allows creating new scripts. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/script/create)) - **script.get**: This method allows retrieving scripts. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/script/get)) - **script.update**: This method allows updating scripts. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/script/update)) - **user.get**: This method allows retrieving users. 
(mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/user/get)) - **user.update**: This method allows updating users. (mandatory) ([Documentation](https://www.zabbix.com/documentation/current/en/manual/api/reference/user/update)) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ## Provider Methods The provider exposes the following [Provider Methods](/providers/provider-methods#via-ai-assistant). They are available in the [AI Assistant](/overview/ai-incident-assistant). - **close_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - **change_severity** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - `new_severity`: The new severity. Can be an integer string (0-5) or severity name: - "0" or "Not classified" - "1" or "Information" - "2" or "Warning" - "3" or "Average" - "4" or "High" - "5" or "Disaster" - **surrpress_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - `suppress_until`: The datetime to suppress the problem until. - **unsurrpress_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - **acknowledge_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - **unacknowledge_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - **add_message_to_problem** No description. (action, scopes: event.acknowledge) - `id`: The problem id. - `message_text`: The message text. - **get_problem_messages** No description. (view, scopes: problem.get) - `id`: The problem id. 
================================================ FILE: docs/snippets/providers/zendesk-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Zendesk API key (required: True, sensitive: True) - **zendesk_domain**: Zendesk domain (required: True, sensitive: False) - **ticket_creation_url**: URL for creating new tickets (required: False, sensitive: False) ## In workflows This provider can't be used as a "step" or "action" in workflows. If you want to use it, please let us know by creating an issue in the [GitHub repository](https://github.com/keephq/keep/issues). ================================================ FILE: docs/snippets/providers/zenduty-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **api_key**: Zenduty api key (required: True, sensitive: True) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query zenduty provider: zenduty config: "{{ provider.my_provider_name }}" with: title: {value} # Title of the incident summary: {value} # Summary of the incident service: {value} # Service ID in Zenduty user: {value} # User ID in Zenduty policy: {value} # Policy ID in Zenduty ``` If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). 
================================================ FILE: docs/snippets/providers/zoom-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. - **account_id**: Zoom Account ID (required: True, sensitive: True) - **client_id**: Zoom Client ID (required: True, sensitive: True) - **client_secret**: Zoom Client Secret (required: True, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **create_meeting**: Create a new Zoom meeting (mandatory) ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query zoom provider: zoom config: "{{ provider.my_provider_name }}" with: topic: {value} start_time: {value} duration: {value} timezone: {value} record_meeting: {value} host_email: {value} ``` Check the following workflow examples: - [zoom_chat_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_chat_example.yml) - [zoom_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_example.yml) ================================================ FILE: docs/snippets/providers/zoom_chat-snippet-autogenerated.mdx ================================================ {/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication This provider requires authentication. 
- **webhook_url**: Zoom Incoming Webhook Full Format Url (required: True, sensitive: True) - **authorization_token**: Incoming Webhook Authorization Token (required: True, sensitive: True) - **account_id**: Zoom Account ID (required: False, sensitive: True) - **client_id**: Zoom Client ID (required: False, sensitive: True) - **client_secret**: Zoom Client Secret (required: False, sensitive: True) Certain scopes may be required to perform specific actions or queries via the provider. Below is a summary of relevant scopes and their use cases: - **user:read:user:admin**: View a Zoom user's details - **user:read:list_users:admin**: List Zoom users ## In workflows This provider can be used in workflows. As "action" to make changes or update data, example: ```yaml actions: - name: Query zoom_chat provider: zoom_chat config: "{{ provider.my_provider_name }}" with: severity: {value} # The severity of the alert. title: {value} # The title to use for the message. (optional) message: {value} # The text message to send. Supports Markdown formatting. tagged_users: {value} # A list of Zoom user email addresses to tag. (optional) details_url: {value} # A URL linking to more information. (optional) ``` Check the following workflow example: - [zoom_chat_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/zoom_chat_example.yml) ================================================ FILE: docs/workflows/examples/autosupress.mdx ================================================ --- title: "Suppressing Alerts Automatically" --- Link to the [workflow](https://github.com/keephq/keep/blob/main/examples/workflows/autosupress.yml). This workflow demonstrates how to suppress alerts by marking them as dismissed. Explanation: - Trigger: Activated by any alert. - Action: Enrich the alert by adding a `dismissed` field with the value `true`. 
```yaml workflow: id: autosupress description: demonstrates how to automatically suppress alerts triggers: - type: alert actions: - name: dismiss-alert provider: type: mock with: enrich_alert: - key: dismissed value: "true" ``` ================================================ FILE: docs/workflows/examples/buisnesshours.mdx ================================================ --- title: "Executing Actions During Business Hours" --- Link to the [workflow](https://github.com/keephq/keep/blob/main/examples/workflows/businesshours.yml). This workflow demonstrates how to take actions only during specified business hours. Explanation: - Trigger: Activated by an alert or manually. - Action: Check if the current time falls within business hours in the `America/New_York` timezone. If yes, enrich the alert with a `businesshours` field set to `true`. ```yaml workflow: id: businesshours description: demonstrate how to do smth only when it's business hours triggers: - type: alert - type: manual actions: - name: dismiss-alert if: "keep.is_business_hours(timezone='America/New_York')" provider: type: mock with: enrich_alert: - key: businesshours value: "true" ``` ================================================ FILE: docs/workflows/examples/create-servicenow-tickets.mdx ================================================ --- title: "Creating ServiceNow Tickets for Alerts" --- Link to the [workflow](https://github.com/keephq/keep/blob/main/examples/workflows/create_service_now_ticket_upon_alerts.yml). This workflow creates a ServiceNow ticket whenever an alert from Grafana or Prometheus is triggered. Explanation: - Trigger: Activated by alerts from Grafana or Prometheus. - Action: If the alert does not already have a ticket ID, create a ServiceNow ticket and enrich the alert with details like ticket ID, URL, and status. 
```yaml workflow: id: servicenow description: create a ticket in servicenow when an alert is triggered triggers: - type: alert cel: source.contains("grafana") || source.contains("prometheus") actions: - name: create-service-now-ticket if: "not '{{ alert.ticket_id }}' and {{ alert.annotations.ticket_type }}" provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: "{{ alert.annotations.ticket_type }}" payload: short_description: "{{ alert.name }} - {{ alert.description }} [created by Keep][fingerprint: {{alert.fingerprint}}]" description: "{{ alert.description }}" enrich_alert: - key: ticket_type value: servicenow - key: ticket_id value: results.sys_id - key: ticket_url value: results.link - key: ticket_status value: results.stage ``` ================================================ FILE: docs/workflows/examples/highsev.mdx ================================================ --- title: "Handling High-Severity Sentry Alerts" --- Link to the [workflow](https://github.com/keephq/keep/blob/main/examples/workflows/create_jira_ticket_upon_alerts.yml). This workflow handles critical alerts from Sentry based on the service they are associated with. Explanation: - Trigger: Activated by critical alerts from Sentry. - Actions: - - Send a Slack message to the payments team for alerts related to the `payments` service. - - Create a Jira ticket for alerts related to the `ftp` service if a ticket ID is not already present. 
```yaml workflow: id: sentry-alerts description: handle alerts triggers: - type: alert cel: source.contains("sentry") && severity == "critical" && (service == "payments" || service == "ftp") actions: - name: send-slack-message-team-payments if: "'{{ alert.service }}' == 'payments'" provider: type: slack config: "{{ providers.team-payments-slack }}" with: message: | "A new alert from Sentry: Alert: {{ alert.name }} - {{ alert.description }} {{ alert }}" - name: create-jira-ticket-oncall-board if: "'{{ alert.service }}' == 'ftp' and not '{{ alert.ticket_id }}'" provider: type: jira config: "{{ providers.jira }}" with: board_name: "Oncall Board" custom_fields: customfield_10201: "Critical" issuetype: "Task" summary: "{{ alert.name }} - {{ alert.description }} (created by Keep)" description: | "This ticket was created by Keep. Please check the alert details below: {code:json} {{ alert }} {code}" enrich_alert: - key: ticket_type value: jira - key: ticket_id value: results.issue.key - key: ticket_url value: results.ticket_url ``` ================================================ FILE: docs/workflows/examples/update-servicenow-tickets.mdx ================================================ --- title: "Update ServiceNow Tickets" --- Link to the [workflow](https://github.com/keephq/keep/blob/main/examples/workflows/update_service_now_tickets_status.yml). This example demonstrates how to periodically update the status of ServiceNow tickets associated with alerts. Explanation: - Trigger: The workflow can be triggered manually, simulating the scheduled execution. - Step 1: Fetch all alerts with a `ticket_type` of `servicenow` using the Keep provider. - Action: Iterate over the fetched alerts and update their associated ServiceNow tickets with the latest status. 
```yaml workflow: id: servicenow description: update the ticket status every minute triggers: - type: manual steps: - name: get-alerts provider: type: keep with: cel: ticket_type == "servicenow" actions: - name: update-ticket foreach: "{{ steps.get-alerts.results }}" provider: type: servicenow config: "{{ providers.servicenow }}" with: ticket_id: "{{ foreach.value.alert_enrichment.enrichments.ticket_id }}" table_name: "{{ foreach.value.alert_enrichment.enrichments.table_name }}" fingerprint: "{{ foreach.value.alert_fingerprint }}" enrich_alert: - key: ticket_status value: results.state ``` ================================================ FILE: docs/workflows/overview.mdx ================================================ --- title: "Overview" --- You can see plenty of fully working examples at our [GitHub repo](https://github.com/keephq/keep/blob/main/examples/workflows/). Keep Workflow Engine is designed to streamline and automate operational tasks by integrating triggers, steps, actions, and conditions. This documentation provides an overview of the core concepts used to define and execute workflows effectively. ### General Structure Each workflow is composed of: 1. **metadata** - id, description 2. **triggers** - when does this workflow run? 3. **steps/actions** - what should this workflow do? The general structure of a workflow is: ```yaml workflow: id: aks-example description: aks-example triggers: # list of triggers - type: manual steps: # list of steps - name: some-step provider: type: some-provider-type config: "{{ providers.provider_id }}" with: # provider configuration - ... actions: - name: some-action provider: type: some-provider-type with: # provider configuration - ...
``` Let's dive into building workflows: - [Triggers](#triggers) - [Steps And Actions](#steps-and-actions) - [Conditions](#conditions) - [Functions](#functions) - [Context](#context) - [Providers](#providers) - [Variables](#variables) - [Foreach Loops](#foreach-loops) - [Alert Enrichment](#alert-enrichment) ### Triggers Define how a workflow starts, such as manually, on a schedule, or in response to alerts with optional filters for specific conditions. [See syntax](/workflows/syntax/triggers) ### Steps And Actions Represent sequential operations, like querying data or running scripts, using configurable providers. [See syntax](/workflows/syntax/steps-and-actions) ### Conditions Allow decision-making in actions based on thresholds, assertions, or previous step results. [See syntax](/workflows/syntax/conditions) ### Functions Built-in helpers like datetime_compare or is_business_hours simplify complex operations. [See syntax](/workflows/syntax/functions) ### Context Enables access to and reuse of outputs from earlier steps within actions or conditions. [See syntax](/workflows/syntax/context) ### Providers External systems or services (e.g., Slack, Datadog, ServiceNow) integrated into workflows through a standard configuration interface. [See syntax](/workflows/syntax/providers) ### Foreach Loops Iterate over a list of results from a step to perform repeated actions for each item. [See syntax](/workflows/syntax/foreach) ### Alert Enrichment Add context to alerts, like customer details or ticket metadata, using enrichment mechanisms in steps or actions. [See syntax](/workflows/syntax/enrichment) ================================================ FILE: docs/workflows/syntax/conditions.mdx ================================================ --- title: "Conditions" --- # Conditions Attach a condition to any step or action to decide at runtime whether it should run. 
A condition is a mustache expression that can reference outputs from earlier steps, workflow variables, or any other data in the execution context. Using conditions, you can introduce decision-making into workflows by asserting values, thresholds, or specific states. ### Simple `if` condition ```yaml actions: - name: notify-slack if: "{{ alert.cpu_load }} == '70'" provider: type: slack config: "{{ providers.slack }}" with: message: "The CPU load exceeded the threshold!" ``` **Values of variables will be quoted when evaluated**. For example, if `alert.cpu_load` is `70`, it will resolve to `'70'` (number quoted with single quotes). ### Using results of other steps in condition ```yaml workflow: id: query-and-alert description: "Query a database and notify only if a threshold is met" steps: - name: get-disk-usage provider: type: mysql config: "{{ providers.mysql-prod }}" with: query: "SELECT disk_usage FROM metrics WHERE server = 'db1'" single_row: true actions: - name: notify-slack if: "{{ steps.get-disk-usage.results.disk_usage }} > 90" provider: type: slack config: "{{ providers.slack }}" with: message: "Disk usage is critical: {{ steps.get-disk-usage.results.disk_usage }}%" ``` ### Complex logic ```yaml actions: - name: create-incident if: "{{ steps.get-alert.results.severity }} == 'critical' and {{ steps.get-alert.results.source }} == 'datadog'" provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: INCIDENT payload: short_description: "Critical Datadog alert received" ``` ### Condition with foreach ```yaml actions: - name: process-pods foreach: "{{ steps.get-pods.results }}" if: "{{ foreach.value.status.phase }} == 'Failed'" provider: type: slack with: message: "Pod {{ foreach.value.metadata.name }} has failed!" 
``` ### Condition with constants ```yaml consts: max_load: 70 actions: - name: process-pods if: "{{ alert.cpu_load }} > {{ consts.max_load }}" provider: type: slack with: message: "Pod {{ foreach.value.metadata.name }} has failed!" ``` --- ## Explicit condition blocks (deprecated) Explicit condition blocks are deprecated and will be discontinued. Use the `if` syntax instead. ### assert (deprecated) Checks whether a specific assertion is true. ```yaml condition: - name: assert-condition type: assert assert: "{{ steps.get-data.results.value }} == 'expected'" ``` ### threshold (deprecated) Compares a value to a threshold using operators like `>` (gt) and `<` (lt); defaults to `>` (gt). ```yaml condition: - name: threshold-condition type: threshold value: "{{ steps.get-data.results.value }}" compare_to: 100 compare_type: gt ``` ================================================ FILE: docs/workflows/syntax/context.mdx ================================================ --- title: "Context" --- The **Context** in Keep workflows allows you to reference and utilize data dynamically across different parts of your workflow. Context variables give you access to runtime data such as alert details, results from previous steps or actions, and constants defined in your workflow. This capability makes workflows flexible, reusable, and able to handle complex scenarios dynamically. --- ## Accessing Context Context variables can be accessed using curly braces (`{{ }}`). You can use these variables directly in triggers, steps, and actions. The context includes: 1. **Alert Data**: Access data from the alert triggering the workflow. 2. **Incident Data**: If the workflow is incident-based, you can access the incident's attributes. 3. **Steps and Actions Results**: Retrieve data produced by previous steps or actions using their unique IDs.
### Alert Data You can access attributes of the alert anywhere in the workflow: ```yaml message: "Alert triggered: {{ alert.name }} - Severity: {{ alert.severity }}" ``` ### Incident Data For incident workflows, access incident-related context: ```yaml if: "{{ incident.current_tier == 1 }}" ``` ### Steps Results Access results from previous steps: ```yaml message: "Query results: {{ steps.get-max-datetime.results }}" ``` ### Action Results Retrieve data from completed actions: ```yaml if: "{{ actions.trigger-email.results.success }}" ``` ### Constants Define reusable values in the workflow and access them: ```yaml consts: alert_message: "Critical system alert!" escalation_policy: "tier-1" slack_channels: sre_team: CH00001 payments_team: CH00002 actions: - name: notify-slack if: "{{alert.source}} == 'datadog'" provider: type: slack config: "{{ providers.slack }}" with: channel: "{{ consts.slack_channels.sre_team }}" message: "{{ consts.alert_message }}" ``` ## Using Context in Loops When iterating over data in a `foreach` loop, the context provides `foreach.value` for the current iteration. For example: ```yaml steps: - name: get-alerts provider: type: keep with: query: "status == 'firing'" actions: - name: notify-on-alerts foreach: "{{ steps.get-alerts.results }}" provider: type: slack with: message: "Alert: {{ foreach.value.name }} is firing!" 
``` --- ## Examples of Context Usage ### Dynamic Action Execution Using context to trigger actions conditionally: ```yaml actions: - name: escalate-alert if: "{{ alert.severity == 'critical' }}" provider: type: slack with: message: "Critical alert: {{ alert.name }}" ``` ### Enriching Alerts You can use results from a step to enrich an alert ```yaml steps: - name: fetch-customer-details provider: type: mysql with: query: "SELECT * FROM customers WHERE id = '{{ alert.customer_id }}'" single_row: true actions: - name: enrich-alert provider: type: mock with: enrich_alert: - key: customer_name value: "{{ steps.fetch-customer-details.results.name }}" ``` ### Conditional Logic Based on Step Results ```yaml actions: - name: trigger-slack if: "{{ steps.get-pods.results.0.status.phase == 'Running' }}" provider: type: slack with: message: "Pod is running: {{ steps.get-pods.results.0.metadata.name }}" ``` ================================================ FILE: docs/workflows/syntax/enrichment.mdx ================================================ --- title: "Enrichment" --- Keep workflows support **enrichment**, a powerful feature that allows you to enhance alerts with additional data, making them more actionable and meaningful. Enrichments add custom fields or modify existing ones in an alert directly from your workflow. --- ## Why Enrich Alerts? - **Provide Context:** Add critical information, such as related customer data or ticket IDs. - **Enable Automation:** Use enriched fields in subsequent actions for dynamic processing. - **Improve Visibility:** Surface essential metadata for better decision-making. --- ## How to Enrich Alerts ### Using the `enrich_alert` Directive The `enrich_alert` directive is used in actions to add or update fields in the alert. You specify a list of key-value pairs where: - `key` is the field name to add or update. - `value` is the data to assign to the field. It can be a static value or dynamically derived from steps or other parts of the workflow. 
- `disposable` is an optional attribute that determines whether the enrichment is temporary and should be discarded when a new alert is received. If disposable is set to True, the enrichment is added to disposable_enrichments and marked with dispose_on_new_alert=True. ### Example Workflow with Enrichment ```yaml workflow: id: enrich-alert-example description: Demonstrates enriching alerts triggers: - type: alert steps: - name: get-customer-details provider: type: mysql config: "{{ providers.mysql-prod }}" with: query: "SELECT * FROM customers WHERE customer_id = '{{ alert.customer_id }}'" single_row: true actions: - name: enrich-alert-with-customer-data provider: type: mock with: enrich_alert: - key: customer_name value: "{{ steps.get-customer-details.results.name }}" - key: customer_tier value: "{{ steps.get-customer-details.results.tier }}" ``` In this example: - The `get-customer-details` step fetches customer data based on the alert. - The `enrich_alert` directive adds `customer_name` and `customer_tier` to the alert. 
--- ## Enrichment Syntax ### Key-Value Pairs Each enrichment is defined as a key-value pair: ```yaml enrich_alert: - key: field_name value: field_value disposable: true ``` - **Static Values:** Use static strings or numbers for straightforward enrichments: ```yaml - key: alert_source value: "Monitoring System" ``` - **Dynamic Values:** Use values derived from steps, actions, or the alert itself: ```yaml - key: severity_level value: "{{ alert.severity }}" ``` ### Conditional Enrichment You can combine enrichment with conditions to enrich alerts dynamically: ```yaml actions: - name: enrich-critical-alert if: "{{ alert.severity == 'critical' }}" provider: type: mock with: enrich_alert: - key: priority value: high ``` ## Advanced Use Cases ### Enrich Alerts with Results from Actions Enrichments can use results from actions, allowing dynamic updates based on previous steps: ```yaml enrich_alert: - key: ticket_id value: "{{ actions.create-ticket.results.ticket_id }}" - key: ticket_url value: "{{ actions.create-ticket.results.ticket_url }}" ``` ## Enrichment Workflow Example This example demonstrates how to enrich an alert with ticket details from ServiceNow: ```yaml workflow: id: servicenow-ticket-enrichment triggers: - type: alert steps: - name: fetch-alert-details provider: type: keep with: filter: "alert_id == '{{ alert.id }}'" actions: - name: create-servicenow-ticket provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: INCIDENT payload: short_description: "Alert: {{ alert.name }}" description: "{{ alert.description }}" enrich_alert: - key: ticket_id value: "{{ results.sys_id }}" - key: ticket_url value: "{{ results.link }}" ``` ## Troubleshooting Enrichment ### Enrichment without an Alert/Incident If there is no alert/incident present in the trigger (for example, an interval trigger or a manual run from the workflow page), the enrichment rule would not have an alert/incident to apply to.
The enrichment process typically requires an alert/incident to be present to apply the specified enrichments. Without an alert/incident, the enrichment rule would not execute as intended. A workaround is to use a foreach directive and pass it an object containing the "fingerprint" variable. ================================================ FILE: docs/workflows/syntax/foreach.mdx ================================================ --- title: "Foreach" --- The `foreach` directive in Keep workflows allows you to iterate over a list of items and perform actions for each item. This is particularly useful for processing multiple results returned by a step or performing actions on a collection of entities. ## Key Features - **Dynamic Iteration:** Iterate over any list or array returned by a step or defined in the workflow. - **Scoped Variables:** Each iteration exposes the current item under the `foreach` variable, allowing you to access its properties directly. - **Action Chaining:** Multiple actions can use `foreach` to work sequentially on the same list of items. --- ## Defining a `foreach` To use `foreach`, include it as part of an action. The value of `foreach` should be a reference to the list you want to iterate over. ### Example Workflow with `foreach` ```yaml workflow: id: foreach-example description: Demonstrates the use of foreach triggers: - type: manual steps: - name: get-pods provider: type: gke config: "{{ providers.gke }}" with: command_type: get_pods actions: - name: echo-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod name: {{ foreach.value.metadata.name }} || Namespace: {{ foreach.value.metadata.namespace }} || Status: {{ foreach.value.status.phase }}" ``` In this example: - The `get-pods` step retrieves a list of Kubernetes pods. - The `foreach` iterates over the `results` returned by the `get-pods` step. 
- For each pod, it prints its `name`, `namespace`, and `status`. --- ## Using `foreach` Variables The `foreach` variable provides scoped access to the current item in the iteration. ### Example of Scoped Variables ```yaml actions: - name: notify-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: slack with: message: | Pod Name: {{ foreach.value.metadata.name }} Namespace: {{ foreach.value.metadata.namespace }} Status: {{ foreach.value.status.phase }} ``` In this case: - `{{ foreach.value }}` refers to the current item in the list. - Access properties like `metadata.name`, `metadata.namespace`, and `status.phase` dynamically. ### Using Conditions with `foreach` You can combine `foreach` with `if` conditions to filter or act selectively. ```yaml actions: - name: alert-critical-pods foreach: "{{ steps.get-pods.results }}" if: "{{ foreach.value.status.phase == 'Failed' }}" provider: type: slack with: message: "Critical pod failure detected: {{ foreach.value.metadata.name }}" ``` ================================================ FILE: docs/workflows/syntax/functions.mdx ================================================ --- title: "Functions" --- The **Functions** in Keep Workflow Engine are utilities that can be used to manipulate data, check conditions, or perform transformations within workflows. This document provides a brief overview and usage examples for each available function. --- ## Mathematical Functions ### `add` **Description:** Adds all provided numbers together. All arguments are converted to integers. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.add(1, 2, 3) # Output: 6 message2: keep.add(10, 20, 30) # Output: 60 ``` --- ### `sub` **Description:** Subtracts all subsequent numbers from the first number. All arguments are converted to integers.
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.sub(10, 2, 3) # Output: 5 message2: keep.sub(100, 20, 30) # Output: 50 ``` --- ### `mul` **Description:** Multiplies all provided numbers together. All arguments are converted to integers. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.mul(2, 3, 4) # Output: 24 message2: keep.mul(5, 6, 7) # Output: 210 ``` --- ### `div` **Description:** Divides the first number by all subsequent numbers. All arguments are converted to integers. Returns an integer if the division result is whole, otherwise returns a floating-point number. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.div(10, 2) # Output: 5 message2: keep.div(10, 3) # Output: 3.3333333333333335 message3: keep.div(100, 2, 5) # Output: 10 ``` --- ### `mod` **Description:** Calculates the remainder of dividing the first number by all subsequent numbers sequentially. All arguments are converted to integers. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.mod(10, 3) # Output: 1 message2: keep.mod(100, 30, 7) # Output: 3 ``` --- ### `exp` **Description:** Raises the first number to the power equal to the product of all subsequent numbers. All arguments are converted to integers. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.exp(2, 3) # Output: 8 message2: keep.exp(2, 3, 2) # Output: 64 ``` --- ### `fdiv` **Description:** Performs integer division of the first number by all subsequent numbers sequentially. All arguments are converted to integers. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.fdiv(10, 3) # Output: 3 message2: keep.fdiv(100, 3, 2) # Output: 16 ``` --- ### `eq` **Description:** Checks if two values are equal.
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.eq(5, 5) # Output: true message2: keep.eq("hello", "world") # Output: false message3: keep.eq([1, 2, 3], [1, 2, 3]) # Output: true ``` --- ## String Functions ### `uppercase` **Description:** Converts a string to uppercase. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: "keep.uppercase('hello world')" # Output: "HELLO WORLD" ``` --- ### `lowercase` **Description:** Converts a string to lowercase. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: "keep.lowercase('HELLO WORLD')" # Output: "hello world" ``` --- ### `capitalize` **Description:** Capitalizes the first character of a string. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.capitalize("hello world") # Output: "Hello world" ``` --- ### `title` **Description:** Converts a string to title case (capitalizes each word). **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.title("hello world") # Output: "Hello World" ``` --- ### `split` **Description:** Splits a string into a list using a delimiter. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: "keep.split('a,b,c', ',')" # Output: ["a", "b", "c"] ``` --- ### `strip` **Description:** Removes leading and trailing whitespace from a string. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.strip(" hello world ") # Output: "hello world" ``` --- ### `replace` **Description:** Replaces occurrences of a substring with another string. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.replace("hello world", "world", "Keep") # Output: "hello Keep" ``` --- ### `remove_newlines` **Description:** Removes all newline and tab characters from a string. 
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.remove_newlines("hello\nworld\t!") # Output: "helloworld!" ``` --- ### `encode` **Description:** URL-encodes a string. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.encode("hello world") # Output: "hello%20world" ``` --- ### `slice` **Description:** Extracts a portion of a string based on start and end indices. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.slice("hello world", 0, 5) # Output: "hello" ``` --- ## List and Dictionary Functions ### `first` **Description:** Retrieves the first element from a list. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.first([1, 2, 3]) # Output: 1 ``` --- ### `last` **Description:** Retrieves the last element from a list. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.last([1, 2, 3]) # Output: 3 ``` --- ### `index` **Description:** Retrieves an element at a specific index from a list. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.index(["a", "b", "c"], 1) # Output: "b" ``` --- ### `join` **Description:** Joins a list of elements into a string using a delimiter. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.join(["a", "b", "c"], ",") # Output: "a,b,c" ``` --- ### `len` **Description:** Returns the length of a list. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.len([1, 2, 3]) # Output: 3 ``` --- ### `dict_to_key_value_list` **Description:** Converts a dictionary into a list of key-value pairs. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.dict_to_key_value_list({"a": 1, "b": 2}) # Output: ["a:1", "b:2"] ``` --- ### `dict_pop` **Description:** Removes specified keys from a dictionary. 
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.dict_pop({"a": 1, "b": 2, "c": 3}, "a", "b") # Output: {"c": 3} ``` --- ### `dict_pop_prefix` **Description:** Removes all keys that start with a specified prefix from a dictionary. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.dict_pop_prefix({"a_1": 1, "a_2": 2, "b_1": 3}, "a_") # Output: {"b_1": 3} ``` --- ### `dict_filter_by_prefix` **Description:** Returns only the dictionary entries whose keys start with a specified prefix. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.dict_filter_by_prefix({"a_1": 1, "a_2": 2, "b_1": 3}, "a_") # Output: {"a_1": 1, "a_2": 2} ``` --- ### `dictget` **Description:** Gets a value from a dictionary with a default fallback. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.dictget({"a": 1, "b": 2}, "c", "default") # Output: "default" ``` --- ## Date and Time Functions ### `from_timestamp` **Description:** Converts unix timestamp int, float or string to datetime object, with optional timezone option. **Example:** ```yaml steps: - name: example-step provider: type: console with: message: keep.from_timestamp(1717244449.0) # will print "2024-06-01 12:20:49+00:00" # or with timezone # message: keep.from_timestamp(1717244449.0, "Europe/Berlin") # will print "2024-06-01 14:20:49+02:00" ``` ### `utcnow` **Description:** Returns the current UTC datetime. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.utcnow() ``` --- ### `utcnowtimestamp` **Description:** Returns the current UTC datetime as a Unix timestamp (seconds since epoch). **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.utcnowtimestamp() # Output: 1704067200 ``` --- ### `utcnowiso` **Description:** Returns the current UTC datetime in ISO format. 
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.utcnowiso() ``` --- ### `to_utc` **Description:** Converts a datetime string or object to UTC. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.to_utc("2024-01-01T00:00:00") ``` --- ### `to_timestamp` **Description:** Converts a datetime object or string into a Unix timestamp. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.to_timestamp("2024-01-01T00:00:00") ``` --- ### `datetime_compare` **Description:** Compares two datetime objects and returns the difference in hours. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.datetime_compare("2024-01-01T10:00:00", "2024-01-01T00:00:00") # Output: 10.0 ``` --- ### `is_business_hours` **Description:** Checks whether a given time falls within business hours. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.is_business_hours( time_to_check="2024-01-01T14:00:00Z", start_hour=8, end_hour=20, business_days=[0,1,2,3,4], timezone="America/New_York" ) ``` --- ## JSON Functions ### `json_dumps` **Description:** Converts a dictionary or string into a formatted JSON string. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.json_dumps({"key": "value"}) ``` --- ### `json_loads` **Description:** Parses a JSON string into a dictionary. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.json_loads('{"key": "value"}') ``` --- ## Utility Functions ### `get_firing_time` **Description:** Calculates the firing duration of an alert in specified time units. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.get_firing_time(alert, "m", tenant_id="tenant-id") # Output: "15.0" ``` --- ### `add_time_to_date` **Description:** Adds time to a date string based on specified time units. 
**Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.add_time_to_date("2024-01-01", "%Y-%m-%d", "1w 2d") # Output: "2024-01-10" ``` --- ### `timestamp_delta` **Description:** Adds or subtracts a time delta to/from a datetime. Use negative values to subtract time. **Example:** ```yaml steps: - name: example-step provider: type: mock with: # Add 2 hours to the current time add_hours: keep.timestamp_delta(keep.utcnow(), 2, "hours") # Subtract 30 minutes from a specific datetime subtract_minutes: keep.timestamp_delta("2024-01-01T12:00:00Z", -30, "minutes") # Output: 2024-01-01T11:30:00Z # Add 1 week to a datetime add_week: keep.timestamp_delta("2024-01-01T00:00:00Z", 1, "weeks") # Output: 2024-01-08T00:00:00Z ``` --- ### `is_first_time` **Description:** Checks if an alert with a given fingerprint is firing for the first time or first time within a specified period. **Example:** ```yaml steps: - name: example-step provider: type: mock with: # Check if this is the first time the alert is firing first_time: keep.is_first_time(alert.fingerprint, tenant_id="tenant-id") # Check if this is the first time the alert is firing in the last 24 hours first_time_24h: keep.is_first_time(alert.fingerprint, "24h", tenant_id="tenant-id") ``` --- ### `all` **Description:** Checks if all elements in an iterable are identical. **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.all([1, 1, 1]) # Output: true ``` --- ### `diff` **Description:** Checks if any elements in an iterable are different (opposite of `all`). **Example:** ```yaml steps: - name: example-step provider: type: mock with: message: keep.diff([1, 2, 1]) # Output: true ``` --- ================================================ FILE: docs/workflows/syntax/permissions.mdx ================================================ --- title: "Permissions" --- # Permissions Permissions in Keep Workflow Engine define **who can execute a workflow manually**. 
They allow you to restrict access to workflows based on user roles or specific email addresses, ensuring that only authorized users can trigger sensitive workflows. Currently, permissions can only be edited directly in the workflow YAML file. The workflow builder UI does not support editing permissions at this time. --- ## General Structure Permissions are defined at the top level of a workflow YAML file using the `permissions` field, which accepts a list of roles and/or email addresses. ```yaml workflow: id: sensitive-workflow name: Sensitive Workflow description: "A workflow with restricted access" permissions: - admin - john.doe@example.com steps: # workflow steps ``` ## How Permissions Work When a workflow has permissions defined: 1. **Admin users** can always run the workflow regardless of the permissions list 2. **Non-admin users** can only run the workflow if: - Their role is explicitly listed in the permissions - OR their email address is explicitly listed in the permissions 3. If the `permissions` field is empty or not defined, any user with the `write:workflows` permission can run the workflow ## Supported Role Types Keep supports the following role types that can be used in the permissions list: - `admin`: Administrator users with full system access - `noc`: Network Operations Center users with read-only access - `webhook`: API access for webhook integrations - `workflowrunner`: Special role for running workflows via API ## Examples ### Restricting to Admin Users Only ```yaml workflow: id: critical-infrastructure-workflow name: Critical Infrastructure Workflow permissions: - admin steps: # workflow steps ``` ### Allowing Specific Users ```yaml workflow: id: department-specific-workflow name: Department Specific Workflow permissions: - sarah.smith@example.com - team.lead@example.com steps: # workflow steps ``` ### Combining Roles and Individual Users ```yaml workflow: id: mixed-permissions-workflow name: Mixed Permissions Workflow permissions: - admin - 
noc - devops.specialist@example.com steps: # workflow steps ``` ## Best Practices - Use permissions for workflows that have significant impact on systems or trigger sensitive operations - Consider using role-based permissions (like `admin` or `noc`) for groups of users with similar responsibilities - List individual email addresses only for exceptions or when very specific access control is needed - Review workflow permissions regularly as part of security audits - Document which workflows have restricted permissions in your internal documentation ================================================ FILE: docs/workflows/syntax/providers.mdx ================================================ --- title: "Providers" --- Providers are a fundamental part of workflows in Keep. They enable workflows to interact with external systems, fetch data, and perform actions. Each provider is designed to handle specific integrations such as Datadog, Slack, ServiceNow, or custom-built APIs. ## Key Features of Providers - **Extensibility:** Providers can be easily extended to support new systems or custom use cases. You can explore and contribute to the existing providers or create your own in the [Keep Providers Code Directory on GitHub](https://github.com/keephq/keep/providers). - **Parameterization:** Parameters under the `with` section are passed directly to the provider. This allows you to configure provider-specific settings for each step or action. - **Provisioning:** Providers can be provisioned via CI/CD pipelines or through the Keep UI, providing flexibility for both automated and manual setups. --- ## Defining a Provider To define a provider, include its configuration under the `providers` section of your workflow file. 
Here's an example: ```yaml providers: slack: description: "Slack provider for sending messages" authentication: webhook_url: "{{ env.SLACK_WEBHOOK_URL }}" ``` ## Using a Provider in a Workflow Once a provider is defined, it can be used in workflow steps or actions by specifying its type and configuration. For example: ```yaml actions: - name: trigger-slack provider: type: slack config: "{{ providers.slack }}" with: channel: "#alerts" message: "Alert triggered: {{ alert.name }}" ``` - The `config` field links the action to the provider. - The `with` section includes parameters that are passed to the provider. ## Examples ### Fetching Data with a Provider ```yaml steps: - name: get-alerts provider: type: datadog config: "{{ providers.datadog }}" with: query: "avg:cpu.usage{*}" timeframe: "1h" ``` ### Sending Notifications with a Provider ```yaml actions: - name: notify-slack provider: type: slack config: "{{ providers.slack }}" with: channel: "#alerts" message: "Critical alert: {{ alert.name }}" ``` ================================================ FILE: docs/workflows/syntax/steps-and-actions.mdx ================================================ --- title: "Steps and Actions" --- Steps and actions are the building blocks of workflows in Keep Workflow Engine. While they share a similar structure and syntax, the **difference between steps and actions is mostly semantic**: - **Steps**: Focused on querying data or triggering fetch-like operations from providers (e.g., querying databases, fetching logs, or retrieving information). - **Actions**: Geared toward notifying or triggering outcomes, such as sending notifications, updating tickets, or invoking external services. Together, steps and actions allow workflows to both gather the necessary data and act upon it. --- ## General Structure Both steps and actions are defined using a similar schema: ### Steps Used for querying or fetching data. Step uses the `_query` method of each provider. 
```yaml steps: - name: provider: type: config: with: ``` ### Actions Used for notifications or triggering effects. Action uses the `_notify` method of each provider. ```yaml actions: - name: provider: type: config: with: ``` ## Examples ### Fetch data from a MySQL database ```yaml steps: - name: get-user-data provider: type: mysql config: "{{ providers.mysql-prod }}" with: query: "SELECT * FROM users WHERE id = 1" single_row: true ``` ### Retrieve logs from Datadog ```yaml steps: - name: get-service-logs provider: type: datadog config: "{{ providers.datadog }}" with: query: "service:keep and @error" timeframe: "1h" ``` ### Query Kubernetes for running pods ```yaml steps: - name: get-pods provider: type: k8s config: "{{ providers.k8s-cluster }}" with: command_type: get_pods ``` ### Send an email ```yaml actions: - name: send-email provider: type: email config: "{{ providers.email }}" with: to: "user@example.com" subject: "Account Updated" body: "Your account details have been updated." ``` ### Send a Slack Message ```yaml actions: - name: notify-slack provider: type: slack config: "{{ providers.slack-demo }}" with: message: "Critical alert received!" ``` ### Create a ticket in ServiceNow ```yaml actions: - name: create-servicenow-ticket provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: INCIDENT payload: short_description: "New incident created by Keep" description: "Please investigate the issue." ``` ## Combining Steps and Actions A workflow typically combines steps (for querying data) with actions (for notifications or outcomes). 
Here are a few examples: ### Query and Notify ```yaml workflow: id: query-and-notify description: "Query a database and notify via Slack" steps: - name: get-user-data provider: type: mysql config: "{{ providers.mysql-prod }}" with: query: "SELECT email FROM users WHERE id = 1" single_row: true actions: - name: send-notification provider: type: slack config: "{{ providers.slack-demo }}" with: message: "User email: {{ steps.get-user-data.results.email }}" ``` ### Alert and Incident Management ```yaml workflow: id: alert-management description: "Handle alerts and create incidents" steps: - name: get-alert-details provider: type: datadog config: "{{ providers.datadog }}" with: query: "service:keep and @alert" timeframe: "1h" actions: - name: create-incident provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: INCIDENT payload: short_description: "Alert from Datadog: {{ steps.get-alert-details.results.alert_name }}" description: "Details: {{ steps.get-alert-details.results.alert_description }}" ``` ## Error Handling and Retries Both steps and actions support error handling to ensure workflows can recover from failures. ```yaml steps: - name: fetch-data provider: type: http with: url: "https://api.example.com/data" on-failure: retry: count: 3 # Retry every 5 seconds interval: 5 ``` ================================================ FILE: docs/workflows/syntax/triggers.mdx ================================================ --- title: "Triggers" --- ## Overview Triggers in Keep Workflow Engine define **when a workflow is executed**. Triggers are the starting point for workflows and can be configured to respond to a variety of events, conditions, or schedules. A workflow can have one or multiple triggers, and these triggers determine the specific circumstances under which the workflow is initiated. Examples include manual invocation, time-based schedules, or event-driven actions like alerts or incident updates.
Triggers are defined under the `triggers` section of a workflow YAML file. Each trigger has a `type` and optional additional configurations or filters. ## Supported Trigger Types ### Manual Trigger Used to execute workflows on demand. ```yaml triggers: - type: manual ``` ### Interval Trigger Runs workflows at a regular interval. ```yaml triggers: - type: interval # Run every 5 seconds value: 5 ``` ### Alert Trigger Executes a workflow when an alert is received. ```yaml triggers: - type: alert ``` If no filters or CEL expressions are specified, the workflow will be executed for every alert that comes in. ### Filtering Alerts There are two ways to filter alerts in Keep: #### 1. CEL-based Filtering (Recommended) Keep uses [Common Expression Language (CEL)](https://github.com/google/cel-spec/blob/master/doc/langdef.md) for filtering alerts. CEL provides a powerful and flexible way to express conditions using a simple expression language. ```yaml triggers: - type: alert cel: source.contains("datadog") && severity == "critical" ``` Common CEL patterns: - String matching: `source.contains("prometheus")` - Exact matching: `severity == "critical"` - Multiple conditions: `source.contains("datadog") && severity == "critical"` - Pattern matching: `name.contains("error") || name.contains("failure")` - Complex conditions: `(source.contains("datadog") && severity == "critical") || (source.contains("newrelic") && severity == "error")` You can test and experiment with CEL expressions using the [CEL Playground](https://playcel.undistro.io/). #### 2. Legacy Filtering (Deprecated) The old filtering mechanism is deprecated but still supported for backward compatibility. It uses a list of key-value pairs with optional regex patterns. ```yaml triggers: - type: alert filters: - key: severity value: critical - key: source value: datadog - key: service value: r"(payments|ftp)" ``` ### Incident Trigger Runs workflows when an incident is created, updated, or resolved.
```yaml triggers: - type: incident on: - create - update ``` ### Field Change Trigger Executes a workflow when specific fields in an alert change, such as status or severity. ```yaml triggers: - type: alert only_on_change: - status ``` ## Summary Triggers are a powerful way to control the execution of workflows, ensuring that they respond appropriately to manual actions, schedules, or events. By leveraging CEL expressions or filters, workflows can be fine-tuned to execute only under specific conditions. For more information about CEL expressions, refer to the [CEL Language Definition](https://github.com/google/cel-spec/blob/master/doc/langdef.md) and experiment with expressions in the [CEL Playground](https://playcel.undistro.io/). ================================================ FILE: ee/LICENSE ================================================ The Keep Enterprise Edition (EE) license (the Enterprise License) Copyright (c) 2024-present Keep Alerting LTD With regard to the Keep Software: This software and associated documentation files (the "Software") may only be used in production, if you (and any entity that you represent) have agreed to, and are in compliance with, the Keep Subscription Terms of Service, available (if not available, it's impossible to comply) at https://www.keephq.dev/terms-of-service (the "The Enterprise Terms”), or other agreement governing the use of the Software, as agreed by you and Keep, and otherwise have a valid Keep Enterprise Edition subscription for the correct number of user seats. Subject to the foregoing sentence, you are free to modify this Software and publish patches to the Software. 
You agree that Keep and/or its licensors (as applicable) retain all right, title and interest in and to all such modifications and/or patches, and all such modifications and/or patches may only be used, copied, modified, displayed, distributed, or otherwise exploited with a valid Keep Enterprise Edition subscription for the correct number of user seats. You agree that Keep and/or its licensors (as applicable) retain all right, title and interest in and to all such modifications. You are not granted any other rights beyond what is expressly stated herein. Subject to the foregoing, it is forbidden to copy, merge, publish, distribute, sublicense, and/or sell the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For all third party components incorporated into the Keep Software, those components are licensed under the original license provided by the owner of the applicable component. 
def _discover_jwks_uri(auth_domain: str) -> str:
    """Resolve the JWKS endpoint that publishes the signing keys for ``auth_domain``.

    The provider's OpenID Connect discovery document
    (``https://{auth_domain}/.well-known/openid-configuration``) is queried
    first and its ``jwks_uri`` entry is returned when present, as described by
    OpenID Connect Discovery 1.0
    (https://openid.net/specs/openid-connect-discovery-1_0.html#rfc.section.3).

    When the document cannot be fetched or parsed, or carries no ``jwks_uri``,
    a warning is logged and the Auth0-style
    ``https://{auth_domain}/.well-known/jwks.json`` path is returned instead.
    This function never raises because of a discovery failure.
    """
    conventional_uri = f"https://{auth_domain}/.well-known/jwks.json"
    discovery_url = f"https://{auth_domain}/.well-known/openid-configuration"

    try:
        response = requests.get(discovery_url, timeout=10)
        response.raise_for_status()
        advertised_uri = response.json().get("jwks_uri")
    except Exception:
        # Best effort: any network/HTTP/parsing problem degrades to the fallback.
        logger.warning(
            "Failed to fetch OpenID discovery document from %s, "
            "falling back to /.well-known/jwks.json",
            discovery_url,
            exc_info=True,
        )
        return conventional_uri

    if advertised_uri:
        return advertised_uri

    logger.warning(
        "OpenID discovery document at %s did not contain jwks_uri, "
        "falling back to /.well-known/jwks.json",
        discovery_url,
    )
    # Fallback: Auth0's conventional JWKS endpoint
    return conventional_uri
def _verify_bearer_token(self, token) -> AuthenticatedEntity:
    """Validate an Auth0-issued bearer token and build the authenticated entity.

    The token may be prefixed with ``keepActiveTenant=<tenant_id>&`` to select
    one of several tenants. In that case the tenant must be listed in the
    token's ``keep_tenant_ids`` claim; otherwise the ``keep_tenant_id`` claim
    is used.

    Args:
        token: Raw bearer token, optionally carrying the active-tenant prefix.

    Returns:
        An ``AuthenticatedEntity`` built from the resolved tenant id, the
        ``email`` claim and the ``keep_role`` claim (the admin role name is
        used when the claim is absent, for backwards compatibility).

    Raises:
        HTTPException: 401 when the token is missing, the active-tenant prefix
            is malformed, the JWT fails decoding or validation, or the
            requested active tenant is not present in the token.
    """
    from opentelemetry import trace

    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("verify_bearer_token"):
        if not token:
            raise HTTPException(status_code=401, detail="No token provided 👈")

        # more than one tenant support
        active_tenant = None
        if token.startswith("keepActiveTenant"):
            # Parse defensively: a prefix without "&" or "=" used to raise
            # ValueError/IndexError here and surface as a 500.
            tenant_part, ampersand, token = token.partition("&")
            tenant_fields = tenant_part.split("=")
            if not ampersand or len(tenant_fields) < 2:
                raise HTTPException(
                    status_code=401, detail="Malformed active tenant prefix"
                )
            active_tenant = tenant_fields[1]

        try:
            # jwks_client is the module-level client created at import time.
            jwt_signing_key = jwks_client.get_signing_key_from_jwt(token).key
            payload = jwt.decode(
                token,
                jwt_signing_key,
                # PyJWT documents `algorithms` as a list of allowed algorithms.
                algorithms=["RS256"],
                audience=self.auth_audience,
                issuer=self.issuer,
                leeway=60,
            )
            # if active_tenant is set, we must verify it is in the token
            if active_tenant:
                tenant_is_listed = any(
                    tenant.get("tenant_id") == active_tenant
                    for tenant in payload.get("keep_tenant_ids", [])
                )
                if not tenant_is_listed:
                    self.logger.warning(
                        "Someone tries to use a token with a tenant that is not in the token"
                    )
                    raise HTTPException(
                        status_code=401,
                        detail="Token does not contain the active tenant",
                    )
                tenant_id = active_tenant
            else:
                tenant_id = payload.get("keep_tenant_id")
            role_name = payload.get(
                "keep_role", AdminRole.get_name()
            )  # default to admin for backwards compatibility
            email = payload.get("email")
            return AuthenticatedEntity(tenant_id, email, role=role_name)
        except HTTPException:
            # Already a well-formed HTTP error (e.g. the active-tenant check);
            # let it through instead of re-wrapping it in the generic handler.
            raise
        except jwt.exceptions.DecodeError:
            self.logger.exception("Failed to decode token")
            raise HTTPException(status_code=401, detail="Token is not a valid JWT")
        except Exception as e:
            self.logger.exception("Failed to validate token")
            raise HTTPException(status_code=401, detail=str(e))
def _get_users_auth0(self, tenant_id: str) -> list[User]:
    """Return the Auth0 users whose ``app_metadata.keep_tenant_id`` equals ``tenant_id``.

    Each record in the ``users`` entry of the Auth0 listing response is
    converted into a ``User``. A record without a ``keep_role`` in its
    ``app_metadata`` is reported with the admin role name.
    """
    client = getAuth0Client()
    listing = client.users.list(q=f'app_metadata.keep_tenant_id:"{tenant_id}"')

    tenant_users = []
    for record in listing.get("users", []):
        metadata = record.get("app_metadata", {})
        tenant_users.append(
            User(
                email=record["email"],
                name=record["name"],
                # for backwards compatibility we return admin if no role is set
                role=metadata.get("keep_role", AdminRole.get_name()),
                last_login=record.get("last_login", None),
                created_at=record["created_at"],
                picture=record["picture"],
            )
        )
    return tenant_users
def _create_user_auth0(self, user_email: str, tenant_id: str, role: str) -> dict:
    """Create an Auth0 user for ``tenant_id`` with a random initial password.

    The user is created in the connection named by ``AUTH0_DB_NAME``
    (``keep-users`` when unset), marked as email-verified, and tagged with the
    tenant id and role in ``app_metadata``.

    NOTE(review): the annotation says ``dict`` but the value built and
    returned here is a ``User`` - confirm which one callers expect.

    Raises:
        HTTPException: 409 when Auth0 already has a user with this email.
    """
    client = getAuth0Client()

    # User email can exist in 1 tenant only for now.
    existing = client.users.list(q=f'email:"{user_email}"')
    if existing.get("users", []):
        raise HTTPException(status_code=409, detail="User already exists")

    new_user_payload = {
        "email": user_email,
        "password": secrets.token_urlsafe(13),
        "email_verified": True,
        "app_metadata": {"keep_tenant_id": tenant_id, "keep_role": role},
        "connection": os.environ.get("AUTH0_DB_NAME", "keep-users"),
    }
    created = client.users.create(new_user_payload)

    created_metadata = created.get("app_metadata", {})
    return User(
        email=created["email"],
        name=created["name"],
        # for backwards compatibility we return admin if no role is set
        role=created_metadata.get("keep_role", AdminRole.get_name()),
        last_login=created.get("last_login", None),
        created_at=created["created_at"],
        picture=created["picture"],
    )
def getAuth0Client() -> Auth0:
    """Build an Auth0 Management API client using the client-credentials grant.

    Reads ``AUTH0_MANAGEMENT_DOMAIN``, ``AUTH0_CLIENT_ID`` and
    ``AUTH0_CLIENT_SECRET`` through ``config``, requests an access token for
    the ``https://<domain>/api/v2/`` audience, and returns an ``Auth0`` client
    bound to that token. A new token is requested on every call.
    """
    domain = config("AUTH0_MANAGEMENT_DOMAIN")
    client_id = config("AUTH0_CLIENT_ID")
    client_secret = config("AUTH0_CLIENT_SECRET")

    token_endpoint = GetToken(domain, client_id, client_secret)
    credentials = token_endpoint.client_credentials(f"https://{domain}/api/v2/")

    return Auth0(domain, credentials["access_token"])
class AzureADGroupMapper:
    """Translates Azure AD group membership into a Keep role name."""

    def __init__(self):
        # Group object IDs are supplied through the environment.
        admin_group = os.environ.get("KEEP_AZUREAD_ADMIN_GROUP_ID")
        noc_group = os.environ.get("KEEP_AZUREAD_NOC_GROUP_ID")
        self.admin_group_id = admin_group
        self.noc_group_id = noc_group

        if not (admin_group and noc_group):
            raise Exception(
                "Missing KEEP_AZUREAD_ADMIN_GROUP_ID or KEEP_AZUREAD_NOC_GROUP_ID environment variables"
            )

        # Azure AD group id -> Keep role name
        self.group_role_mapping = {
            admin_group: AdminRole.get_name(),
            noc_group: NOCRole.get_name(),
        }

    def get_role_from_groups(self, groups: List[str]) -> Optional[str]:
        """
        Pick the Keep role for a user from the Azure AD groups they belong to.

        The admin role wins over the NOC role when both match; ``None`` is
        returned when none of the groups is mapped to a role.
        """
        matched_roles = {
            role for role in map(self.group_role_mapping.get, groups) if role
        }

        # Candidates are listed from highest to lowest privilege.
        for candidate in (AdminRole.get_name(), NOCRole.get_name()):
            if candidate in matched_roles:
                return candidate

        # No matching groups
        return None
def _refresh_keys(self) -> None:
    """Fetch signing keys from Azure AD's JWKS endpoint and cache them.

    Only keys whose ``use`` is ``"sig"`` and that carry a ``kid`` are stored,
    indexed by ``kid``. The cache and its timestamp are replaced only when at
    least one such key was loaded, so a bad response never wipes working keys.

    Raises:
        HTTPException: 500 when the request fails and no keys are cached yet.
            With keys already cached, a failed request is only logged.
    """
    try:
        # Bound the request: without a timeout, requests waits indefinitely and
        # a stalled JWKS endpoint would block every caller of this refresh.
        response = requests.get(self.jwks_uri, timeout=10)
        response.raise_for_status()
        jwks = response.json()

        new_keys = {}
        for key in jwks.get("keys", []):
            if key.get("use") == "sig":  # Only use signing keys
                logger.debug("Loading public key from certificate: %s", key)
                cert_obj = PyJWK(key, "RS256")
                if kid := key.get("kid"):
                    new_keys[kid] = cert_obj.key

        if new_keys:  # Only update if we got valid keys
            self._signing_keys = new_keys
            # Naive UTC timestamp; get_signing_key compares it with utcnow().
            self._last_updated = datetime.utcnow()
            logger.info("Successfully refreshed Azure AD signing keys")
        else:
            logger.error("No valid signing keys found in JWKS response")

    except requests.RequestException as e:
        logger.error(f"Failed to fetch signing keys: {str(e)}")
        if not self._signing_keys:
            raise HTTPException(
                status_code=500, detail="Unable to verify tokens at this time"
            )
self.group_mapper = AzureADGroupMapper() # Keep track of hashed tokens so we won't update the user on the same token self.saw_tokens = set() def _verify_bearer_token( self, token: str = Depends(oauth2_scheme) ) -> AuthenticatedEntity: """Verify the Azure AD JWT token and extract claims""" try: # First decode without verification to get the key id (kid) unverified_headers = jwt.get_unverified_header(token) kid = unverified_headers.get("kid") if not kid: raise HTTPException(status_code=401, detail="No key ID in token header") # Get the signing key from the global manager signing_key = azure_keys_manager.get_signing_key(kid) if not signing_key: raise HTTPException(status_code=401, detail="Invalid token signing key") # For v2.0 tokens, 'appid' doesn't exist — 'azp' is used instead. # Remove "appid" from the 'require' list so v2 tokens won't fail. options = { "verify_signature": True, "verify_aud": False, # We'll validate manually below "verify_iat": True, "verify_exp": True, "verify_nbf": True, # we will validate manually since we need to support both # v1 (sts.windows.net) and v2 (https://login.microsoftonline.com) "verify_iss": False, # "require" the standard claims but NOT "appid" (search for 'azp' in this code to see the comment) "require": ["exp", "iat", "nbf", "iss", "sub"], } try: payload = jwt.decode( token, key=signing_key, algorithms=["RS256"], options=options, ) # ---- MANUAL ISSUER CHECK ---- # Allowed issuers for v1 vs. 
v2 in the same tenant: allowed_issuers = [ f"https://sts.windows.net/{self.tenant_id}/", # v1 tokens f"https://login.microsoftonline.com/{self.tenant_id}/v2.0", # v2 tokens ] issuer_in_token = payload.get("iss") if issuer_in_token not in allowed_issuers: raise HTTPException(status_code=401, detail="Invalid token issuer") # Check client ID: v1 -> 'appid', v2 -> 'azp' client_id_in_token = payload.get("appid") or payload.get("azp") if not client_id_in_token: raise HTTPException( status_code=401, detail="No client ID (appid/azp) in token" ) if client_id_in_token != self.client_id: raise HTTPException( status_code=401, detail="Invalid token application ID (appid/azp)", ) # Validate the audience allowed_aud = [ f"api://{self.client_id}", # v1 tokens f"{self.client_id}", # v2 tokens ] if payload.get("aud") not in allowed_aud: self.logger.error( f"Invalid token audience: {payload.get('aud')}", extra={ "tenant_id": self.tenant_id, "audience": payload.get("aud"), "allowed_aud": allowed_aud, }, ) raise HTTPException( status_code=401, detail="Invalid token audience" ) except ExpiredSignatureError: raise HTTPException(status_code=401, detail="Token has expired") except InvalidIssuerError: raise HTTPException(status_code=401, detail="Invalid token issuer") except (InvalidIssuedAtError, MissingRequiredClaimError): raise HTTPException( status_code=401, detail="Token is missing required claims" ) except InvalidTokenError as e: logger.error(f"Token validation failed: {str(e)}") raise HTTPException(status_code=401, detail="Invalid token") # Extract relevant claims tenant_id = payload.get("tid") email = ( payload.get("email") or payload.get("preferred_username") or payload.get("unique_name") ) if not all([tenant_id, email]): raise HTTPException(status_code=401, detail="Missing required claims") # Clean up email if it's in the live.com#email@domain.com format if "#" in email: email = email.split("#")[1] # Get groups from token groups = payload.get("groups", []) # Map groups to role 
def _authorize(self, authenticated_entity: AuthenticatedEntity) -> None:
    """
    Check that the entity's role grants every scope this verifier requires.

    Raises:
        HTTPException: 403 when the entity has no role, or when its role
            does not cover ``self.scopes``.
    """
    assigned_role = authenticated_entity.role
    if not assigned_role:
        raise HTTPException(status_code=403, detail="No role assigned")

    has_access = get_role_by_role_name(assigned_role).has_scopes(self.scopes)
    if has_access:
        return

    raise HTTPException(
        status_code=403,
        detail="You don't have the required permissions to access this resource",
    )
class AzureadIdentityManager(BaseIdentityManager):
    """Identity manager for Azure AD authentication.

    User listing is delegated to a ``DbIdentityManager``; user creation is a
    no-op and deletion/update are not implemented here.
    """

    def __init__(self, tenant_id, context_manager: ContextManager, **kwargs):
        """Initialise the base manager and the DB-backed manager used for listing users."""
        super().__init__(tenant_id, context_manager, **kwargs)
        self.db_identity_manager = DbIdentityManager(
            tenant_id, context_manager, **kwargs
        )

    def get_users(self) -> list[User]:
        """Return the users stored in the database for this tenant."""
        # we keep the azuread users in the db
        return self.db_identity_manager.get_users(self.tenant_id)

    def create_user(self, user_email: str, role: str, **kwargs) -> dict:
        """Do nothing and return ``None``.

        NOTE(review): presumably users are provisioned by the auth verifier on
        first sign-in instead - confirm. The ``-> dict`` annotation does not
        match the actual ``None`` return.
        """
        return None

    def delete_user(self, user_email: str) -> dict:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("AzureadIdentityManager.delete_user")

    def get_auth_verifier(self, scopes) -> AzureadAuthVerifier:
        """Build an ``AzureadAuthVerifier`` that requires the given scopes."""
        return AzureadAuthVerifier(scopes)

    def update_user(self, user_email: str, update_data: dict) -> User:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("AzureadIdentityManager.update_user")
def patched_init(
    self,
    base_url: str,
    headers: dict = None,
    timeout: int = 60,
    verify: bool = None,
    proxies: dict = None,
):
    """Replacement for ``ConnectionManager.__init__`` that fills in defaults.

    When ``verify`` is not supplied it is derived from the
    ``KEYCLOAK_VERIFY_CERT`` environment variable (anything other than
    "true", case-insensitively, disables verification) and a warning is
    logged. A missing ``headers`` becomes an empty dict. The resolved values
    are then handed to the original initializer.
    """
    effective_headers = {} if headers is None else headers

    effective_verify = verify
    if effective_verify is None:
        env_value = os.environ.get("KEYCLOAK_VERIFY_CERT", "true")
        effective_verify = env_value.lower() == "true"
        logger.warning(
            "Using KEYCLOAK_VERIFY_CERT environment variable to set verify. ",
            extra={"KEYCLOAK_VERIFY_CERT": effective_verify},
        )

    original_init(
        self, base_url, effective_headers, timeout, effective_verify, proxies
    )
def get_org_name_by_tenant_id(self, tenant_id):
    """Return the org name whose entry in ``self.tenants`` has this tenant id.

    ``self.tenants`` maps org name -> dict containing a ``"tenant_id"`` key.
    The first matching org (in mapping order) wins.

    Raises:
        Exception: "Org not found" when no org maps to ``tenant_id``
            (an error is logged first).
    """
    not_found = object()
    matching_org = next(
        (
            name
            for name, details in self.tenants.items()
            if details.get("tenant_id") == tenant_id
        ),
        not_found,
    )
    if matching_org is not_found:
        self.logger.error("Tenant id not found", extra={"tenant_id": tenant_id})
        raise Exception("Org not found")
    return matching_org
group_name.endswith(self.groups_claims_admin): return True # noc if group_name.endswith(self.groups_claims_noc): return True # webhook if group_name.endswith(self.groups_claims_webhook): return True # if not, its not a group that represents an org return False def _get_org_name(self, group_name): # first, keycloak groups starts with "/" if group_name.startswith("/"): group_name = group_name[1:] # second, trim the role org_name = self.groups_separator.join( group_name.split(self.groups_separator)[0:-1] ) return org_name def _get_role_in_org(self, user_groups, org_name): # for the org_name (e.g. keep-org-a) iterate over the groups and find the role # e.g. /org-a-admin, /org-a-noc, /org-a-webhook # we want to iterate from the "strongest" to the "weakest" role for role, keep_role in self.keycloak_roles.items(): for group in user_groups: group_lower = group.lower() if org_name in group_lower and role in group_lower: return keep_role.value return None def _verify_bearer_token( self, token: str = Depends(oauth2_scheme) ) -> AuthenticatedEntity: # verify keycloak token try: # more than one tenant support if token.startswith("keepActiveTenant"): active_tenant, token = token.split("&") active_tenant = active_tenant.split("=")[1] else: active_tenant = None payload = self.keycloak_client.decode_token(token, validate=True) except Exception as e: if "Expired" in str(e): raise HTTPException(status_code=401, detail="Expired Keycloak token") raise HTTPException(status_code=401, detail="Invalid Keycloak token") tenant_id = payload.get("keep_tenant_id") email = payload.get("preferred_username") org_id = payload.get("active_organization", {}).get("id") org_realm = payload.get("active_organization", {}).get("name") if org_id is None or org_realm is None: logger.warning( "Invalid Keycloak configuration - no org information for user. 
Check organization mapper: https://github.com/keephq/keep/blob/main/keycloak/keep-realm.json#L93" ) # this allows more than one tenant to be configured in the same keycloak realm # todo: support dynamic roles user_orgs = {} if self.roles_from_groups: self.logger.info("Using roles from groups") # get roles from groups # e.g. # "group-keeps": [ # "/ORG-A-USERS", # "/ORG-B-USERS", # "/org-users" # ], groups = payload.get(self.groups_claims, []) groups_that_represent_orgs = [] # first, create tenants if they are not exists (should be happen once, new group) for group in groups: # first, check if its an org group (e.g. keep-org-a) group_lower = group.lower() if self._check_if_group_represents_org(group_name=group_lower): # check if its the configuration org_name = self._get_org_name(group_lower) groups_that_represent_orgs.append(group_lower) if org_name not in self.tenants: self.logger.info("Creating tenant") org_tenant_id = create_tenant(tenant_name=org_name) # so it won't be self.tenants[org_name] = { "tenant_id": org_tenant_id, "tenant_logo_url": None, } self.logger.info("Tenant created") # this will be returned to the UI user_orgs[org_name] = self.tenants.get(org_name) # TODO: fix if active_tenant: # get the active_tenant grou org_name = self.get_org_name_by_tenant_id(active_tenant) tenant_id = active_tenant if not tenant_id: self.logger.warning( "Tenant id not found, reloading tenants from db" ) self._reload_tenants() tenant_id = self.get_org_name_by_tenant_id(active_tenant) # if still if not tenant_id: self.logger.error( "Tenant id not found, raising exception", extra={"org_name": org_name}, ) raise HTTPException( status_code=401, detail="Invalid Keycloak token - could not find any group that represents the org and the role", ) role = self._get_role_in_org(groups, org_name) if not role: raise HTTPException( status_code=401, detail="Invalid Keycloak token - could not find any group that represents the org and the role", ) # if no active tenant, we take the first 
def _authorize(self, authenticated_entity: AuthenticatedEntity) -> None:
    """Authorize the entity for this verifier's protected resource.

    Falls back to the base-class RBAC check when running multi-org (UMA is
    not supported there) or when the entity carries no Keycloak token (API
    key auth). Otherwise asks Keycloak's UMA whether the token may access
    ``self.protected_resource`` with the first required scope.

    Raises:
        HTTPException: 403 when Keycloak denies the permission, or when the
            check itself fails (the underlying error text is appended to
            the detail in that case).
    """
    # multi org does not support UMA for now:
    if self.keycloak_multi_org:
        return super()._authorize(authenticated_entity)

    # API key auth does not carry a Keycloak token; fall back to RBAC
    if not getattr(authenticated_entity, "token", None):
        return super()._authorize(authenticated_entity)

    # for single tenant Keycloaks, use Keycloak's UMA to authorize
    try:
        permission = UMAPermission(
            resource=self.protected_resource,
            scope=self.scopes[0],  # todo: handle multiple scopes per resource
        )
        self.logger.info(f"Checking permission {permission}")
        allowed = self.keycloak_uma.permissions_check(
            token=authenticated_entity.token, permissions=[permission]
        )
        self.logger.info(f"Permission check result: {allowed}")
    # secure fallback: any failure while checking is treated as a denial
    except Exception as e:
        raise HTTPException(
            status_code=403, detail="Permission check failed - " + str(e)
        ) from e

    # Raised outside the try block so the denial is not caught by the
    # fallback above and re-wrapped with a garbled detail message.
    if not allowed:
        raise HTTPException(status_code=403, detail="Permission check failed")
    return allowed
get_all_scopes from keep.identitymanager.identitymanager import PREDEFINED_ROLES, BaseIdentityManager from keycloak import KeycloakAdmin from keycloak.exceptions import KeycloakDeleteError, KeycloakGetError, KeycloakPostError from keycloak.openid_connection import KeycloakOpenIDConnection # Some good sources on this topic: # 1. https://stackoverflow.com/questions/42186537/resources-scopes-permissions-and-policies-in-keycloak # 2. MUST READ - https://www.keycloak.org/docs/24.0.4/authorization_services/ # 3. ADMIN REST API - https://www.keycloak.org/docs-api/22.0.1/rest-api/index.html # 4. (TODO) PROTECTION API - https://www.keycloak.org/docs/latest/authorization_services/index.html#_service_protection_api class KeycloakIdentityManager(BaseIdentityManager): """ RESOURCES = { "preset": { "table": "preset", "uid": "id", }, "incident": { "table": "incident", "uid": "id", }, } """ RESOURCES = {} def __init__(self, tenant_id, context_manager: ContextManager, **kwargs): super().__init__(tenant_id, context_manager, **kwargs) self.server_url = os.environ.get("KEYCLOAK_URL") self.keycloak_verify_cert = ( os.environ.get("KEYCLOAK_VERIFY_CERT", "true").lower() == "true" ) try: self.keycloak_admin = KeycloakAdmin( server_url=os.environ["KEYCLOAK_URL"] + "/admin", username=os.environ.get("KEYCLOAK_ADMIN_USER"), password=os.environ.get("KEYCLOAK_ADMIN_PASSWORD"), realm_name=os.environ["KEYCLOAK_REALM"], verify=self.keycloak_verify_cert, ) self.client_id = self.keycloak_admin.get_client_id( os.environ["KEYCLOAK_CLIENT_ID"] ) self.keycloak_id_connection = KeycloakOpenIDConnection( server_url=os.environ["KEYCLOAK_URL"], client_id=os.environ["KEYCLOAK_CLIENT_ID"], realm_name=os.environ["KEYCLOAK_REALM"], client_secret_key=os.environ["KEYCLOAK_CLIENT_SECRET"], verify=self.keycloak_verify_cert, ) self.admin_url = f'{os.environ["KEYCLOAK_URL"]}/admin/realms/{os.environ["KEYCLOAK_REALM"]}/clients/{self.client_id}' self.admin_url_without_client = 
f'{os.environ["KEYCLOAK_URL"]}/admin/realms/{os.environ["KEYCLOAK_REALM"]}' self.realm = os.environ["KEYCLOAK_REALM"] # if Keep controls the Keycloak server so it have event listener # for future use self.keep_controlled_keycloak = ( os.environ.get("KEYCLOAK_KEEP_CONTROLLED", "false") == "true" ) # Does ABAC is enabled self.abac_enabled = ( os.environ.get("KEYCLOAK_ABAC_ENABLED", "true") == "true" ) self.keycloak_multi_org = config( "KEYCLOAK_ROLES_FROM_GROUPS", default=False, cast=bool ) except Exception as e: self.logger.error( "Failed to initialize Keycloak Identity Manager: %s", str(e) ) raise self.logger.info("Keycloak Identity Manager initialized") def on_start(self, app) -> None: # if the on start process is disabled: if os.environ.get("SKIP_KEYCLOAK_ONSTART", "false") == "true": self.logger.info("Skipping keycloak on start") return # first, create all the scopes for scope in get_all_scopes(): self.logger.info("Creating scope: %s", scope) self.create_scope(scope) self.logger.info("Scope created: %s", scope) # create resource for each route for route in app.routes: self.logger.info("Creating resource for route %s", route.path) # fetch the scopes for this route from the auth dependency if isinstance(route, Route) and not isinstance(route, APIRoute): self.logger.info("Skipping route: %s", route.path) continue if not route.dependant.dependencies: self.logger.warning("Skipping unprotected route: %s", route.path) continue scopes = [] for dep in route.dependant.dependencies: # for routes that have other dependencies if not isinstance(dep.cache_key[0], KeycloakAuthVerifier): continue scopes = dep.cache_key[0].scopes # this is the KeycloakAuthVerifier dependency :) methods = list(route.methods) if len(methods) > 1: self.logger.warning( "Keep does not support multiple methods for a single route", ) continue protected_resource = methods[0] + " " + route.path dep.cache_key[0].protected_resource = protected_resource break # protected route but without scopes if not 
scopes: self.logger.warning("Route without scopes: %s", route.path) self.create_resource( protected_resource, scopes=scopes, resource_type="keep_route" ) self.logger.info("Resource created for route: %s", route.path) # another thing we need to do is to add a /auth/user/orgs endpoint that will # return the orgs of the user for TenantSwitcher in the UI if self.keycloak_multi_org: self.logger.info("Creating /auth/user/orgs endpoint") from fastapi import Depends from keep.identitymanager.identitymanagerfactory import ( IdentityManagerFactory, ) # we want to add it only once to skip endless loop current_routes = [route.path for route in app.routes] if "/auth/user/orgs" not in current_routes: self.logger.info("Adding /auth/user/orgs endpoint") # add the endpoint @app.get("/auth/user/orgs") def tenant( authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier([]) ), ): tenants = authenticated_entity.user_orgs return tenants # create resource for each object if self.abac_enabled: for resource_type, resource_type_data in self.RESOURCES.items(): self.logger.info("Creating resource for object %s", resource_type) resources = get_resource_ids_by_resource_type( tenant_id=self.tenant_id, table_name=resource_type_data["table"], uid=resource_type_data["uid"], ) for resource_id in resources: resource_name = f"{resource_type}_{resource_id}" resource_type_name = f"keep_{resource_type}" self.create_resource( resource_name=resource_name, scopes=[], resource_type=resource_type_name, ) self.logger.info("Resource created for object: %s", resource_type) for role in PREDEFINED_ROLES: self.logger.info("Creating role: %s", role) self.create_role(role, predefined=True) self.logger.info("Role created: %s", role) def _scope_name_to_id(self, all_scopes, scope_name: str) -> str: # if its ":*": if scope_name.split(":")[1] == "*": scope_verb = scope_name.split(":")[0] scope_ids = [ scope["id"] for scope in all_scopes if scope["name"].startswith(scope_verb) ] return 
scope_ids else: scope = next( (scope for scope in all_scopes if scope["name"] == scope_name), None, ) if not scope: self.logger.error( "Scope %s not found in Keycloak", scope_name, extra={"scopes": all_scopes}, ) return [] return [scope["id"]] def get_permission_by_name(self, permission_name): permissions = self.keycloak_admin.get_client_authz_permissions(self.client_id) permission = next( ( permission for permission in permissions if permission["name"] == permission_name ), None, ) return permission def create_scope_based_permission(self, role: Role, policy_id: str) -> None: try: scopes = role.scopes all_scopes = self.keycloak_admin.get_client_authz_scopes(self.client_id) scopes_ids = set() for scope in scopes: scope_ids = self._scope_name_to_id(all_scopes, scope) scopes_ids.update(scope_ids) resp = self.keycloak_admin.create_client_authz_scope_permission( client_id=self.client_id, payload={ "name": f"Permission for {role.name}", "scopes": list(scopes_ids), "policies": [policy_id], "resources": [], "decisionStrategy": "Affirmative".upper(), "type": "scope", "logic": "POSITIVE", }, ) return resp except KeycloakPostError as e: # if the permissions already exists, just update it if "already exists" in str(e): self.logger.info("Scope based permission already exists in Keycloak") # let's try to update try: permission = self.get_permission_by_name( f"Permission for {role.name}" ) permission_id = permission.get("id") resp = self.keycloak_admin.connection.raw_put( path=f"{self.admin_url}/authz/resource-server/permission/scope/{permission_id}", client_id=self.client_id, data=json.dumps( { "name": f"Permission for {role.name}", "scopes": list(scopes_ids), "policies": [policy_id], "resources": [], "decisionStrategy": "Affirmative".upper(), "type": "scope", "logic": "POSITIVE", } ), ) except Exception: pass else: self.logger.error( "Failed to create scope based permission in Keycloak: %s", str(e) ) raise HTTPException( status_code=500, detail="Failed to create scope based 
def create_scope(self, scope: str) -> None:
    """Register an authorization scope on the Keycloak client.

    The scope is created with ``scope`` as its name and
    ``"Scope for <scope>"`` as its display name.

    Raises:
        HTTPException: 500 when Keycloak rejects the request
            (``KeycloakPostError``); the error is logged first.
    """
    scope_payload = {
        "name": scope,
        "displayName": f"Scope for {scope}",
    }
    try:
        self.keycloak_admin.create_client_authz_scopes(self.client_id, scope_payload)
    except KeycloakPostError as creation_error:
        self.logger.error(
            "Failed to create scopes in Keycloak: %s", str(creation_error)
        )
        raise HTTPException(status_code=500, detail="Failed to create scopes")
def update_role(self, role_id: str, role: Role) -> str:
    """Sync the scopes of a role's scope-based permission in Keycloak.

    The ``role_id`` argument is not used: the id is looked up again from
    ``role.name``. The permission named ``"Permission for <role.name>"`` is
    fetched, its ``scopes`` replaced by the ids resolved from
    ``role.scopes``, and the result PUT back to Keycloak.

    Returns:
        The role id resolved from ``role.name``.

    Raises:
        HTTPException: 404 when the role's permission does not exist.
    """
    # just update the policy
    resolved_role_id = self.keycloak_admin.get_client_role_id(
        self.client_id, role.name
    )

    requested_scopes = role.scopes
    known_scopes = self.keycloak_admin.get_client_authz_scopes(self.client_id)
    granted_scope_ids = set()
    for scope_name in requested_scopes:
        granted_scope_ids.update(self._scope_name_to_id(known_scopes, scope_name))

    # get the scope-based permission
    role_permission = None
    for candidate in self.keycloak_admin.get_client_authz_permissions(self.client_id):
        if candidate["name"] == f"Permission for {role.name}":
            role_permission = candidate
            break
    if not role_permission:
        raise HTTPException(status_code=404, detail="Permission not found")

    permission_id = role_permission["id"]
    role_permission["scopes"] = list(granted_scope_ids)
    endpoint = (
        f"{self.admin_url}/authz/resource-server/permission/scope/{permission_id}"
    )
    response = self.keycloak_admin.connection.raw_put(
        endpoint,
        data=json.dumps(role_permission),
    )
    response.raise_for_status()
    return resolved_role_id
def get_sso_wizard_url(self, authenticated_entity: AuthenticatedEntity) -> str:
    """Build the URL of the Keycloak SSO wizard for the entity's organization.

    The realm and org id come from ``authenticated_entity.org_realm`` and
    ``authenticated_entity.org_id``.

    ``self.server_url`` is normalised first. The URL used to be assembled as
    ``{server_url}realms`` in one place and ``{server_url}/realms`` in the
    other, so one of the two halves was malformed whether or not the
    configured base URL ended with a slash.
    """
    tenant_realm = authenticated_entity.org_realm
    org_id = authenticated_entity.org_id
    base_url = self.server_url.rstrip("/")
    realm_url = f"{base_url}/realms/{tenant_realm}"
    return f"{realm_url}/wizard/?org_id={org_id}/#iss={realm_url}"
def get_user_current_role(self, user_id: str) -> str:
    """Return the name of the user's first client role, or ``None``.

    Reads the user's role mappings from the admin API, takes the
    ``clientMappings`` entry keyed by ``self.realm``, ignores the
    ``uma_protection`` role, and returns the first remaining role name.
    ``None`` is returned when there are no mappings or only
    ``uma_protection`` is left.
    """
    mappings_url = self.admin_url_without_client + f"/users/{user_id}/role-mappings"
    response = self.keycloak_admin.connection.raw_get(mappings_url)
    client_mappings = response.json().get("clientMappings", {})
    mapped_roles = client_mappings.get(self.realm, {}).get("mappings")
    if not mapped_roles:
        return None

    role_names = [mapped_role["name"] for mapped_role in mapped_roles]
    # remove uma protection; if it is the only role, the user has no role
    assignable = [name for name in role_names if name != "uma_protection"]
    return assignable[0] if assignable else None
update_user(self, user_email: str, update_data: dict) -> dict: try: user_id = self.get_user_id_by_email(user_email) if "role" in update_data and update_data["role"]: role = update_data["role"] # get current role and understand if needs to be updated: current_role = self.get_user_current_role(user_id) # update the role only if its different than current # TODO: more than one role if current_role != role: role_id = self.keycloak_admin.get_client_role_id( self.client_id, role ) if not role_id: self.logger.error("Role does not exists") raise HTTPException( status_code=404, detail="Role does not exists" ) self.keycloak_admin.assign_client_role( client_id=self.client_id, user_id=user_id, roles=[{"id": role_id, "name": role}], ) if "groups" in update_data and update_data["groups"]: # get the current groups groups = self.keycloak_admin.get_user_groups(user_id) groups_ids = [g.get("id") for g in groups] # calc with groups needs to be removed and which to be added groups_to_remove = [ group_id for group_id in groups_ids if group_id not in update_data["groups"] ] groups_to_add = [ group for group in update_data["groups"] if group not in groups_ids ] # remove for group in groups_to_remove: self.logger.info("Leaving group") resp = self.keycloak_admin.connection.raw_delete( f"{self.admin_url_without_client}/users/{user_id}/groups/{group}" ) resp.raise_for_status() self.logger.info("Left group") # add for group in groups_to_add: self.logger.info("Joining group") self.add_user_to_group(user_id=user_id, group=group) self.logger.info("Joined group") return {"status": "success", "message": "User updated successfully"} except KeycloakPostError as e: self.logger.error("Failed to update user in Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to update user") def delete_user(self, user_email: str) -> dict: try: user_id = self.get_user_id_by_email(user_email) self.keycloak_admin.delete_user(user_id) # delete the policy for the user (if not implicitly deleted?) 
return {"status": "success", "message": "User deleted successfully"} except KeycloakDeleteError as e: self.logger.error("Failed to delete user from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to delete user") def get_auth_verifier(self, scopes: list) -> AuthVerifierBase: return KeycloakAuthVerifier(scopes) def create_resource( self, resource_name: str, scopes: list[str] = [], resource_type="keep_generic", attributes={}, ) -> None: resource = { "name": resource_name, "displayName": f"Resource for {resource_name}", "type": "urn:keep:resources:" + resource_type, "scopes": [{"name": scope} for scope in scopes], "attributes": attributes, } try: self.keycloak_admin.create_client_authz_resource(self.client_id, resource) except KeycloakPostError as e: if "already exists" in str(e): self.logger.info("Resource already exists in Keycloak") pass else: self.logger.error("Failed to create resource in Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to create resource") def delete_resource(self, resource_id: str) -> None: try: resources = self.keycloak_admin.get_client_authz_resources( os.environ["KEYCLOAK_CLIENT_ID"] ) for resource in resources: if resource["uris"] == ["/resource/" + resource_id]: self.keycloak_admin.delete_client_authz_resource( os.environ["KEYCLOAK_CLIENT_ID"], resource["id"] ) except KeycloakDeleteError as e: self.logger.error("Failed to delete resource from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to delete resource") def get_groups(self) -> list[dict]: try: groups = self.keycloak_admin.get_groups( query={"briefRepresentation": False} ) result = [] for group in groups: group_id = group["id"] group_name = group["name"] roles = group.get("clientRoles", {}).get("keep", []) # Fetch members for each group members = self.keycloak_admin.get_group_members(group_id) member_names = [member.get("email", "") for member in members] member_count = len(members) result.append( Group( 
id=group_id, name=group_name, roles=roles, memberCount=member_count, members=member_names, ) ) return result except KeycloakGetError as e: self.logger.error("Failed to fetch groups from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to fetch groups") def create_user_policy(self, perm, permission: ResourcePermission) -> None: # we need the user id from email: # TODO: this is not efficient, we should cache this users = self.keycloak_admin.get_users({}) user = next( (user for user in users if user.get("email") == perm.id), None, ) if not user: raise HTTPException(status_code=400, detail="User not found") resp = self.keycloak_admin.connection.raw_post( f"{self.admin_url}/authz/resource-server/policy/user", data=json.dumps( { "name": f"Allow user {user.get('id')} to access resource type {permission.resource_type} with name {permission.resource_name}", "description": json.dumps( { "user_id": user.get("id"), "user_email": user.get("email"), "resource_id": permission.resource_id, } ), "logic": "POSITIVE", "users": [user.get("id")], } ), ) try: resp.raise_for_status() # 409 is ok, it means the policy already exists except Exception as e: if resp.status_code != 409: raise e # just continue to next policy else: return None policy_id = resp.json().get("id") return policy_id def create_group_policy(self, perm, permission: ResourcePermission) -> None: group_name = perm.id group = self.keycloak_admin.get_groups(query={"search": perm.id}) if not group or len(group) > 1: self.logger.error("Problem with group - should be 1 but got %s", len(group)) raise HTTPException(status_code=400, detail="Problem with group") group = group[0] group_id = group["id"] resp = self.keycloak_admin.connection.raw_post( f"{self.admin_url}/authz/resource-server/policy/group", data=json.dumps( { "name": f"Allow group {perm.id} to access resource type {permission.resource_type} with name {permission.resource_name}", "description": json.dumps( { "group_name": group_name, 
"group_id": group_id, "resource_id": permission.resource_id, } ), "logic": "POSITIVE", "groups": [{"id": group_id, "extendChildren": False}], "groupsClaim": "", } ), ) try: resp.raise_for_status() # 409 is ok, it means the policy already exists except Exception as e: if resp.status_code != 409: raise e # just continue to next policy else: return None policy_id = resp.json().get("id") return policy_id def create_permissions(self, permissions: list[ResourcePermission]) -> None: # create or update try: existing_permissions = self.keycloak_admin.get_client_authz_permissions( self.client_id, ) existing_permission_names_to_permissions = { permission["name"]: permission for permission in existing_permissions } for permission in permissions: # 1. first, create the resource if its not already created resp = self.keycloak_admin.create_client_authz_resource( self.client_id, { "name": permission.resource_id, "displayName": permission.resource_name, "type": "urn:keep:resources:keep_" + permission.resource_type, "scopes": [], }, skip_exists=True, ) # 2. create the policy if it doesn't exist: policies = [] for perm in permission.permissions: try: if perm.type == "user": policy_id = self.create_user_policy(perm, permission) if policy_id: policies.append(policy_id) else: self.logger.info("Policy already exists in Keycloak") else: policy_id = self.create_group_policy(perm, permission) if policy_id: policies.append(policy_id) else: self.logger.info("Policy already exists in Keycloak") except KeycloakPostError as e: if "already exists" in str(e): self.logger.info("Policy already exists in Keycloak") # its ok! pass else: self.logger.error( "Failed to create policy in Keycloak: %s", str(e) ) raise HTTPException( status_code=500, detail="Failed to create policy" ) except Exception as e: self.logger.error( "Failed to create policy in Keycloak: %s", str(e) ) raise HTTPException( status_code=500, detail="Failed to create policy" ) # 3. 
Finally, create the resource # 3.0 try to get the resource based permission permission_name = f"Permission on resource type {permission.resource_type} with name {permission.resource_name}" if existing_permission_names_to_permissions.get(permission_name): # update the permission existing_permissions = existing_permission_names_to_permissions[ permission_name ] existing_permission_id = existing_permissions["id"] # if no new policies, continue if not policies: existing_permissions["policies"] = [] else: # add the new policies associated_policies = self.keycloak_admin.get_client_authz_permission_associated_policies( self.client_id, existing_permission_id ) existing_permissions["policies"] = [ policy["id"] for policy in associated_policies ] existing_permissions["policies"].extend(policies) # update the policy to include the new policy resp = self.keycloak_admin.connection.raw_put( f"{self.admin_url}/authz/resource-server/permission/resource/{existing_permission_id}", data=json.dumps(existing_permissions), ) resp.raise_for_status() else: # 3.2 else, create it self.keycloak_admin.create_client_authz_resource_based_permission( self.client_id, { "type": "resource", "name": f"Permission on resource type {permission.resource_type} with name {permission.resource_name}", "scopes": [], "policies": policies, "resources": [ permission.resource_id, ], "decisionStrategy": "Affirmative".upper(), }, ) except KeycloakPostError as e: if "already exists" in str(e): self.logger.info("Permission already exists in Keycloak") raise HTTPException(status_code=409, detail="Permission already exists") else: self.logger.error( "Failed to create permissions in Keycloak: %s", str(e) ) raise HTTPException( status_code=500, detail="Failed to create permissions" ) except Exception as e: self.logger.error("Failed to create permissions in Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to create permissions") def get_permissions(self) -> list[ResourcePermission]: try: 
resources = self.keycloak_admin.get_client_authz_resources(self.client_id) resources_to_policies = {} permissions = self.keycloak_admin.get_client_authz_permissions( self.client_id ) for permission in permissions: # if its a scope permission, skip it if permission["type"] == "scope": continue permission_id = permission["id"] associated_policies = ( self.keycloak_admin.get_client_authz_permission_associated_policies( self.client_id, permission_id ) ) for policy in associated_policies: try: details = json.loads(policy["description"]) # with Keep convention, the description should be a json except json.JSONDecodeError: self.logger.warning( "Failed to parse policy description: %s", policy["description"], ) continue resource_id = details["resource_id"] if resource_id not in resources_to_policies: resources_to_policies[resource_id] = [] if policy.get("type") == "user": user_email = details.get("user_email") resources_to_policies[resource_id].append( {"id": user_email, "type": "user"} ) else: group_name = details.get("group_name") resources_to_policies[resource_id].append( {"id": group_name, "type": "group"} ) permissions_dto = [] for resource in resources: resource_id = resource["name"] resource_name = resource["displayName"] resource_type = resource["type"] permissions_dto.append( ResourcePermission( resource_id=resource_id, resource_name=resource_name, resource_type=resource_type, permissions=[ PermissionEntity( id=policy["id"], name=policy.get("name", ""), type=policy["type"], ) for policy in resources_to_policies.get(resource_id, []) ], ) ) return permissions_dto except KeycloakGetError as e: self.logger.error("Failed to fetch permissions from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to fetch permissions") except Exception as e: self.logger.error("Failed to fetch permissions from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to fetch permissions") # TODO: this should use UMA and not evaluation since 
evaluation needs admin access def get_user_permission_on_resource_type( self, resource_type: str, authenticated_entity: AuthenticatedEntity ) -> list[ResourcePermission]: """ Get permissions for a specific user on a specific resource type. Args: resource_type (str): The type of resource for which to retrieve permissions. user_id (str): The ID of the user for which to retrieve permissions. Returns: list: A list of permission objects. """ # there is two ways to do this: # 1. admin api # 2. token endpoint directly # we will use the admin api and put (2) on TODO # https://keycloak.discourse.group/t/keyycloak-authz-policy-evaluation-using-rest-api/798/2 # https://keycloak.discourse.group/t/how-can-i-evaluate-user-permission-over-rest-api/10619 # also, we should see how it scale with many resources try: user_id = self.keycloak_admin.get_user_id(authenticated_entity.email) resource_type = f"urn:keep:resources:keep_{resource_type}" resp = self.keycloak_admin.connection.raw_post( f"{self.admin_url}/authz/resource-server/policy/evaluate", data=json.dumps( { "userId": user_id, "resources": [ { "type": resource_type, } ], "context": {"attributes": {}}, "clientId": self.client_id, } ), ) results = resp.json() results = results.get("results", []) allowed_resources_ids = [ result["resource"]["name"] for result in results if result["status"] == "PERMIT" ] # there is some bug/limitation in keycloak where if the resource_type does not exist, it returns # all other objects, so lets handle it by checking if the word "with" is one of the results name if any("with" in result for result in allowed_resources_ids): return [] return allowed_resources_ids except Exception as e: self.logger.error( "Failed to fetch user permissions from Keycloak: %s", str(e) ) raise HTTPException( status_code=500, detail="Failed to fetch user permissions" ) def get_policies(self) -> list[dict]: try: policies = self.keycloak_admin.connection.raw_get( f"{self.admin_url}/authz/resource-server/policy" ).json() 
return policies except KeycloakGetError as e: self.logger.error("Failed to fetch policies from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to fetch policies") def get_roles(self) -> list[Role]: """ Get roles in the identity manager for authorization purposes. This method is used to retrieve the roles that have been defined in the identity manager. It returns a list of role objects, each containing the resource, scope, and user or group information. # TODO: Still to review if this is the correct way to fetch roles """ try: roles = self.keycloak_admin.get_client_roles( self.client_id, brief_representation=False ) # filter out the uma role roles = [role for role in roles if role["name"] != "uma_protection"] roles_dto = { role.get("id"): Role( id=role.get("id"), name=role["name"], description=role["description"], scopes=set([]), # will populate this later predefined=( True if role.get("attributes", {}).get("predefined", ["false"])[0] == "true" else False ), ) for role in roles } # now for each role we need to get the scopes policies = self.keycloak_admin.get_client_authz_policies(self.client_id) roles_related_policies = [ policy for policy in policies if policy.get("config", {}).get("roles", []) ] for policy in roles_related_policies: role_id = json.loads(policy["config"]["roles"])[0].get("id") policy_id = policy["id"] # get dependent permissions dependentPolicies = self.keycloak_admin.connection.raw_get( f"{self.admin_url}/authz/resource-server/policy/{policy_id}/dependentPolicies", ).json() dependentPoliciesId = dependentPolicies[0].get("id") scopes = self.keycloak_admin.connection.raw_get( f"{self.admin_url}/authz/resource-server/policy/{dependentPoliciesId}/scopes", ).json() scope_names = [scope["name"] for scope in scopes] # happens only when delete role fails from some resaon if role_id not in roles_dto: self.logger.warning("Role not found for policy, skipping") continue roles_dto[role_id].scopes.update(scope_names) return 
list(roles_dto.values()) except KeycloakGetError as e: self.logger.error("Failed to fetch roles from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to fetch roles") def get_role_by_role_name(self, role_name: str) -> Role: roles = self.get_roles() role = next((role for role in roles if role.name == role_name), None) if not role: self.logger.error("Role not found") raise HTTPException(status_code=404, detail="Role not found") return role def delete_role(self, role_id: str) -> None: try: # delete the role resp = self.keycloak_admin.connection.raw_delete( f"{self.admin_url_without_client}/roles-by-id/{role_id}", ) resp.raise_for_status() # delete the policy policies = self.get_policies() for policy in policies: roles = json.loads(policy.get("config", {}).get("roles", "{}")) if roles and roles[0].get("id") == role_id: policy_id = policy.get("id") break if not policy_id: self.logger.warning("Policy not found for role deletion, skipping") else: self.logger.info("Deleteing policy id") self.keycloak_admin.delete_client_authz_policy( self.client_id, policy_id ) self.logger.info("Policy id deleted") # permissions gets deleted impliclty when we delete the policy except KeycloakDeleteError as e: self.logger.error("Failed to delete role from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to delete role") def create_group( self, group_name: str, members: list[str], roles: list[str] ) -> None: try: # create it group_id = self.keycloak_admin.create_group( { "name": group_name, } ) # add members for member in members: user_id = self.get_user_id_by_email(member) self.keycloak_admin.group_user_add(user_id=user_id, group_id=group_id) # assign roles for role in roles: role_id = self.keycloak_admin.get_client_role_id(self.client_id, role) self.keycloak_admin.assign_group_client_roles( client_id=self.client_id, group_id=group_id, roles=[{"id": role_id, "name": role}], ) except KeycloakPostError as e: if "already exists" in str(e): 
self.logger.info("Group already exists in Keycloak") pass else: self.logger.error("Failed to create group in Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to create group") def update_group( self, group_name: str, members: list[str], roles: list[str] ) -> None: try: # get the group id groups = self.keycloak_admin.get_groups(query={"search": group_name}) if not groups: self.logger.error("Group not found") raise HTTPException(status_code=404, detail="Group not found") group_id = groups[0]["id"] # check what members needs to be added and which to be removed existing_members = self.keycloak_admin.get_group_members(group_id) existing_members = [member.get("email") for member in existing_members] members_to_add = [ member for member in members if member not in existing_members ] members_to_remove = [ member for member in existing_members if member not in members ] # remove members for member in members_to_remove: user_id = self.get_user_id_by_email(member) self.keycloak_admin.group_user_remove( user_id=user_id, group_id=group_id ) # add members for member in members_to_add: user_id = self.get_user_id_by_email(member) self.keycloak_admin.group_user_add(user_id=user_id, group_id=group_id) # check what roles needs to be added and which to be removed existing_roles = self.keycloak_admin.get_group_client_roles( client_id=self.client_id, group_id=group_id ) existing_roles = [role["name"] for role in existing_roles] roles_to_add = [role for role in roles if role not in existing_roles] roles_to_remove = [role for role in existing_roles if role not in roles] # remove roles for role in roles_to_remove: role_id = self.keycloak_admin.get_client_role_id(self.client_id, role) self.keycloak_admin.connection.raw_delete( f"{self.admin_url_without_client}/groups/{group_id}/role-mappings/clients/{self.client_id}", payload={ "client": self.client_id, "group": group_id, "roles": [{"id": role_id, "name": role}], }, ) # assign roles for role in roles_to_add: 
role_id = self.keycloak_admin.get_client_role_id(self.client_id, role) self.keycloak_admin.assign_group_client_roles( client_id=self.client_id, group_id=group_id, roles=[{"id": role_id, "name": role}], ) except KeycloakPostError as e: self.logger.error("Failed to update group in Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to update group") def delete_group(self, group_name: str) -> None: try: groups = self.keycloak_admin.get_groups(query={"search": group_name}) if not groups: self.logger.error("Group not found") raise HTTPException(status_code=404, detail="Group not found") group_id = groups[0]["id"] self.keycloak_admin.delete_group(group_id) except KeycloakDeleteError as e: self.logger.error("Failed to delete group from Keycloak: %s", str(e)) raise HTTPException(status_code=500, detail="Failed to delete group") ================================================ FILE: elk/README.md ================================================ # ELK-stack integration This directory contains the configuration files and Docker services needed to run Keep with a filebeat container. Useful if you want to test integration of Keep backend logs with Logstash and Kibana. ## Directory Structure ``` proxy/ ├── docker-compose-elk.yml # Docker Compose configuration for elk integtation ├── filebeat.yaml # Filebeat configuration file ├── logstash.conf # Logstash configuration example to save keep-backend logs └── README.md # This files ``` ## Components The setup consists of several services: - **Filebeat**: Filebeat container to push keep-backend logs to logstash - **Keep Frontend**: The Keep UI service configured to use the proxy - **Keep Backend**: The Keep API service - **Keep WebSocket**: The WebSocket server for real-time updates ## Configuration ### Environment Variables ```env LOGSTASH_HOST=logstash-host LOGSTASH_PORT=5044 ``` ### Usage 1. Start the elk environment: ```bash docker compose -f docker-compose-elk.yml up ``` 2. 
To run in detached mode: ```bash docker compose -f docker-compose-elk.yml up -d ``` 3. To stop all services: ```bash docker compose -f docker-compose-elk.yml down ``` ### Accessing Services - Keep Backend: http://localhost:8080 - Kibana: http://localhost:5601 ### Kibana configuration - Go to http://localhost:5601/app/discover - Click "Create Data view" - Add any name you want - Set the index pattern to `keep-backend-logs-*` - Save the data view and inspect the logs ## Custom Configuration ### Modifying Filebeat Settings To modify the Filebeat configuration: 1. Edit `filebeat.yml` 2. Restart the filebeat service: ```bash docker compose -f docker-compose-elk.yml restart filebeat ``` ### Modifying Logstash Settings To modify the Logstash configuration: 1. Edit `logstash.conf` 2. Restart the logstash service: ```bash docker compose -f docker-compose-elk.yml restart logstash ``` ## Security Considerations - This setup is intended for development environments only - SSL is disabled for all services for simplification ## Contributing When modifying the elk setup: 1. Document any changes to configuration files 2. Test the setup of elk environments 3.
Update this README if adding new features or configurations ================================================ FILE: elk/docker-compose-elk.yml ================================================ services: keep-backend-elk: extends: file: ../docker-compose.common.yml service: keep-backend-common image: us-central1-docker.pkg.dev/keephq/keep/keep-api environment: - AUTH_TYPE=NO_AUTH volumes: - ./state:/state keep-websocket-server: extends: file: ../docker-compose.common.yml service: keep-websocket-server-common elastic: image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0 labels: co.elastic.logs/module: elasticsearch volumes: - elastic_data:/usr/share/elasticsearch/data ports: - "9200:9200" environment: - node.name=elastic - cluster.name=keep-elk - discovery.type=single-node - ELASTIC_PASSWORD=elastic - bootstrap.memory_lock=true - xpack.security.enabled=false - xpack.security.enrollment.enabled=false - xpack.security.transport.ssl.enabled=false - xpack.license.self_generated.type=basic kibana: depends_on: - elastic image: docker.elastic.co/kibana/kibana:8.17.0 labels: co.elastic.logs/module: kibana volumes: - kibana_data:/usr/share/kibana/data ports: - 5601:5601 environment: - SERVERNAME=kibana - ELASTICSEARCH_HOSTS=http://elastic:9200 - ELASTICSEARCH_USERNAME=kibana_system - ELASTICSEARCH_PASSWORD=kibana - XPACK_APM_SERVICEMAPENABLED="true" - XPACK_ENCRYPTEDSAVEDOBJECTS_ENCRYPTIONKEY=${ENCRYPTION_KEY} filebeat: image: docker.elastic.co/beats/filebeat:8.17.0 container_name: filebeat user: root volumes: - /var/lib/docker/containers:/var/lib/docker/containers:ro - /var/run/docker.sock:/var/run/docker.sock:ro - ./filebeat.yml:/usr/share/filebeat/filebeat.yml:ro environment: - LOGSTASH_HOST=logstash01 command: [ "--strict.perms=false" ] # Disable strict permissions to avoid permission errors logstash: depends_on: - elastic - kibana image: docker.elastic.co/logstash/logstash:8.17.0 labels: co.elastic.logs/module: logstash user: root ports: - "5001:5000" - "5044:5044" 
- "9600:9600" volumes: - logstash_data:/usr/share/logstash/data - "./logstash.conf:/usr/share/logstash/pipeline/logstash.conf:ro" environment: - xpack.monitoring.enabled=false - ELASTIC_USER=elastic - ELASTIC_PASSWORD=elastic - ELASTIC_HOSTS=http://elastic:9200 volumes: elastic_data: kibana_data: logstash_data: ================================================ FILE: elk/filebeat.yml ================================================ filebeat.inputs: - type: container paths: - /var/lib/docker/containers/*/*.log stream: stdout # Only capture stdout json.keys_under_root: true # Parse JSON-formatted logs automatically json.add_error_key: true # Add error field if JSON parsing fails processors: - decode_json_fields: fields: [ "message" ] # Try to decode the `message` field as JSON target: "" # Merge decoded fields at the root level overwrite_keys: true # Overwrite existing keys if present - add_docker_metadata: # Enrich logs with Docker metadata host: "unix:///var/run/docker.sock" - drop_event: when.not.contains.container.labels: com_docker_compose_service: "keep-backend-elk" output.logstash: hosts: ["logstash:5044"] # Replace with your Logstash host and port logging.level: info # Set Filebeat logging level ================================================ FILE: elk/logstash.conf ================================================ input { beats { port => 5044 # Match the port used in Filebeat configuration } } filter { json { source => "message" } } output { stdout { codec => rubydebug } # For debugging elasticsearch { hosts => ["http://elastic:9200"] index => "keep-backend-logs-%{+YYYY.MM.dd}" } } ================================================ FILE: examples/providers/airflow-prod.yaml ================================================ name: airflow-prod type: airflow deduplication_rules: airflow-prod-default: description: "Default deduplication rule for Airflow Production" fingerprint_fields: - fingerprint full_deduplication: true ignore_fields: - name - lastReceived 
================================================ FILE: examples/providers/telegram-bot.yaml ================================================ name: telegram-bot type: telegram authentication: # Use environment variables to store sensitive information bot_token: "$(TELEGRAM_BOT_TOKEN)" ================================================ FILE: examples/workflows/aks_basic.yml ================================================ workflow: id: aks-pod-status-monitor name: AKS Pod Status Monitor description: Retrieves and displays status information for all pods in an AKS cluster, including pod names, namespaces, and current phase. triggers: - type: manual steps: # get all pods - name: get-pods provider: type: aks config: "{{ providers.aks }}" with: command_type: get_pods actions: - name: echo-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod name: {{ foreach.value.metadata.name }} || Namespace: {{ foreach.value.metadata.namespace }} || Status: {{ foreach.value.status.phase }}" ================================================ FILE: examples/workflows/autosupress.yml ================================================ workflow: id: automatic-alert-suppression name: Automatic Alert Suppression strategy: parallel description: Automatically suppresses incoming alerts by marking them as dismissed, useful for handling known or expected alert conditions. triggers: - type: alert actions: - name: dismiss-alert provider: type: mock with: enrich_alert: - key: dismissed value: "true" ================================================ FILE: examples/workflows/bash_example.yml ================================================ workflow: id: python-service-monitor name: Python Service Monitor description: Monitors a Python service by executing a test script and sends email notifications via Resend when the service is operational. 
triggers: - type: manual owners: [] services: [] steps: - name: run-script provider: config: "{{ providers.default-bash }}" type: bash with: command: python3 test.py timeout: 5 actions: - condition: - assert: "{{ steps.run-script.results.return_code }} == 0" name: assert-condition type: assert name: trigger-resend provider: type: resend config: "{{ providers.resend-test }}" with: _from: "onboarding@resend.dev" to: "youremail.dev@gmail.com" subject: "Python test is up!" html:

Python test is up!

================================================ FILE: examples/workflows/bigquery.yml ================================================ workflow: id: bigquery-data-freshness-monitor name: BigQuery Data Freshness Monitor description: Monitors data freshness in BigQuery tables by checking time differences and querying public datasets for validation. triggers: - type: manual steps: - name: get-max-datetime provider: type: bigquery config: "{{ providers.bigquery-prod }}" with: # Get max(datetime) from the random table query: "SELECT MAX(created_date) as date FROM `bigquery-public-data.austin_311.311_service_requests` LIMIT 1" - name: runbook-step1-bigquery-sql provider: type: bigquery config: "{{ providers.bigquery }}" with: # Get max(datetime) from the random table query: "SELECT * FROM `bigquery-public-data.austin_bikeshare.bikeshare_stations` LIMIT 10" ================================================ FILE: examples/workflows/blogpost.yml ================================================ workflow: id: critical-alert-enrichment name: Critical Alert Enrichment description: Enriches critical alerts with customer information from MySQL and creates ServiceNow incident tickets with detailed context. 
triggers: # filter on critical alerts - type: alert filters: - key: severity value: critical steps: # get the customer details - name: get-more-details provider: type: mysql config: " {{ providers.mysql-prod }} " with: query: "select * from blogpostdb.customer where customer_id = '{{ alert.customer_id }}'" single_row: true as_dict: true enrich_alert: - key: customer_name value: results.name - key: customer_email value: results.email - key: customer_tier value: results.tier actions: # Create service now incident ticket - name: create-service-now-ticket # if the alert already assigned a ticket, skip it if: "not '{{ alert.ticket_id }}'" provider: type: servicenow config: " {{ providers.servicenow-prod }} " with: table_name: INCIDENT payload: short_description: "{{ alert.name }} - {{ alert.description }} [created by Keep][fingerprint: {{alert.fingerprint}}]" description: "{{ alert.description }}" enrich_alert: - key: ticket_type value: servicenow - key: ticket_id value: results.sys_id - key: ticket_url value: results.link - key: ticket_status value: results.stage - key: table_name value: "{{ alert.annotations.ticket_type }}" - key: ticket_number value: results.number ================================================ FILE: examples/workflows/businesshours.yml ================================================ workflow: id: business-hours-alert-handler name: Business Hours Alert Handler description: Processes alerts only during specified business hours in the America/New York timezone, preventing off-hours notifications. 
triggers: - type: alert - type: manual actions: - name: dismiss-alert if: "keep.is_business_hours(timezone='America/New_York')" provider: type: mock with: enrich_alert: - key: buisnesshours value: "true" ================================================ FILE: examples/workflows/change.yml ================================================ workflow: id: alert-status-change-monitor name: Alert Status Change Monitor description: Triggers workflow actions specifically when an alert's status field changes, useful for status-based notifications. triggers: - type: alert only_on_change: - status actions: - name: echo-test provider: type: console with: message: "Hello world" ================================================ FILE: examples/workflows/clickhouse_multiquery.yml ================================================ workflow: id: clickhouse-multi-query-monitor name: ClickHouse Multi-Query Monitor description: Executes multiple ClickHouse queries to monitor system health and creates ServiceNow tickets when issues are detected. 
triggers: - type: manual steps: - name: clickhouse-observability-urls provider: config: "{{ providers.clickhouse }}" type: clickhouse with: query: | SELECT Url, Status FROM "observability"."Urls" WHERE ( Url LIKE '%te_tests%' ) AND Timestamp >= toStartOfMinute(date_add(toDateTime(NOW()), INTERVAL -1 MINUTE)) AND Status = 0; - name: clickhouse-observability-events provider: config: "{{ providers.clickhouse }}" type: clickhouse with: query: | SELECT arrayElement(Metrics.testName, 1) AS mytest FROM observability.Events WHERE (Sources = 'ThousandEyes') AND (Timestamp >= toStartOfMinute(toDateTime(NOW()) + toIntervalMinute(-1))) AND (mytest = 'Oceanspot-TE') - name: clickhouse-observability-traces provider: config: "{{ providers.clickhouse }}" type: clickhouse with: query: | SELECT count(*) as c FROM "observability"."Traces" WHERE ( SpanName LIKE '%te_tests%' ) AND Timestamp >= toStartOfMinute(date_add(toDateTime(NOW()), INTERVAL -1 MINUTE)); - name: clickhouse-observability-follow-up-query # if any of the previous queries return results, run this query if: keep.len( {{ steps.clickhouse-observability-urls.results }} ) or keep.len( {{ steps.clickhouse-observability-events.results }} ) or keep.len( {{ steps.clickhouse-observability-traces.results }} ) provider: config: "{{ providers.clickhouse }}" type: clickhouse with: query: | SELECT Url, Status FROM "observability"."Urls" WHERE ( Url LIKE '%te_tests%' ) AND Timestamp >= toStartOfMinute(date_add(toDateTime(NOW()), INTERVAL -1 MINUTE)) AND Status = 0; actions: - name: snow-action # if any of the previous queries return results, run this query if: keep.len( {{ steps.clickhouse-observability-urls.results }} ) or keep.len( {{ steps.clickhouse-observability-events.results }} ) or keep.len( {{ steps.clickhouse-observability-traces.results }} ) provider: type: servicenow config: "{{ providers.servicenow }}" with: table_name: "yourtablename" payload: short_description: "Results returned for clickhouse-observability" 
description: | Urls: {{ steps.clickhouse-observability-urls.results }} Events: {{ steps.clickhouse-observability-events.results }} Traces: {{ steps.clickhouse-observability-traces.results }} ================================================ FILE: examples/workflows/complex-conditions-cel.yml ================================================ workflow: id: complex-conditions-monitor-cel name: Complex Conditions Monitor (CEL) description: Monitors alerts with complex conditions using CEL filters. triggers: - type: alert cel: (source.contains("datadog") && severity == "critical") || (source.contains("newrelic") && severity == "error") actions: - name: notify provider: type: console with: message: "Critical Datadog or error NewRelic alert: {{ alert.name }}" ================================================ FILE: examples/workflows/conditionally_run_if_ai_says_so.yaml ================================================ workflow: id: ai-guided-mysql-cleanup name: AI-Guided MySQL Cleanup description: Uses OpenAI to intelligently determine whether to run MySQL table cleanup operations based on alert context. triggers: - type: incident events: - updated - created steps: - name: ask-openai-if-this-workflow-is-applicable provider: config: "{{ providers.my_openai }}" type: openai with: prompt: "There is a task cleaning MySQL database. Should we run the task if we received an alert with such a name {{ alert.name }}?" 
model: "gpt-4o-mini" # This model supports structured output structured_output_format: # We limit what model could return type: json_schema json_schema: name: workflow_applicability schema: type: object properties: should_run: type: boolean description: "Whether the workflow should be executed based on the alert" required: ["should_run"] additionalProperties: false strict: true actions: - name: clean-db-step if: "{{ steps.ask-openai-if-this-workflow-is-applicable.results.response.should_run }}" provider: config: "{{ providers.mysql }}" type: mysql with: query: DELETE FROM bookstore.cache ORDER BY id DESC LIMIT 100; ================================================ FILE: examples/workflows/console_example.yml ================================================ workflow: id: console-logger name: Console Logger description: Simple workflow demonstrating console logging functionality with customizable messages. triggers: - type: manual actions: - name: echo provider: type: console with: logger: true message: "Hey" ================================================ FILE: examples/workflows/consts_and_dict.yml ================================================ workflow: id: consts-severity-queries-mapping name: Severity and Queries Mapping Example description: Demonstrates how to use constant mappings to standardize alert severity levels and queries. 
triggers: - type: manual consts: ts: 1748465504 queries: get-all-tables: query: "SELECT table_name FROM information_schema.tables;" user-query: query: "select * from user where user.id == %user_id%;" severities: s1: critical s2: error s3: warning s4: info critical: critical error: error steps: - name: print-user-query provider: type: console with: message: keep.replace('{{consts.queries.user-query.query}}', '%user_id%', '999') # will print "select * from user where user.id == 999;" actions: - name: echo provider: type: console with: logger: true message: keep.dictget({{ consts.severities }}, '{{ alert.severity }}', 'info') ================================================ FILE: examples/workflows/consts_and_vars.yml ================================================ workflow: id: tiered-alert-notification-system name: Tiered Alert Notification System description: Implements a sophisticated multi-tier alert notification system with escalating notifications to email and Slack based on alert duration. triggers: - type: alert filters: - key: source value: "openobserve" # consts block for email_template and slack_message consts: email_template: | Hi,
This {{ vars.alert_tier }} is triggered because the pipelines for {{ alert.host }} are down for more than keep.get_firing_time('{{ alert }}', 'minutes') minutes.
Please visit monitoring.keephq.dev for more!
Regards,
KeepHQ dev Monitoring
slack_message: | {{ vars.alert_tier }} Alert: SA Pipelines are down Hi, This {{ vars.alert_tier }} alert is triggered because the pipelines for {{ alert.host }} are down for more than keep.get_firing_time('{{ alert }}', 'minutes') minutes. Please visit monitoring.keeohq.dev for more! actions: # Sendgrid Tier 0 Alert - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 0 and keep.get_firing_time('{{ alert }}', 'minutes') < 10" name: Sendgrid_Tier_0_alert vars: alert_tier: "Alert 0" provider: config: "{{ providers.Sendgrid }}" type: sendgrid with: to: - "shahar@keephq.dev" subject: '"Tier 0 Alert: SA Pipelines are down"' html: "{{ consts.email_template }}" # Sendgrid Tier 1 Alert - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 10 and keep.get_firing_time('{{ alert }}', 'minutes') < 15" name: Sendgrid_Tier_1_alert vars: alert_tier: "Alert 1" provider: config: "{{ providers.Sendgrid }}" type: sendgrid with: to: - "shahar@keephq.dev" subject: '"Tier 1 Alert: SA Pipelines are down"' html: "{{ consts.email_template }}" # Sendgrid Tier 2 Alert - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 60 and keep.get_firing_time('{{ alert }}', 'minutes') < 70" name: Sendgrid_Tier_2_alert vars: alert_tier: "Alert 2" provider: config: "{{ providers.Sendgrid }}" type: sendgrid with: to: - "shahar@keephq.dev" subject: '"Tier 2 Alert: SA Pipelines are down"' html: "{{ consts.email_template }}" # Sendgrid Tier 3 Alert - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 120 and keep.get_firing_time('{{ alert }}', 'minutes') < 130" name: Sendgrid_Tier_3_alert vars: alert_tier: "Alert 3" provider: config: "{{ providers.Sendgrid }}" type: sendgrid with: to: - "shahar@keephq.dev" subject: '"Tier 3 Alert: SA Pipelines are down"' html: "{{ consts.email_template }}" # Sendgrid Tier 4 Alert - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 1440 and keep.get_firing_time('{{ alert }}', 'minutes') < 1450" name: Sendgrid_Tier_4_alert vars: alert_tier: "Alert 4" 
provider: config: "{{ providers.Sendgrid }}" type: sendgrid with: to: - "shahar@keephq.dev" subject: '"Tier 4 Alert: SA Pipelines are down"' html: "{{ consts.email_template }}" # Slack Alerts - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 0 and keep.get_firing_time('{{ alert }}', 'minutes') < 10" name: Slack_Tier_0_alert vars: alert_tier: "Alert 0" provider: config: "{{ providers.dev_slack }}" type: slack with: message: "{{ consts.slack_message }}" - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 10 and keep.get_firing_time('{{ alert }}', 'minutes') < 15" name: Slack_Tier_1_alert vars: alert_tier: "Alert 1" provider: config: "{{ providers.dev_slack }}" type: slack with: message: "{{ consts.slack_message }}" - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 60 and keep.get_firing_time('{{ alert }}', 'minutes') < 70" name: Slack_Tier_2_alert vars: alert_tier: "Alert 2" provider: config: "{{ providers.dev_slack }}" type: slack with: message: "{{ consts.slack_message }}" - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 120 and keep.get_firing_time('{{ alert }}', 'minutes') < 130" name: Slack_Tier_3_alert vars: alert_tier: "Alert 3" provider: config: "{{ providers.dev_slack }}" type: slack with: message: "{{ consts.slack_message }}" - if: "keep.get_firing_time('{{ alert }}', 'minutes') >= 1440 and keep.get_firing_time('{{ alert }}', 'minutes') < 1450" name: Slack_Tier_4_alert vars: alert_tier: "Alert 4" provider: config: "{{ providers.dev_slack }}" type: slack with: message: "{{ consts.slack_message }}" ================================================ FILE: examples/workflows/create-issue-youtrack.yaml ================================================ workflow: id: youtrack-issue-creator name: YouTrack Issue Creator description: Creates standardized issues in YouTrack with predefined templates and fields. 
disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: [] actions: - name: youtrack-action provider: type: youtrack config: "{{ providers.YouTrack }}" with: description: Users face random logout issues when logged in through Google OAuth summary: Login fails with session error ================================================ FILE: examples/workflows/create-new-incident-grafana-incident.yaml ================================================ workflow: id: grafana-incident-creator name: Grafana Incident Creator description: Creates and manages incidents in Grafana Incident with customizable severity and status. disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: [] actions: - name: grafana_incident-action provider: type: grafana_incident config: "{{ providers.incide }}" with: # Checkout https://docs.keephq.dev/providers/documentation/grafana_incident-provider for other available fields operationType: create title: Creating new incident from Keep severity: critical status: active attachURL: https://keephq.dev ================================================ FILE: examples/workflows/create-task-in-asana.yaml ================================================ workflow: id: create-task-in-asana name: Create task in asana description: asana disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: [] actions: - name: asana-action provider: type: asana config: "{{ providers.asana }}" with: name: This is a test task from Keep projects: - "1209746642330536" assignee: "1209746640089515" due_at: "2025-09-15 02:06:58.147000+00:00" ================================================ FILE: examples/workflows/create_alert_from_vm_metric.yml ================================================ # This workflow queries VictoriaMetrics metrics and creates alerts based on CPU usage workflow: # Unique identifier for this workflow id: victoriametrics-cpu-alert # Display name shown in the UI name: VictoriaMetrics CPU 
Alert # Brief description of what this workflow does description: Monitors CPU usage metrics from VictoriaMetrics and generates alerts based on configurable thresholds. # Define how the workflow is triggered triggers: - type: manual # Can be triggered manually from the UI # Steps to execute in order steps: - name: victoriametrics-step provider: # Use VictoriaMetrics provider config defined in providers.vm config: "{{ providers.vm }}" type: victoriametrics with: # Query average CPU usage rate query: avg(rate(process_cpu_seconds_total)) queryType: query # Actions to take based on the query results actions: - name: create-alert provider: type: keep with: # Create alert if CPU usage exceeds threshold if: "{{ value.1 }} > 0.0040" alert: name: "High CPU Usage" description: "[Single] CPU usage is high on the VM (created from VM metric)" # Set severity based on CPU usage thresholds severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"' # Alert labels for filtering and routing labels: environment: production app: myapp service: api team: devops owner: alice ================================================ FILE: examples/workflows/create_alert_in_keep.yml ================================================ workflow: id: keep-alert-generator name: Keep Alert Generator description: Creates new alerts within the Keep system with customizable parameters and descriptions. triggers: - type: manual actions: - name: create-alert provider: type: keep with: alert: name: "Alert created from the workflow" description: "This alert was created from the create_alert_in_keep.yml example workflow." labels: environment: production ================================================ FILE: examples/workflows/create_alerts_from_elastic.yml ================================================ workflow: id: elastic-basic name: Create alerts from Elasticsearch description: Create alerts from Elastic index (e.g. 
info alerts) triggers: - type: manual steps: - name: query-ack-index provider: type: elastic config: " {{ providers.elastic }} " with: index: keep-alerts-keep query: | { "query_string": { "query": "firing" } } actions: - name: create-alert provider: type: keep with: override_source_with: "elastic" read_only: true fingerprint_fields: - id alert: name: "{{ _source.name }}" status: "{{ _source.status }}" host: "{{ _source.host }}" service: "{{ _source.service }}" ================================================ FILE: examples/workflows/create_alerts_from_mysql.yml ================================================ workflow: id: mysql-alert-sync name: MySQL Alert Sync description: Synchronizes alerts from a MySQL database into Keep, with configurable intervals and data mapping. triggers: # run manually (debugging) - type: manual # run 5 minutes - type: interval value: 300 steps: # get the customer details - name: get-alerts-from-mysql provider: type: mysql config: " {{ providers.mysql-prod }} " with: # run the query, and limit the results to the last run query: "select * from monitoring_system.alerts where ts > '{{ last_workflow_run_time }}'" as_dict: true # create the alerts using Keep provider actions: # Create an alert in Keep based on the query results - name: create-alert provider: type: keep with: # by default, the alert will be created in the "keep" source, this can be adjusted override_source_with: "mysql" # do not try to resolve alerts or smth like that - just sync from the database read_only: true # adjust if needed fingerprint_fields: - id # build the alert payload from the query results alert: name: "{{ message }}" status: "{{ state }}" host: "{{ host }}" service: "{{ service }}" client: "{{ client }}" ================================================ FILE: examples/workflows/create_jira_ticket_upon_alerts.yml ================================================ workflow: id: sentry-to-jira-bridge name: Sentry-to-Jira Bridge description: Creates Jira tickets for 
critical Sentry alerts and notifies relevant teams via Slack. triggers: - type: alert # we want to run this workflow only for Sentry alerts with high severity filters: - key: source value: sentry - key: severity value: critical - key: service value: r"(payments|ftp)" actions: - name: send-slack-message-team-payments # if the alert is on the payments service, slack the payments team if: "'{{ alert.service }}' == 'payments'" provider: type: slack config: " {{ providers.team-payments-slack }} " with: message: | "A new alert from Sentry: Alert: {{ alert.name }} - {{ alert.description }} {{ alert}}" - name: create-jira-ticket-oncall-board if: "'{{ alert.service }}' == 'ftp' and not '{{ alert.ticket_id }}'" provider: type: jira config: " {{ providers.jira }} " with: board_name: "Oncall Board" custom_fields: customfield_10201: "Critical" issuetype: "Task" summary: "{{ alert.name }} - {{ alert.description }} (created by Keep)" description: | "This ticket was created by Keep. Please check the alert details below: {code:json} {{ alert }} {code}" # enrich the alerts enrich_alert: - key: ticket_type value: jira - key: ticket_id value: results.issue.key - key: ticket_url value: results.ticket_url ================================================ FILE: examples/workflows/create_multi_alert_from_vm_metric.yml ================================================ workflow: # Unique identifier for this workflow id: multi-service-cpu-monitor # Display name shown in the UI name: Multi-Service CPU Monitor # Brief description of what this workflow does description: Creates separate alerts for different services based on VictoriaMetrics CPU metrics with customizable thresholds. 
triggers: # This workflow can be triggered manually from the UI - type: manual steps: # Query VictoriaMetrics for CPU metrics - name: victoriametrics-step provider: # Use the VictoriaMetrics provider configuration config: "{{ providers.vm }}" type: victoriametrics with: # Query that returns the sum of CPU usage for each job # Example response: # [ # {'metric': {'job': 'victoriametrics'}, 'value': [1737808021, '0.022633333333333307']}, # {'metric': {'job': 'vmagent'}, 'value': [1737808021, '0.009299999999999998']} # ] query: sum(rate(process_cpu_seconds_total)) by (job) queryType: query actions: # Create an alert in Keep based on the query results - name: create-alert provider: type: keep with: # Only create alert if CPU usage is above threshold if: "{{ value.1 }} > 0.01 " # Alert must persist for 1 minute for: 1m # Use job label to create unique fingerprint for each alert fingerprint_fields: - labels.job alert: # Alert name includes the specific job name: "High CPU Usage on {{ metric.job }}" description: "CPU usage is high on the VM (created from VM metric)" # Set severity based on CPU usage thresholds: # > 0.9 = critical # > 0.7 = warning # else = info severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"' labels: # Job label is required for alert fingerprinting job: "{{ metric.job }}" # Additional context labels environment: production app: myapp service: api team: devops owner: alice ================================================ FILE: examples/workflows/create_service_now_ticket_upon_alerts.yml ================================================ workflow: id: prometheus-grafana-servicenow-integration name: Prometheus/Grafana ServiceNow Integration description: Creates ServiceNow tickets for Prometheus and Grafana alerts with rich context and alert enrichment. 
triggers: - type: alert # create ticket for grafana/prometheus alerts filters: - key: source value: r"(grafana|prometheus)" actions: - name: create-service-now-ticket # if the ticket id is not present in the alert, create a ticket if: "not '{{ alert.ticket_id }}' and {{ alert.annotations.ticket_type }}" provider: type: servicenow config: " {{ providers.servicenow }} " with: table_name: "{{ alert.annotations.ticket_type }}" payload: short_description: "{{ alert.name }} - {{ alert.description }} [created by Keep][fingerprint: {{alert.fingerprint}}]" description: "{{ alert.description }}" # enrich the alert with the ticket number and other details returned from servicenow enrich_alert: - key: ticket_type value: servicenow - key: ticket_id value: results.sys_id - key: ticket_url value: results.link - key: ticket_status value: results.stage - key: table_name value: "{{ alert.annotations.ticket_type }}" ================================================ FILE: examples/workflows/datadog-log-monitor.yml ================================================ workflow: id: datadog-log-monitor name: Datadog Log Monitor description: Monitors Datadog logs for specific services and sends Slack notifications when error conditions are detected. triggers: - type: manual steps: - name: check-error-rate provider: type: datadog config: "{{ providers.datadog }}" with: query: "service:keep-github-app" timeframe: "3d" query_type: "logs" actions: - name: trigger-slack condition: - name: threshold-condition type: threshold value: "keep.len({{ steps.check-error-rate.results.logs }})" compare_to: 0 compare_type: gt provider: type: slack config: "{{ providers.slack-demo }}" with: channel: db-is-down # Message is always mandatory message: > The db is down. Please investigate. 
blocks: - type: section text: type: plain_text text: | Query: {{ steps.check-error-rate.provider_parameters.query }} Timeframe: {{ steps.check-error-rate.provider_parameters.timeframe }} Number of logs: keep.len({{ steps.check-error-rate.results.logs }}) From: {{ steps.check-error-rate.provider_parameters.from }} To: {{ steps.check-error-rate.provider_parameters.to }} providers: db-server-mock: description: Paper DB Server authentication: datadog: authentication: api_key: "{{ env.DATADOG_API_KEY }}" app_key: "{{ env.DATADOG_APP_KEY }}" ================================================ FILE: examples/workflows/db_disk_space_monitor.yml ================================================ # Database disk space is low (<10%) workflow: id: database-disk-space-monitor name: Database Disk Space Monitor description: Monitors database disk space usage and sends detailed Slack notifications with interactive components when space is low. owners: - github-shahargl - slack-talboren services: - db - api # Run every 60 seconds triggers: - type: interval value: 60 steps: - name: db-no-space provider: type: mock config: "{{ providers.db-server-mock }}" with: command: df -h | grep /dev/disk3s1s1 | awk '{ print $5}' # Check the disk space command_output: 91% # Mock actions: - name: trigger-slack condition: - name: threshold-condition type: threshold value: "{{ steps.db-no-space.results }}" compare_to: 90% # Trigger if more than 90% full provider: type: slack config: " {{ providers.slack-demo }} " with: # Message is always mandatory message: > The disk space of {{ providers.db-server-mock.description }} is about to finish Disk space left: {{ steps.db-no-space.results }} blocks: - type: header text: type: plain_text text: "Alert! :alarm_clock:" emoji: true - type: section text: type: mrkdwn text: |- Hello, SRE and Assistant to the Regional Manager Dwight! 
*Michael Scott* wants to know what's going on with the servers in the paper warehouse, there is a critical issue on-going and paper *must be delivered on time*. *This is the alert context:* - type: divider - type: section text: type: mrkdwn text: |- Server *{{ providers.db-server-mock.description }}* :floppy_disk: disk space is at {{ steps.db-no-space.results }} capacity Seems like it prevents further inserts in to the database with some weird exception: 'This is a prank by Jim Halpert' This means that paper production is currently on hold, Dunder Mifflin Paper Company *may lose revenue due to that*. accessory: type: image image_url: https://media.licdn.com/dms/image/C4E03AQGtRDDj3GI4Ig/profile-displayphoto-shrink_800_800/0/1550248958619?e=2147483647&v=beta&t=-AYVwN44CsHUdIcd-7iOHQVVjfhEC0DZydhlmvNvTKo alt_text: jim does dwight - type: divider - type: input element: type: multi_users_select placeholder: type: plain_text text: Select users emoji: true action_id: multi_users_select-action label: type: plain_text text: Select the people for the mission emoji: true - type: divider - type: section text: type: plain_text text: "Some context that can help you:" emoji: true - type: context elements: - type: plain_text text: "DB System Info: Some important context fetched from the DB" emoji: true - type: context elements: - type: image image_url: https://pbs.twimg.com/profile_images/625633822235693056/lNGUneLX_400x400.jpg alt_text: cute cat - type: mrkdwn text: "*Cat* is currently on site, ready to follow your instructions." 
- type: divider - dispatch_action: true type: input element: type: plain_text_input action_id: plain_text_input-action label: type: plain_text text: Please Acknowledge emoji: true - type: actions elements: - type: button style: primary text: type: plain_text text: ":dog: Datadog" emoji: true value: click_me_123 - type: button style: danger text: type: plain_text text: ":sos: Database" emoji: true value: click_me_123 url: https://google.com - type: button text: type: plain_text text: ":book: Playbook" emoji: true value: click_me_123 url: https://google.com providers: db-server-mock: description: Paper DB Server authentication: ================================================ FILE: examples/workflows/discord_basic.yml ================================================ workflow: id: discord-notification-demo name: Discord Notification Demo description: Demonstrates Discord integration with interactive button components for alert notifications. triggers: - type: manual actions: - name: discord provider: type: discord config: "{{ providers.discordtest }}" with: content: Alerta! components: - type: 1 # Action row components: - type: 2 # Button style: 1 # Primary style label: "Click Me!" custom_id: "button_click" ================================================ FILE: examples/workflows/disk_grown_defects_rule.yml ================================================ # Alert description: this alert will trigger if the disk defects is over 50%, 40% or 30%. # Alert breakdown: # 1. Read the disk status from postgres (select * from disk) # 2. For each disk, check if the disk defects is over 50% (major), 40% (medium) or 30% (minor). # 3. If the disk defects is over the threshold, insert a new row to the alert table with the disk name and the disk defects. workflow: id: disk-defect-tracker name: Disk Defect Tracker description: Monitors disk defects and creates tiered alerts in PostgreSQL based on defect percentage thresholds. 
triggers: - type: interval value: 60 steps: - name: check-disk-defects provider: type: postgres config: "{{ providers.postgres-server }}" with: query: "select * from disk" actions: - name: push-alert-to-postgres foreach: "{{steps.check-disk-defects.results}}" condition: - name: threshold-condition type: threshold value: " {{ foreach.value[13] }} " # disk defect is the 13th column compare_to: 50, 40, 30 level: major, medium, minor provider: type: postgres config: "{{ providers.postgres-server }}" with: query: >- INSERT INTO alert (alert_level, alert_message) VALUES ('{{ foreach.level }}', 'Disk defects: {{ foreach.value[13] }} | Disk name: {{ foreach.value[1] }}') providers: postgres-server: description: The postgres server (sql) authentication: username: "{{ env.POSTGRES_USER }}" password: "{{ env.POSTGRES_PASSWORD }}" database: "{{ env.POSTGRES_DATABASE }}" host: "{{ env.POSTGRES_HOST }}" ================================================ FILE: examples/workflows/eks_advanced.yml ================================================ workflow: id: eks-deployment-scaling-manager name: EKS Deployment Scaling Manager description: Manages EKS cluster operations including pod monitoring and deployment scaling. Retrieves pod status, scales nginx deployment, and provides detailed status reporting. 
triggers: - type: manual steps: # get all pods - name: get-pods provider: type: eks config: "{{ providers.eks }}" with: command_type: get_pods # get specific deployment info - name: get-deployment-info provider: type: eks config: "{{ providers.eks }}" with: command_type: get_deployment namespace: default deployment_name: nginx-test # scale up deployment - name: scale-up provider: type: eks config: "{{ providers.eks }}" with: command_type: scale_deployment namespace: default deployment_name: nginx-test replicas: 4 # get pods after scaling - name: get-pods-after-scale provider: type: eks config: "{{ providers.eks }}" with: command_type: get_pods namespace: default actions: - name: echo-all-pods foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod name: {{ foreach.value.metadata.name }} || Namespace: {{ foreach.value.metadata.namespace }} || Status: {{ foreach.value.status.phase }}" - name: echo-deployment-info provider: type: console with: message: "Deployment {{ steps.get-deployment-info.results.metadata.name }} has {{ steps.get-deployment-info.results.status.replicas }} replicas" - name: echo-scaled-pods foreach: "{{ steps.get-pods-after-scale.results }}" provider: type: console with: message: "After scaling - Pod name: {{ foreach.value.metadata.name }} || Status: {{ foreach.value.status.phase }}" ================================================ FILE: examples/workflows/eks_basic.yml ================================================ workflow: id: eks-pod-status-monitor name: EKS Pod Status Monitor description: Monitors and reports the status of all pods in an EKS cluster, including their names, namespaces, and current phases. 
triggers: - type: manual steps: # get all pods - name: get-pods provider: type: eks config: "{{ providers.eks }}" with: command_type: get_pods actions: - name: echo-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod name: {{ foreach.value.metadata.name }} || Namespace: {{ foreach.value.metadata.namespace }} || Status: {{ foreach.value.status.phase }}" ================================================ FILE: examples/workflows/elastic_basic.yml ================================================ workflow: id: elastic-basic name: Simple query from Elasticsearch description: Querying alerts from Keep's elastic index (e.g. info alerts) triggers: - type: manual steps: - name: query-ack-index provider: type: elastic config: " {{ providers.elastic }} " with: index: keep-alerts-keep query: | { "query_string": { "query": "info" } } ================================================ FILE: examples/workflows/elastic_enrich_example.yml ================================================ # if no acknowledgement has been received (updated in index) for x (from config index) time, I want to escalate it to the next level of people workflow: id: alert-acknowledgment-escalator name: Alert Acknowledgment Escalator description: Monitors unacknowledged alerts in Elasticsearch and automatically escalates them based on configured thresholds. Integrates with people and configuration indices for smart escalation routing.
triggers: # run every minute - type: interval value: 1m steps: # first, query the ack index to check if there are any alerts that have not been acknowledged - name: query-ack-index provider: type: elastic config: " {{ providers.elastic }} " with: index: your_ack_index query: | { "query": { "bool": { "must": [ { "match": { "acknowledged": false } } ] } } } - name: query-config-index provider: type: elastic config: " {{ providers.elastic }} " with: index: your_config_index query: | { "query": { "bool": { "must": [ { "match": { "config": true } } ] } } } - name: query-people-index provider: type: elastic config: " {{ providers.elastic }} " with: index: your_people_index query: | { "query": { "bool": { "must": [ { "match": { "people": true } } ] } } } # now, we have the results from the ack index, config index, and people index actions: - name: escalate-if-needed # if there are any alerts that have not been acknowledged if: "{{ query-ack-index.hits.total.value }} > 0" provider: type: slack # or email or whatever you want config: " {{ providers.slack }} " with: message: | "An unacknowledged alert has been found: {{ query-ack-index.hits.hits }} {{ query-config-index.hits.hits }} {{ query-people-index.hits.hits }}" ================================================ FILE: examples/workflows/enrich_using_structured_output_from_deepseek.yaml ================================================ workflow: id: deepseek-alert-enrichment name: DeepSeek Alert Enrichment description: Enriches Prometheus alerts using DeepSeek Coder to determine environment and customer impact information through structured JSON output. triggers: - type: alert filters: - key: source value: prometheus steps: - name: get-enrichments provider: config: "{{ providers.my_deepseek }}" type: deepseek with: prompt: | You received such an alert {{alert}}, generate missing fields. Environment could be \"production\", \"staging\", \"development\".
EXAMPLE JSON OUTPUT: { \"environment\": \"production\", \"impacted_customer_name\": \"Acme Corporation\" } model: "deepseek-coder-33b-instruct" structured_output_format: # We limit what model could return type: json_object actions: - name: enrich-alert provider: type: mock with: enrich_alert: - key: environment value: "{{ steps.get-enrichments.results.response.environment }}" - key: impacted_customer_name value: "{{ steps.get-enrichments.results.response.impacted_customer_name }}" ================================================ FILE: examples/workflows/enrich_using_structured_output_from_openai.yaml ================================================ workflow: id: openai-alert-enrichment name: OpenAI Alert Enrichment description: Enriches Prometheus alerts using GPT-4 structured output to determine environment and impacted customer information with strict schema validation. triggers: - type: alert filters: - key: source value: prometheus steps: - name: get-enrichments provider: config: "{{ providers.my_openai }}" type: openai # Could be also LiteLLM with: prompt: "You received such an alert {{alert}}, generate missing fields." model: "gpt-4o-mini" # This model supports structured output structured_output_format: # We limit what model could return type: json_schema json_schema: name: missing_fields schema: type: object properties: environment: type: string enum: - "production" - "pre-prod" - "debug" description: "Be pessimistic, return pre-prod or production only if you see evidence in the alert body." impacted_customer_name: type: string description: "Return undefined if you are not sure about the customer." 
required: ["environment", "impacted_customer_name"] additionalProperties: false strict: true actions: - name: enrich-alert provider: type: mock with: enrich_alert: - key: environment value: "{{ steps.get-enrichments.results.response.environment }}" - key: impacted_customer_name value: "{{ steps.get-enrichments.results.response.impacted_customer_name }}" ================================================ FILE: examples/workflows/enrich_using_structured_output_from_vllm_qwen.yaml ================================================ workflow: id: vllm-qwen-alert-enrichment name: vLLM Qwen Alert Enrichment description: Enriches Prometheus alerts using vLLM-hosted Qwen model to automatically determine environment type and impacted customer details. triggers: - type: alert filters: - key: source value: prometheus steps: - name: get-enrichments provider: config: "{{ providers.my_vllm }}" type: vllm with: prompt: "You received such an alert {{alert}}, generate missing fields." model: "Qwen/Qwen1.5-1.8B-Chat" # This model supports structured output structured_output_format: # We limit what model could return type: object properties: environment: type: string enum: - production - debug - pre-prod impacted_customer_name: type: string required: - environment - impacted_customer_name actions: - name: enrich-alert provider: type: mock with: enrich_alert: - key: environment value: "{{ steps.get-enrichments.results.response.environment }}" - key: impacted_customer_name value: "{{ steps.get-enrichments.results.response.impacted_customer_name }}" ================================================ FILE: examples/workflows/failed-to-login-workflow.yml ================================================ workflow: id: tiered-login-failure-response name: Tiered Login Failure Response description: Handles user login failures by querying customer tier from BigQuery and routes notifications to appropriate channels - OpsGenie for enterprise customers and Slack for all tiers. 
triggers: - type: alert filters: - key: name value: "User failed to login" steps: - name: get-customer-tier-by-id provider: type: bigquery config: "{{ providers.bigquery-prod }}" with: query: "SELECT customer_name, tier FROM `bigquery-production.prod-db.customers` WHERE customer_id = {{ alert.customer_id }} LIMIT 1" actions: # for enterprise customer, open an incident in opsgenie - name: opsgenie-alert condition: - name: enterprise-tier type: assert assert: "{{ steps.get-customer-tier-by-id.result.tier }} == 'enterprise'" provider: type: opsgenie config: " {{ providers.opsgenie-prod }} " with: message: "User of customer {{ steps.get-customer-tier-by-id.result.customer_name }} failed to login!" # for every customer, send a slack message - name: trigger-slack provider: type: slack config: " {{ providers.slack-prod }} " with: message: "User of customer {{ steps.get-customer-tier-by-id.result.customer_name }} failed to login!" ================================================ FILE: examples/workflows/flashduty_example.yml ================================================ workflow: id: flashduty-incident-notifier name: FlashDuty Incident Notifier description: Manages incident notifications in FlashDuty with customizable event statuses, labels, and environment tracking. 
disabled: false triggers: - type: incident events: - created - updated - deleted consts: {} owners: [] services: [] steps: [] actions: - name: flashduty-action provider: type: flashduty config: "{{ providers.default-flashduty }}" with: title: test title description: test description event_status: Info alert_key: 611eed6614ec labels: service: flashduty environment: dev ================================================ FILE: examples/workflows/fluxcd_example.yml ================================================ workflow: id: fluxcd-example name: "FluxCD Resource Monitor" description: "Example workflow that retrieves Flux CD resources and creates alerts for failed deployments" triggers: - type: interval value: 1800 # 30 minutes in seconds steps: - name: get-fluxcd-resources provider: type: fluxcd config: "{{ providers.fluxcd }}" with: kubeconfig: "{{ env.KUBECONFIG }}" namespace: "flux-system" vars: fluxcd_resources: "{{ steps.get-fluxcd-resources.results }}" - name: check-for-failed-deployments provider: type: console with: message: | Found {{ vars.fluxcd_resources.kustomizations | length }} Kustomizations and {{ vars.fluxcd_resources.helm_releases | length }} HelmReleases - name: create-alerts-for-failed-kustomizations foreach: "{{ vars.fluxcd_resources.kustomizations }}" if: "{{ item.status.conditions[0].status == 'False' }}" provider: type: keep with: alert_name: "FluxCD Kustomization {{ item.metadata.name }} failed" alert_description: "Kustomization {{ item.metadata.name }} in namespace {{ item.metadata.namespace }} failed with message: {{ item.status.conditions[0].message }}" alert_severity: "critical" alert_fingerprint: "fluxcd-kustomization-{{ item.metadata.name }}-{{ item.metadata.namespace }}" alert_source: "fluxcd" alert_labels: namespace: "{{ item.metadata.namespace }}" name: "{{ item.metadata.name }}" type: "kustomization" - name: create-alerts-for-failed-helmreleases foreach: "{{ vars.fluxcd_resources.helm_releases }}" if: "{{ 
item.status.conditions[0].status == 'False' }}" provider: type: keep with: alert_name: "FluxCD HelmRelease {{ item.metadata.name }} failed" alert_description: "HelmRelease {{ item.metadata.name }} in namespace {{ item.metadata.namespace }} failed with message: {{ item.status.conditions[0].message }}" alert_severity: "critical" alert_fingerprint: "fluxcd-helmrelease-{{ item.metadata.name }}-{{ item.metadata.namespace }}" alert_source: "fluxcd" alert_labels: namespace: "{{ item.metadata.namespace }}" name: "{{ item.metadata.name }}" type: "helmrelease" ================================================ FILE: examples/workflows/gcp_logging_open_ai.yaml ================================================ workflow: id: gcp-log-analysis-ai name: GCP Log Analysis with AI description: Analyzes Cloud Run errors using OpenAI to provide root cause analysis from GCP logs, including confidence scoring and relevant log entries. disabled: false triggers: - type: manual - filters: - key: source value: gcpmonitoring type: alert consts: {} owners: [] services: [] steps: - name: gcpmonitoring-step provider: config: "{{ providers.gcp }}" type: gcpmonitoring with: as_json: false filter: resource.type = "cloud_run_revision" {{alert.traceId}} page_size: 1000 raw: false timedelta_in_days: 1 - name: openai-step provider: config: "{{ providers.openai }}" type: openai with: prompt: | You are a very talented engineer that receives context from GCP logs about an endpoint that returned 500 status code and reports back the root cause analysis. Here is the context: keep.json_dumps({{steps.gcpmonitoring-step.results}}) (it is a JSON list of log entries from GCP Logging). In your answer, also provide the log entry that made you conclude the root cause and specify what your certainty level is that it is the root cause. 
(between 1-10, where 1 is low and 10 is high) actions: - name: slack-action provider: config: "{{ providers.slack }}" type: slack with: message: "{{steps.openai-step.results}}" ================================================ FILE: examples/workflows/gke.yml ================================================ workflow: id: gke-pod-status-monitor name: GKE Pod Status Monitor description: Monitors and displays status information for all pods in a Google Kubernetes Engine cluster, including pod names, namespaces, and phases. triggers: - type: manual steps: # get all pods - name: get-pods provider: type: gke config: "{{ providers.GKE }}" with: command_type: get_pods actions: - name: echo-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod name: {{ foreach.value.metadata.name }} || Namespace: {{ foreach.value.metadata.namespace }} || Status: {{ foreach.value.status.phase }}" ================================================ FILE: examples/workflows/http_enrich.yml ================================================ workflow: id: http_enrich name: Enrich alert with HTTP description: Enrich alert with HTTP Action, using a public free API disabled: false triggers: - type: alert filters: - key: source value: prometheus consts: {} owners: [] services: [] steps: [] actions: - name: http-action provider: type: http config: "{{ providers.default-http }}" with: url: https://api.restful-api.dev/objects/7 method: GET enrich_alert: - key: computerName value: results.body.name ================================================ FILE: examples/workflows/ifelse.yml ================================================ workflow: id: alert-routing-policy name: Alert Routing Policy Manager description: Routes alerts to appropriate channels based on multiple criteria including business hours, team ownership, environment, and monitor type with conditional flow control. 
triggers: - type: alert actions: - name: business-hours-check if: "keep.is_business_hours(timezone='America/New_York')" # stop the workflow if it's business hours continue: false provider: type: console with: message: "Alert during business hours, exiting" - name: infra-prod-slack if: "'{{ alert.team }}' == 'infra' and '{{ alert.env }}' == 'prod'" provider: type: slack config: "{{ providers.slack-prod }}" with: channel: prod-infra-alerts message: | "Infrastructure Production Alert Team: {{ alert.team }} Environment: {{ alert.env }} Description: {{ alert.description }}" - name: http-api-errors-slack if: "'{{ alert.monitor_name }}' == 'Http API Errors'" provider: type: slack config: "{{ providers.slack-prod }}" with: channel: backend-team-alerts message: | "HTTP API Error Alert Monitor: {{ alert.monitor_name }} Description: {{ alert.description }}" # exit after sending http api error alert continue: false - name: backend-staging-pagerduty if: "'{{ alert.team }}'== 'backend' and '{{ alert.env }}' == 'staging'" provider: type: console with: severity: low message: | "Backend Staging Alert Team: {{ alert.team }} Environment: {{ alert.env }} Description: {{ alert.description }}" # Exit after sending staging alert continue: false ================================================ FILE: examples/workflows/ilert-incident-upon-alert.yaml ================================================ workflow: id: ilert-incident-creator name: iLert Incident Creator description: Creates structured incidents in iLert from Keep alerts, including service impact assessment and investigation status tracking. triggers: - filters: - key: source value: keep type: alert owners: [] services: [] steps: [] actions: - name: ilert-action provider: config: "{{ providers.ilert-default }}" type: ilert with: affectedServices: - impact: OPERATIONAL service: id: 339743 message: A mock incident created with Keep! 
status: INVESTIGATING summary: Keep Incident {{ alert.name }} ================================================ FILE: examples/workflows/incident-enrich.yaml ================================================ workflow: id: incident-metadata-enricher name: Incident Metadata Enricher description: Enriches incidents with additional metadata including environment, incident IDs, URLs, and provider information while logging incident details. disabled: false triggers: - type: manual - events: - created - updated type: incident consts: {} owners: [] services: [] steps: [] actions: - name: console-log provider: type: console with: message: "Incident name: {{ incident.user_generated_name }} | severity: {{ incident.severity }}" enrich_incident: - key: environment value: "prod-de-prod" - key: incident_id value: "1234567890" - key: incident_url value: "https://keephq.dev/incident/1234567890" - key: incident_provider value: "jira" ================================================ FILE: examples/workflows/incident-tier-escalation.yml ================================================ workflow: id: incident-tier-escalation name: Incident Tier Escalation description: Manages incident escalation tiers based on alert conditions, automatically adjusting notification tiers and sending appropriate Slack notifications for each level. 
triggers: # when an incident is created or updated with a new alert - type: incident events: - created - updated actions: - name: send-slack-message-tier-0 # send tier0 if this is a new incident (no tier set) or if the incident is tier0 but the alert is alert2 if: "{{ !incident.current_tier || incident.current_tier == 0 && alert.name == 'alert2' }}" provider: type: slack config: "{{ providers.slack }}" with: message: | "Incident created: {{ incident.name }} - {{ incident.description }} Tier: 0 Alert: {{ alert.name }} - {{ alert.description }} Alert details: {{ alert }}" # enrich the incident with the current tier enrich_incident: - key: current_tier value: 0 - name: send-slack-message-tier-1 if: "{{ incident.current_tier == 0 && alert.name == 'alert1' }}" provider: type: slack config: "{{ providers.slack }}" with: message: | "Incident updated: {{ incident.name }} - {{ incident.description }} Tier: 1 Alert: {{ alert.name }} - {{ alert.description }} Alert details: {{ alert }}" enrich_incident: - key: current_tier value: 1 ================================================ FILE: examples/workflows/incident_example.yml ================================================ workflow: id: incident-echo-monitor name: Incident Echo Monitor description: Monitors incident updates and creations, providing basic console logging for incident tracking and debugging. triggers: - type: incident events: - updated - created actions: - name: just-echo provider: type: console with: message: "Hey there! I am an incident!" ================================================ FILE: examples/workflows/inputs_example.yml ================================================ workflow: id: input-example name: Input Example description: Simple workflow demonstrating input functionality with customizable messages.
triggers: - type: manual inputs: - name: message description: The message to log to the console type: string default: "Hey" - name: nodefault description: An example input with no default type: string - name: boolexample description: Whether to log the message type: boolean default: true - name: choiceexample description: The choice to make type: choice default: "option1" options: - option1 - option2 - option3 actions: - name: echo provider: type: console with: message: | "This is my input message: {{ inputs.message }} This is my input boolean: {{ inputs.boolexample }} This is my input choice: {{ inputs.choiceexample }}" ================================================ FILE: examples/workflows/jira-create-ticket-on-alert.yml ================================================ workflow: id: jira-create-ticket-on-alert name: Create Jira Ticket on Alert description: Create Jira ticket when alert fires disabled: false triggers: - type: alert cel: status == "firing" actions: - name: jira-action if: "not '{{ alert.ticket_id }}'" provider: type: jira config: "{{ providers.JiraCloud }}" with: board_name: YOUR_BOARD_NAME # Change this to your board name issue_type: Task # Or Bug, Story, etc. summary: "{{ alert.name }} - {{ alert.description }}" description: | "This ticket was created automatically by Keep.
Alert Details: {code:json} {{ alert }} {code}" enrich_alert: - key: ticket_type value: jira - key: ticket_id value: results.issue.key - key: ticket_url value: results.ticket_url ================================================ FILE: examples/workflows/jira-transition-on-resolved.yml ================================================ workflow: id: jira-transition-on-resolved name: Transition Jira Ticket to Done description: Close Jira ticket when alert is resolved disabled: false triggers: - type: alert cel: status == "resolved" actions: - name: jira-action provider: type: jira config: "{{ providers.JiraCloud }}" with: issue_id: "{{ alert.ticket_id }}" summary: "{{ alert.name }} - {{ alert.description }} (resolved)" description: | "Alert has been resolved automatically by Keep. Resolved at: {{ alert.lastReceived }} Original Alert Details: {code:json} {{ alert }} {code}" transition_to: Done # Change to your workflow's status name ================================================ FILE: examples/workflows/jira_on_prem.yml ================================================ workflow: id: jira-onprem-incident-creator name: Jira On-Prem Incident Creator description: Creates standardized incidents in on-premises Jira with customizable fields, labels, and priorities for SRE team tracking. triggers: - type: manual owners: [] services: [] steps: [] actions: - name: jiraonprem-action provider: config: "{{ providers.jira }}" type: jiraonprem with: board_name: SA custom_fields: "" description: test issue_type: Incident labels: - "SRE_Team" priority: Low project_key: SA summary: test ================================================ FILE: examples/workflows/monday_create_pulse.yml ================================================ workflow: id: monday-pulse-creator name: Monday.com Pulse Creator description: Creates new pulses (items) in Monday.com boards with customizable column values and group assignments. 
triggers: - type: manual actions: - name: monday provider: type: monday config: "{{ providers.monday }}" with: # Open the board in monday.com web app. # Hover over the board name in the side panel, click on the three dots that appear, and click on ID to copy the board ID. board_id: 1956384489 # Hover over the group name in the board, click on the three dots that appear, and click on Group ID to copy the group ID. group_id: "topics" # Item Name is the name of the pulse you want to add. item_name: "Test" column_values: # Specify the column IDs and their corresponding values for the new item/pulse. # Hover over the column name in the board, click on the three dots that appear, and click on Column ID to copy the column ID. # The Key is the column ID and the Value is the value you want to set for the column. - text_mkm77x3p: "helo" # Here text_mkm77x3p is the column ID and helo is the value. - text_1_mkm7x2ep: "10" # Here text_1_mkm7x2ep is the column ID and 10 is the value. ================================================ FILE: examples/workflows/multi-condition-cel.yml ================================================ workflow: id: multi-condition-monitor-cel name: Multi-Condition Monitor (CEL) description: Monitors alerts with multiple conditions using CEL filters. triggers: - type: alert cel: source.contains("prometheus") && severity == "critical" && environment == "production" actions: - name: notify provider: type: console with: message: "Critical production alert from Prometheus: {{ alert.name }}" ================================================ FILE: examples/workflows/mustache-paths-example.yml ================================================ workflow: id: mustache-path-extractor name: Mustache Path Extractor description: Demonstrates extraction of values from nested dictionaries and lists using Mustache templating with Python and console output. 
disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: - name: step-with-dict provider: config: "{{ providers.default-python }}" type: python with: code: "{'hello': 'world', 'nested': {'bye': 'bye'}, 'nested_list': ['a','b','c', {'in': 'list'}]}" - name: step-with-list provider: config: "{{ providers.default-python }}" type: python with: code: "[{'hello': 'world', 'nested': {'bye': 'bye'}, 'nested_list': ['a','b','c', {'in': 'list'}]}]" - name: console-step-with-dict provider: type: console with: message: "{{ steps.step-with-dict.results.hello }}" - name: console-step-with-list provider: type: console with: message: "{{ steps.step-with-list.results.0.nested.bye }}" actions: [] ================================================ FILE: examples/workflows/new-auth0-users-monitor.yml ================================================ # Alert when there are new Auth0 users workflow: id: new-auth0-users-monitor name: New Auth0 Users Monitor description: Tracks new Auth0 user signups and sends Slack notifications with detailed user information, maintaining state between runs. triggers: - type: interval value: 3600 # every hour steps: - name: get-auth0-users provider: type: auth0.logs config: "{{ providers.auth0 }}" with: log_type: ss previous_users: "{{ state.new-auth0-users.-1.alert_context.alert_steps_context.get-auth0-users.results.users }}" # state.alert-id.-1 for last run actions: - name: trigger-slack condition: - name: assert-condition type: assert assert: "{{ steps.get-auth0-users.results.new_users_count }} == 0" # if there are more than 0 new users, trigger the action provider: type: slack config: " {{ providers.slack-demo }} " with: blocks: - type: section text: type: plain_text text: There are new keep.len({{ steps.get-auth0-users.results.new_users }}) users! 
emoji: true - type: section text: type: plain_text text: |- {{#steps.get-auth0-users.results.new_users}} - {{user_name}} {{/steps.get-auth0-users.results.new_users}} emoji: true ================================================ FILE: examples/workflows/new_github_stars.yml ================================================ workflow: id: github-star-tracker name: GitHub Star Tracker description: Monitors new GitHub stars for the Keep repository and sends Slack notifications with stargazer details and timestamps. triggers: - type: manual - type: interval value: 300 steps: - name: get-github-stars provider: config: "{{ providers.github }}" type: github.stars with: previous_stars_count: default: 0 key: "{{ last_workflow_results.get-github-stars.0.stars }}" last_stargazer: default: "" key: "{{ last_workflow_results.get-github-stars.0.last_stargazer }}" repository: keephq/keep actions: - condition: - assert: "{{ steps.get-github-stars.results.new_stargazers_count }} > 0" name: assert-condition type: assert name: trigger-slack provider: config: "{{ providers.slack-demo }}" type: slack with: blocks: - text: emoji: true text: There are new keep.len({{ steps.get-github-stars.results.new_stargazers}}) stargazers for keephq/keep type: plain_text type: section - text: emoji: true text: "{{#steps.get-github-stars.results.new_stargazers}} - {{username}} at {{starred_at}} {{/steps.get-github-stars.results.new_stargazers}}" type: plain_text type: section channel: "C06N0KXXXX" ================================================ FILE: examples/workflows/notify-new-trello-card.yml ================================================ # A new trello card was created workflow: id: notify-new-trello-card name: Notify on new Trello card description: Send a slack notification when a new trello card is created triggers: - type: interval value: 60 steps: - name: trello-cards provider: type: trello config: "{{ providers.trello-provider }}" with: board_id: hIjQQX9S filter: "createCard" condition: - name: 
assert-condition type: assert assert: "{{ state.notify-new-trello-card.-1.alert_context.alert_steps_context.trello-cards.results.number_of_cards }} >= {{steps.trello-cards.results.number_of_cards }}" actions: - name: trigger-slack provider: type: slack config: "{{ providers.slack-demo }}" with: channel: some-channel-that-youll-decide-later # Message is always mandatory message: > A new card was created ================================================ FILE: examples/workflows/ntfy_basic.yml ================================================ workflow: id: ntfy-notification-sender name: Ntfy Notification Sender description: Sends notifications to Ntfy topics with customizable messages for basic alerting and communication. triggers: - type: manual actions: - name: ntfy provider: type: ntfy config: "{{ providers.ntfy }}" with: message: "test-message" topic: "test-topic" ================================================ FILE: examples/workflows/opensearchserverless_basic.yml ================================================ workflow: id: opensearch-serverless-create-query name: OSS Create Query Docs description: Retrieves all the documents from index keep, and uploads a document to opensearch in index keep. 
disabled: false triggers: - type: manual steps: # This step will fail if there is no index called keep - name: query-index provider: type: opensearchserverless config: "{{ providers.opensearchserverless }}" with: query: query: match_all: {} index: keep actions: - name: create-doc provider: type: opensearchserverless config: "{{ providers.opensearchserverless }}" with: index: keep document: message: Keep test doc doc_id: doc_1 ================================================ FILE: examples/workflows/openshift_basic.yml ================================================ workflow: id: openshift-basic-monitoring name: OpenShift Basic Monitoring description: Simple OpenShift monitoring workflow that gets cluster status and pod information triggers: - type: manual steps: # Get all OpenShift projects - name: get-projects provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_projects # Get all pods - name: get-pods provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_pods # Get OpenShift routes - name: get-routes provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_routes actions: # Display cluster summary - name: display-cluster-summary provider: type: console with: message: | 🔍 OpenShift Cluster Summary: - Projects: {{ steps.get-projects.results | length }} - Total Pods: {{ steps.get-pods.results | length }} - Routes: {{ steps.get-routes.results | length }} # Show pod status for each namespace - name: display-pod-status foreach: "{{ steps.get-pods.results }}" provider: type: console with: message: "Pod: {{ foreach.value.metadata.name }} | Namespace: {{ foreach.value.metadata.namespace }} | Status: {{ foreach.value.status.phase }}" # List all projects - name: list-projects foreach: "{{ steps.get-projects.results }}" provider: type: console with: message: "Project: {{ foreach.value.metadata.name }} | Status: {{ foreach.value.status.phase | default('Active') }}" 
================================================ FILE: examples/workflows/openshift_monitoring_and_remediation.yml ================================================ workflow: id: openshift-monitoring-and-remediation name: OpenShift Monitoring and Remediation description: | Comprehensive OpenShift monitoring workflow that demonstrates: - Getting cluster information (projects, pods, routes, deployment configs) - Monitoring pod health and events - Automatic remediation actions (restart pods, scale deployments) - Alert-driven workflows for OpenShift clusters triggers: - type: manual - type: alert filters: - key: source value: openshift - key: severity value: critical steps: # Get all OpenShift projects - name: get-projects provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_projects # Get all pods across namespaces - name: get-all-pods provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_pods # Get deployment configs - name: get-deployment-configs provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_deploymentconfigs # Get routes - name: get-routes provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_routes # Get node pressure conditions - name: get-node-pressure provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_node_pressure # Get events for a specific namespace (if alert provides namespace) - name: get-events if: "{{ alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_events namespace: "{{ alert.namespace }}" # Get pod logs for failing pods (if alert provides pod name) - name: get-pod-logs if: "{{ alert.pod_name and alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_logs namespace: "{{ alert.namespace }}" pod_name: "{{ alert.pod_name }}" tail_lines: 50 actions: # Report cluster overview - name: 
report-cluster-overview provider: type: console with: message: | 🔍 OpenShift Cluster Overview: - Projects: {{ steps.get-projects.results | length }} - Total Pods: {{ steps.get-all-pods.results | length }} - Deployment Configs: {{ steps.get-deployment-configs.results | length }} - Routes: {{ steps.get-routes.results | length }} - Node Pressure Issues: {{ steps.get-node-pressure.results | selectattr('conditions', 'ne', []) | list | length }} # Alert on failing pods - name: alert-failing-pods foreach: "{{ steps.get-all-pods.results | selectattr('status.phase', 'ne', 'Running') | selectattr('status.phase', 'ne', 'Succeeded') }}" provider: type: console with: message: | ⚠️ Pod Issue Detected: - Pod: {{ foreach.value.metadata.name }} - Namespace: {{ foreach.value.metadata.namespace }} - Status: {{ foreach.value.status.phase }} - Node: {{ foreach.value.spec.nodeName }} # Restart failing pods automatically (CrashLoopBackOff, Failed) - name: restart-failed-pods foreach: "{{ steps.get-all-pods.results | selectattr('status.phase', 'in', ['CrashLoopBackOff', 'Failed']) }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: restart_pod namespace: "{{ foreach.value.metadata.namespace }}" pod_name: "{{ foreach.value.metadata.name }}" message: "Auto-restarting failed pod {{ foreach.value.metadata.name }}" # Scale up deployment if alert indicates high load - name: scale-deployment-on-high-load if: "{{ alert.deployment_name and alert.namespace and alert.scale_up }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: scale_deployment namespace: "{{ alert.namespace }}" deployment_name: "{{ alert.deployment_name }}" replicas: "{{ alert.target_replicas | default(3) }}" # Scale up deployment config if specified - name: scale-deploymentconfig-on-demand if: "{{ alert.deploymentconfig_name and alert.namespace and alert.scale_up }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: scale_deploymentconfig namespace: 
"{{ alert.namespace }}" deploymentconfig_name: "{{ alert.deploymentconfig_name }}" replicas: "{{ alert.target_replicas | default(2) }}" # Restart deployment on critical alerts - name: restart-deployment-on-critical-alert if: "{{ alert.severity == 'critical' and alert.deployment_name and alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: rollout_restart kind: "deployment" name: "{{ alert.deployment_name }}" namespace: "{{ alert.namespace }}" # Restart deployment config on critical alerts - name: restart-deploymentconfig-on-critical-alert if: "{{ alert.severity == 'critical' and alert.deploymentconfig_name and alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: rollout_restart kind: "deploymentconfig" name: "{{ alert.deploymentconfig_name }}" namespace: "{{ alert.namespace }}" # Send notification with detailed information - name: send-notification if: "{{ alert }}" provider: type: slack config: "{{ providers.slack }}" with: message: | 🚨 OpenShift Alert: {{ alert.name }} 📊 Cluster Status: • Projects: {{ steps.get-projects.results | length }} • Total Pods: {{ steps.get-all-pods.results | length }} • Failing Pods: {{ steps.get-all-pods.results | selectattr('status.phase', 'ne', 'Running') | selectattr('status.phase', 'ne', 'Succeeded') | list | length }} 🔍 Alert Details: • Severity: {{ alert.severity }} • Source: {{ alert.source }} • Namespace: {{ alert.namespace | default('N/A') }} • Pod: {{ alert.pod_name | default('N/A') }} 🛠️ Actions Taken: {% if alert.deployment_name and alert.scale_up %}• Scaled deployment {{ alert.deployment_name }} to {{ alert.target_replicas | default(3) }} replicas{% endif %} {% if alert.deploymentconfig_name and alert.scale_up %}• Scaled DeploymentConfig {{ alert.deploymentconfig_name }} to {{ alert.target_replicas | default(2) }} replicas{% endif %} {% if alert.severity == 'critical' and (alert.deployment_name or alert.deploymentconfig_name) %}• 
Performed rollout restart{% endif %} # Example alert payloads to test this workflow: # Manual trigger for cluster overview: # No additional data needed # High load scaling scenario: # { # "name": "High CPU Usage", # "severity": "warning", # "source": "openshift", # "namespace": "production", # "deployment_name": "web-app", # "scale_up": true, # "target_replicas": 5 # } # Critical pod failure: # { # "name": "Pod CrashLoopBackOff", # "severity": "critical", # "source": "openshift", # "namespace": "production", # "pod_name": "web-app-123-abc", # "deployment_name": "web-app" # } # DeploymentConfig scaling: # { # "name": "Scale DeploymentConfig", # "severity": "warning", # "source": "openshift", # "namespace": "staging", # "deploymentconfig_name": "api-server", # "scale_up": true, # "target_replicas": 3 # } ================================================ FILE: examples/workflows/openshift_pod_restart.yml ================================================ workflow: id: openshift-pod-restart-remediation name: OpenShift Pod Restart Remediation description: Automatically restart failing pods and scale deployments based on alerts or manual triggers triggers: - type: manual - type: alert filters: - key: source value: openshift - key: pod_status value: CrashLoopBackOff steps: # Get pod details for a specific namespace - name: get-namespace-pods if: "{{ alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_pods namespace: "{{ alert.namespace }}" # Get pod logs if specific pod is mentioned - name: get-failing-pod-logs if: "{{ alert.pod_name and alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: command_type: get_logs namespace: "{{ alert.namespace }}" pod_name: "{{ alert.pod_name }}" tail_lines: 100 # Get events for the namespace to understand issues - name: get-namespace-events if: "{{ alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: command_type: 
get_events namespace: "{{ alert.namespace }}" actions: # Restart specific pod if mentioned in alert - name: restart-specific-pod if: "{{ alert.pod_name and alert.namespace }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: restart_pod namespace: "{{ alert.namespace }}" pod_name: "{{ alert.pod_name }}" message: "Restarting pod due to {{ alert.pod_status | default('failure') }}" # Scale deployment if replica count is specified - name: scale-deployment if: "{{ alert.deployment_name and alert.namespace and alert.replicas }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: scale_deployment namespace: "{{ alert.namespace }}" deployment_name: "{{ alert.deployment_name }}" replicas: "{{ alert.replicas }}" # Scale deployment config if specified - name: scale-deploymentconfig if: "{{ alert.deploymentconfig_name and alert.namespace and alert.replicas }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: scale_deploymentconfig namespace: "{{ alert.namespace }}" deploymentconfig_name: "{{ alert.deploymentconfig_name }}" replicas: "{{ alert.replicas }}" # Rollout restart deployment - name: rollout-restart-deployment if: "{{ alert.deployment_name and alert.namespace and alert.restart_deployment }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: rollout_restart kind: "deployment" name: "{{ alert.deployment_name }}" namespace: "{{ alert.namespace }}" # Rollout restart deployment config - name: rollout-restart-deploymentconfig if: "{{ alert.deploymentconfig_name and alert.namespace and alert.restart_deployment }}" provider: type: openshift config: "{{ providers.openshift }}" with: action: rollout_restart kind: "deploymentconfig" name: "{{ alert.deploymentconfig_name }}" namespace: "{{ alert.namespace }}" # Report remediation actions taken - name: report-actions provider: type: console with: message: | 🔧 OpenShift Remediation Actions Completed: {% if alert.pod_name %} - 
Restarted pod: {{ alert.pod_name }} in {{ alert.namespace }} {% endif %} {% if alert.deployment_name and alert.replicas %} - Scaled deployment {{ alert.deployment_name }} to {{ alert.replicas }} replicas {% endif %} {% if alert.deploymentconfig_name and alert.replicas %} - Scaled DeploymentConfig {{ alert.deploymentconfig_name }} to {{ alert.replicas }} replicas {% endif %} {% if alert.restart_deployment %} - Performed rollout restart on {{ alert.deployment_name or alert.deploymentconfig_name }} {% endif %} # Example alert payloads: # Restart specific pod: # { # "source": "openshift", # "namespace": "production", # "pod_name": "web-app-789-xyz", # "pod_status": "CrashLoopBackOff" # } # Scale deployment: # { # "source": "openshift", # "namespace": "production", # "deployment_name": "web-app", # "replicas": 5 # } # Scale deployment config: # { # "source": "openshift", # "namespace": "staging", # "deploymentconfig_name": "api-server", # "replicas": 3 # } # Rollout restart deployment: # { # "source": "openshift", # "namespace": "production", # "deployment_name": "web-app", # "restart_deployment": true # } ================================================ FILE: examples/workflows/opsgenie-close-alert.yml ================================================ workflow: id: opsgenie-alert-closer name: OpsGenie Alert Closer description: Closes OpsGenie alerts for resolved Coralogix alerts. 
triggers: - type: manual - type: alert filters: - key: source value: coralogix - key: status value: resolved actions: - name: close-alert # run only if we have an opsgenie alert id if: "'{{ alert.opsgenie_alert_id }}'" provider: config: "{{ providers.opsgenie }}" type: opsgenie with: type: close_alert alert_id: "{{ alert.opsgenie_alert_id }}" ================================================ FILE: examples/workflows/opsgenie-create-alert-cel.yml ================================================ workflow: id: opsgenie-critical-alert-creator-cel name: OpsGenie Critical Alert Creator (CEL) description: Creates OpsGenie alerts for critical Coralogix issues with team assignment and alert enrichment tracking using CEL filters. triggers: - type: manual - type: alert cel: source.contains("coralogix") && severity == "critical" actions: - name: create-alert if: "not '{{ alert.opsgenie_alert_id }}'" provider: config: "{{ providers.opsgenie }}" type: opsgenie with: message: "{{ alert.name }}" responders: - name: "{{ alert.team }}" type: team enrich_alert: - key: opsgenie_alert_id value: results.alertId ================================================ FILE: examples/workflows/opsgenie-create-alert.yml ================================================ workflow: id: opsgenie-critical-alert-creator name: OpsGenie Critical Alert Creator description: Creates OpsGenie alerts for critical Coralogix issues with team assignment and alert enrichment tracking. 
triggers: - type: manual - type: alert filters: - key: source value: coralogix - key: severity value: critical actions: - name: create-alert if: "not '{{ alert.opsgenie_alert_id }}'" provider: type: opsgenie config: "{{ providers.opsgenie }}" with: message: "{{ alert.name }}" responders: - name: "{{ alert.team }}" type: team enrich_alert: - key: opsgenie_alert_id value: results.alertId ================================================ FILE: examples/workflows/opsgenie_open_alerts.yml ================================================ workflow: id: opsgenie-alert-monitor name: OpsGenie Alert Monitor description: Monitors open alerts in OpsGenie and sends detailed Slack notifications with priority levels and timestamps. triggers: - type: interval value: 60 steps: - name: get-open-alerts provider: type: opsgenie config: "{{ providers.opsgenie }}" with: type: alerts query: "status: open" actions: - name: slack provider: type: slack config: " {{ providers.slack-demo }} " with: # Message is always mandatory message: > Opsgenie has {{ steps.get-open-alerts.results.number_of_alerts }} open alerts blocks: - type: section text: type: mrkdwn text: |- {{#steps.get-open-alerts.results.alerts}} - Alert Id: {{id}} | Priority: {{priority}} | Created at: {{created_at}} | Message: {{message}} {{/steps.get-open-alerts.results.alerts}} ================================================ FILE: examples/workflows/pagerduty.yml ================================================ workflow: id: pagerduty-example name: PagerDuty workflow example description: retrieve PagerDuty incident, create event and incident triggers: - type: manual steps: - name: check-incident-exist-pd-fingerprint if: "{{ incident.fingerprint }} != ''" provider: type: pagerduty config: "{{ providers.PagerDuty }}" with: incident_id: "{{ incident.fingerprint }}" - name: check-incident-exist-pd-incident-key-dedup-key provider: type: pagerduty config: "{{ providers.PagerDuty }}" with: incident_key: 
"7f3baa50-e7ef-4891-bd4a-d1ee310dff8f" actions: - name: pd-create-event provider: type: pagerduty config: "{{ providers.PagerDuty }}" with: routing_key: 'your_routing_key' # optional, otherwise it will take from provider configuration$ severity: critical source: keep component: job_service group: job class: job custom_details: environment: 'production' url: 'https://keep.example.org' links: - href: "https://keep.example.com/" text: "View in Keep" dedup: "{{ incident.id }}" event_type: trigger title: "TestEvent" - name: pd-create-inc provider: type: pagerduty config: "{{ providers.PagerDuty }}" with: source: keep alert_body: details: client: keep client_url: "https://keep.example.com/incidents/{{ incident.id }}" description: "{{ incident.user_summary }}" alert_count: "{{ incident.alerts_count }}" alerts: "{{ incident.alerts }}" type: incident_body dedup: "{{ incident.id }}" status: "triggered" service_id: "{{ incident.service_id }}" requester: email@example.com severity: "{{ incident.severity }}" title: "{{ incident.user_generated_name }}" ================================================ FILE: examples/workflows/pattern-matching-cel.yml ================================================ workflow: id: pattern-matching-monitor-cel name: Pattern Matching Monitor (CEL) description: Monitors alerts with pattern matching using CEL filters. 
triggers: - type: alert cel: name.contains("error") || name.contains("failure") actions: - name: notify provider: type: console with: message: "Error or failure detected: {{ alert.name }}" ================================================ FILE: examples/workflows/permissions_example.yml ================================================ workflow: id: permissions-example name: Permissions Example description: "Demonstrates how to restrict workflow execution using permissions" # Restrict execution to admin role and specific users permissions: - admin - sarah.smith@example.com # noc user triggers: - type: manual steps: - name: get-system-status provider: type: http with: url: "https://api.example.com/status" method: GET actions: - name: send-status-notification provider: type: slack config: "{{ providers.slack-operations }}" with: channel: "#operations" message: | *Sensitive System Status Check* Status: {{ steps.get-system-status.results.status }} Health: {{ steps.get-system-status.results.health }} Last Updated: {{ steps.get-system-status.results.last_updated }} _This workflow has restricted permissions and can only be executed by authorized users._ ================================================ FILE: examples/workflows/planner_basic.yml ================================================ workflow: id: planner-task-creator name: Microsoft Planner Task Creator description: Creates tasks in Microsoft Planner with retry capabilities for reliable task creation. 
triggers: - type: interval value: 15 actions: - name: create-planner-task provider: type: planner config: " {{ providers.planner }} " with: title: "Keep HQ Task1" plan_id: "tAtCor_XPEmqTzVqTigCycgABz0K" on-failure: retry: count: 2 interval: 2 ================================================ FILE: examples/workflows/posthog_example.yml ================================================ workflow: id: posthog-domain-tracker name: PostHog Domain Tracker description: Tracks domains from PostHog session recordings over the last 24 hours and sends a summary to Slack. triggers: - type: manual - type: interval value: 86400 # Run daily (in seconds) steps: - name: get-posthog-domains provider: config: "{{ providers.posthog }}" type: posthog with: query_type: session_recording_domains hours: 24 limit: 500 actions: - name: send-to-slack provider: config: "{{ providers.slack }}" type: slack with: blocks: - type: header text: type: plain_text text: "PostHog Session Recording Domains (Last 24 Hours)" emoji: true - type: section text: type: mrkdwn text: "Found *{{ steps.get-posthog-domains.results.unique_domains_count }}* unique domains across *{{ steps.get-posthog-domains.results.total_domains_found }}* occurrences" - type: divider - type: section text: type: mrkdwn text: "*Domains:*" - type: section text: type: mrkdwn text: "{{#steps.get-posthog-domains.results.unique_domains}} • *{{ . }}* {{/steps.get-posthog-domains.results.unique_domains}}" - type: divider ================================================ FILE: examples/workflows/query-databend.yml ================================================ workflow: id: databend-performance-monitor name: Databend Performance Monitor description: Executes performance analysis queries on Databend for large dataset operations. 
disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: - name: databend-step provider: type: databend config: "{{ providers.databend }}" with: query: SELECT avg(number) FROM numbers(100000000) actions: [] ================================================ FILE: examples/workflows/query_clickhouse.yml ================================================ workflow: id: clickhouse-error-monitor name: ClickHouse Error Monitor description: Monitors ClickHouse logs for errors and sends notifications through both Ntfy and Slack channels. triggers: - type: manual steps: - name: clickhouse-step provider: config: "{{ providers.clickhouse }}" type: clickhouse with: query: SELECT * FROM logs_table ORDER BY timestamp DESC LIMIT 1; single_row: "True" actions: - name: ntfy-action if: "'{{ steps.clickhouse-step.results.level }}' == 'ERROR'" provider: config: "{{ providers.ntfy }}" type: ntfy with: message: "Error in clickhouse logs_table: {{ steps.clickhouse-step.results.level }}" topic: clickhouse - name: slack-action if: "'{{ steps.clickhouse-step.results.level }}' == 'ERROR'" provider: config: "{{ providers.slack }}" type: slack with: message: "Error in clickhouse logs_table: {{ steps.clickhouse-step.results.level }}" ================================================ FILE: examples/workflows/query_grafana_loki.yaml ================================================ workflow: id: loki-log-analyzer name: Loki Log Analyzer description: Analyzes log rates from Grafana Loki with customizable queries and time ranges for monitoring log patterns. 
disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: - name: grafana_loki-step provider: type: grafana_loki config: "{{ providers.loki }}" with: query: sum(rate({job="varlogs"}[10m])) by (level) queryType: query_range step: 300 actions: [] ================================================ FILE: examples/workflows/query_mongodb.yaml ================================================ workflow: id: mongodb-document-finder name: MongoDB Document Finder description: Executes targeted MongoDB queries with filters to retrieve specific documents from collections. triggers: - type: manual steps: - name: mongodb-step provider: config: "{{ providers.mongo }}" type: mongodb with: # Please note that argument order is important for MongoDB queries. query: | { "find": "mycollection", "filter": { "name": "First Document" } } single_row: true ================================================ FILE: examples/workflows/query_victorialogs.yaml ================================================ workflow: id: victorialogs-stats-analyzer name: VictoriaLogs Stats Analyzer description: Analyzes VictoriaLogs data with statistical queries to track log level distributions and patterns. disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: - name: victorialogs-step provider: config: "{{ providers.logs }}" type: victorialogs with: query: "* | stats by (level) count(*)" queryType: stats_query_range actions: [] ================================================ FILE: examples/workflows/query_victoriametrics.yml ================================================ workflow: id: victoriametrics-threshold-monitor name: VictoriaMetrics Threshold Monitor description: Monitors VictoriaMetrics metrics with threshold-based alerts, sending notifications to both Slack and Ntfy. 
triggers: - type: manual steps: - name: victoriametrics-step provider: config: "{{ providers.victoriametrics }}" type: victoriametrics with: query: avg(rate(process_cpu_seconds_total)) queryType: query actions: - name: trigger-slack1 condition: - name: threshold-condition type: threshold value: "{{ steps.victoriametrics-step.results.data.result.0.value.1 }}" compare_to: 0.0050 alias: A compare_type: gt provider: type: slack config: "{{ providers.slack }}" with: message: "Result: {{ steps.victoriametrics-step.results.data.result.0.value.1 }} is greater than 0.0050! 🚨" - name: trigger-slack2 if: "{{ A }}" provider: type: slack config: "{{ providers.slack }}" with: message: "Result: {{ steps.victoriametrics-step.results.data.result.0.value.1 }} is greater than 0.0050! 🚨" - name: trigger-ntfy if: "{{ A }}" provider: type: ntfy config: "{{ providers.ntfy }}" with: message: "Result: {{ steps.victoriametrics-step.results.data.result.0.value.1 }} is greater than 0.0050! 🚨" topic: ezhil ================================================ FILE: examples/workflows/raw_sql_query_datetime.yml ================================================ # Alert if a result queried from the DB is above a certain threshold. workflow: id: mysql-datetime-monitor name: MySQL Datetime Monitor description: Monitors time differences in MySQL database entries and alerts via Slack when exceeding one hour threshold. 
triggers: - type: interval value: 300 # every 5 minutes steps: - name: get-max-datetime provider: type: mysql config: "{{ providers.mysql-prod }}" with: # Get max(datetime) from the random table query: "SELECT MAX(datetime) FROM demo_table LIMIT 1" actions: - name: trigger-slack condition: - name: threshold-condition type: threshold # datetime_compare(t1, t2) compares t1-t2 and returns the diff in hours # utcnow() returns the local machine datetime in UTC # to_utc() converts a datetime to UTC value: keep.datetime_compare(keep.utcnow(), keep.to_utc("{{ steps.this.results[0][0] }}")) compare_to: 1 # hours compare_type: gt # greater than provider: type: slack config: " {{ providers.slack-demo }} " with: message: "DB datetime value ({{ actions.trigger-slack.conditions.threshold.0.compare_value }}) is greater than 1! 🚨" ================================================ FILE: examples/workflows/resolve_old_alerts.yml ================================================ workflow: id: alert-auto-resolver name: Alert Auto-Resolver description: Automatically resolves stale alerts that haven't been updated in over an hour to maintain alert hygiene. triggers: - type: manual - type: interval value: 60 steps: # get the alerts from keep - name: get-alerts provider: type: keep with: version: 2 filter: "status == 'firing'" actions: - name: resolve-alerts foreach: " {{ steps.get-alerts.results }} " if: "keep.to_timestamp('{{ foreach.value.lastReceived }}') < keep.utcnowtimestamp() - 3600" provider: type: mock with: enrich_alert: - key: status value: resolved disposable: true ================================================ FILE: examples/workflows/retrieve_cloudwatch_logs.yaml ================================================ workflow: id: cloudwatch-log-retriever name: CloudWatch Log Retriever description: Retrieves and analyzes CloudWatch logs with custom queries, filtering, and alert generation capabilities. 
triggers: - type: manual steps: - name: cw-logs provider: config: "{{ providers.cloudwatch }}" type: cloudwatch with: log_group: "meow_logs" query: "fields @message | sort @timestamp desc | limit 20" hours: 12 remove_ptr_from_results: true # We need only @message, no need for @ptr actions: - name: raise-alert if: keep.len( {{ steps.cw-logs.results }} ) > 0 provider: type: keep with: alert: name: "CW logs found!" ================================================ FILE: examples/workflows/run-github-workflow.yaml ================================================ workflow: id: run-github-workflow name: Run GitHub Workflow description: Triggers GitHub Actions workflows with customizable inputs for automated documentation testing. triggers: - type: manual actions: - name: run-gh-action provider: config: "{{ providers.github }}" type: github with: run_action: true repo_owner: keephq repo_name: keep workflow: test-docs.yml inputs: input1: value1 input2: value2 ================================================ FILE: examples/workflows/send-message-telegram-with-htmlmd.yaml ================================================ workflow: id: send-message-telegram-with-htmlmd name: telegram description: telegram disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: [] actions: # Telegram only supports limited formatting. 
Refer https://core.telegram.org/bots/api#formatting-options - name: telegram-action provider: type: telegram config: "{{ providers.telegram }}" with: chat_id: 1072776973 message: "This is html bold italic bold italic bold strikethrough italic bold strikethrough spoiler underline italic bold bold" # Uses HTML parse_mode: html - name: telegram-action provider: type: telegram config: "{{ providers.telegram }}" with: chat_id: 1072776973 message: "This is markdown *bold _italic bold ~italic bold strikethrough ||italic bold strikethrough spoiler||~ __underline italic bold___ bold*" # Uses MarkdownV2 parse_mode: markdown ================================================ FILE: examples/workflows/send_slack_message_on_failure.yaml ================================================ workflow: id: send-slack-message-on-failure name: Get alert root cause from OpenAI, notify if workflow fails description: Get alert root cause from OpenAI, notify if workflow fails triggers: - type: alert cel: alert.severity == "critical" on-failure: provider: type: slack config: "{{ providers.slack }}" with: channel: "" # message will be injected from the workflow engine # e.g. "Workflow failed with error: " steps: - name: openai-step provider: config: "{{ providers.openai }}" type: openai with: prompt: | You are a very talented engineer that receives critical alert and reports back the root cause analysis. Here is the context: keep.json_dumps({{alert}}) (it is a JSON of the alert). In your answer, also provide the reason why you think it is the root cause and specify what your certainty level is that it is the root cause. 
(between 1-10, where 1 is low and 10 is high) actions: - name: slack-action provider: config: "{{ providers.slack }}" type: slack with: message: "{{steps.openai-step.results}}" ================================================ FILE: examples/workflows/send_smtp_email.yml ================================================ workflow: id: smtp-email-sender name: SMTP Email Sender description: Sends customized email notifications through SMTP with configurable sender, recipient, and message content. triggers: - type: manual actions: - name: send-email provider: type: smtp config: "{{ providers.smtp }}" with: from_email: "your_email@gmail.com" from_name: "Workflow user" to_email: - "matvey@keephq.dev" subject: "Hello from Keep workflow!" body: "Hello! This is a test email from Keep workflow." ================================================ FILE: examples/workflows/send_smtp_html_email.yml ================================================ workflow: id: smtp-html-email-sender name: SMTP HTML Email Sender description: Sends HTML-formatted email notifications through SMTP with customizable content and styling. triggers: - type: manual actions: - name: send-html-email provider: type: smtp config: "{{ providers.smtp }}" with: from_email: "your_email@gmail.com" from_name: "Keep Workflow" to_email: - "recipient1@example.com" - "recipient2@example.com" subject: "Keep Alert Notification" html: |

Alert from Keep

This is an example of an HTML-formatted email sent via SMTP provider.

Alert Type System Health Check
Status ✓ Operational
Timestamp {{ utcnow }}

Note: This email demonstrates the HTML formatting capabilities of the SMTP provider.

================================================ FILE: examples/workflows/sendgrid_basic.yml ================================================ workflow: id: sendgrid-notification-sender name: SendGrid Notification Sender description: Sends HTML-formatted email notifications to multiple recipients using SendGrid's email service. triggers: - type: manual actions: - name: trigger-email provider: type: sendgrid config: " {{ providers.Sendgrid }} " with: to: - "youremail@gmail.com" - "youranotheremail@gmail.com" subject: "Hello from Keep!" html: "Test with HTML" ================================================ FILE: examples/workflows/service-error-rate-monitor-datadog.yml ================================================ # AUTO GENERATED # Alert that was created with Keep semantic layer # Prompt: can you write an alert spec that triggers when a service has more than 0.01% error rate in datadog for more than an hour? workflow: id: service-error-rate-monitor name: Service Error Rate Monitor description: Monitors service error rates through Datadog metrics, triggering alerts when error rate exceeds 0.01% for over an hour with Slack notifications. owners: - github-johndoe - slack-janedoe services: - my-service triggers: - type: manual steps: - name: check-error-rate provider: type: datadog config: "{{ providers.datadog }}" with: query: "sum:my_service.errors{*}.as_count() / sum:my_service.requests{*}.as_count() * 100" timeframe: "1h" actions: - name: notify-slack condition: - name: threshold-condition type: threshold value: "{{ steps.check-error-rate.results }}" compare_to: 0.01 compare_type: gt provider: type: slack config: "{{ providers.slack-demo }}" with: channel: service-alerts message: > The my_service error rate is higher than 0.01% for more than an hour. Please investigate. 
================================================ FILE: examples/workflows/severity_changed.yml ================================================ workflow: id: severity-change-monitor name: Severity Change Monitor description: Tracks alert severity changes and provides detailed notifications about severity level transitions. triggers: - type: alert severity_changed: true actions: - name: echo-test provider: type: console with: # "The severity has changed from warning to info (it has decreased since last alert)" message: "The severity has changed from {{ alert.previous_severity }} to {{ alert.severity }} (it has {{ alert.severity_change }} since last alert)" ================================================ FILE: examples/workflows/signl4-alerting-workflow.yaml ================================================ workflow: id: signl4-alert-notifier name: SIGNL4 Alert Notifier description: Routes alerts to SIGNL4 for mobile team alerting with customizable titles and messages. triggers: - filters: - key: source value: r".*" type: alert owners: [] services: [] steps: [] actions: - name: signl4-action provider: config: "{{ providers.signl4-alerting }}" type: signl4 with: message: Test. title: Keep Alert ================================================ FILE: examples/workflows/simple_http_request_ntfy.yml ================================================ # Alert if a result queried from the DB is above a certain threshold. workflow: id: mysql-ntfy-monitor name: MySQL Ntfy Monitor description: Monitors MySQL datetime values and sends notifications through Ntfy when thresholds are exceeded. 
triggers: - type: interval value: 300 # every 5 minutes steps: - name: get-max-datetime provider: type: mysql config: "{{ providers.mysql-prod }}" with: # Get max(datetime) from the random table query: "SELECT MAX(datetime) FROM demo_table LIMIT 1" actions: - name: trigger-ntfy condition: - name: threshold-condition type: threshold # datetime_compare(t1, t2) compares t1-t2 and returns the diff in hours # utcnow() returns the local machine datetime in UTC # to_utc() converts a datetime to UTC value: keep.datetime_compare(keep.utcnow(), keep.to_utc({{ steps.get-max-datetime.results[0][0] }})) compare_to: 1 # hours compare_type: gt # greater than provider: type: http with: method: POST body: alert: "{{ alert }}" fingerprint: "{{ alert.fingerprint }}" some_customized_field: "{{ keep.strip(alert.some_attribute) }}" url: "https://ntfy.sh/MoRen5UlPEQr8s4Y" ================================================ FILE: examples/workflows/slack-message-reaction.yml ================================================ workflow: id: slack-alert-lifecycle name: Slack Alert Lifecycle Manager description: Manages alert lifecycle in Slack with automatic reactions for resolved alerts and enriched tenant information. 
disabled: false triggers: - type: manual - filters: - key: source value: gcpmonitoring type: alert consts: {} owners: [] services: [] steps: [] actions: - name: slack-alert-resolved if: "'{{ alert.slack_timestamp }}' and '{{ alert.status }}' == 'resolved'" provider: config: "{{ providers.keephq }}" type: slack with: channel: C06PF9TCWUF message: "white_check_mark" thread_timestamp: "{{ alert.slack_timestamp }}" notification_type: "reaction" - name: get-tenant-name if: "not '{{ alert.customer_name }}'" provider: config: "{{ providers.readonly }}" type: mysql with: as_dict: true enrich_alert: - key: customer_name value: results.name query: select * from tenant where id = '{{ alert.tenantId }}' single_row: true - name: send-slack-alert if: "not '{{ alert.slack_timestamp }}'" provider: config: "{{ providers.keephq }}" type: slack with: enrich_alert: - key: slack_timestamp value: results.slack_timestamp blocks: - text: emoji: true text: "{{alert.gcp.policy_name}}" type: plain_text type: header - elements: - elements: - text: "Tenant ID: {{alert.tenantId}}{{^alert.tenantId}}n/a{{/alert.tenantId}}" type: text type: rich_text_section type: rich_text - elements: - elements: - text: "Tenant Name: {{alert.customer_name}}{{^alert.customer_name}}n/a{{/alert.customer_name}}" type: text type: rich_text_section type: rich_text - elements: - elements: - text: "Scopes: {{alert.validatedScopes}}{{^alert.validatedScopes}}n/a{{/alert.validatedScopes}}" type: text type: rich_text_section type: rich_text - elements: - elements: - text: "Description: {{alert.content}}" type: text type: rich_text_section type: rich_text - elements: - action_id: actionId-0 text: emoji: true text: ":gcp: Original Alert" type: plain_text type: button url: "{{alert.url}}" type: actions channel: C06PF9TCWUF message: "" ================================================ FILE: examples/workflows/slack-workflow-trigger.yml ================================================ workflow: id: slack-workflow-trigger name: 
Slack Interactive Workflow Trigger description: Creates an interactive Slack message with a button that can trigger another workflow, demonstrating workflow chaining through Slack interactions. disabled: false triggers: - type: manual - type: alert consts: {} owners: [] services: [] steps: [] actions: - name: send-slack-alert if: "not '{{ alert.slack_timestamp }}'" provider: config: "{{ providers.slack-prod }}" type: slack with: blocks: - text: emoji: true text: "{{alert.name}}" type: plain_text type: header - elements: - action_id: actionId-0 text: emoji: true text: "Trigger Slack Workflow" type: plain_text type: button # The following will trigger the workflow with the whole alert object: # url: "https://api.keephq.dev/workflows/WORKFLOW_ID_TO_EXECUTE/run?alert={{alert.id}}&api_key=YOUR_API_KEY" # The following will trigger the workflow with the alert name, as an example, while any parameters can be passed: url: "https://api.keephq.dev/workflows/WORKFLOW_ID_TO_EXECUTE/run?name={{alert.name}}&api_key=YOUR_API_KEY" type: actions channel: C06PF9TCWUF message: "" ================================================ FILE: examples/workflows/slack_basic.yml ================================================ workflow: id: cloudwatch-slack-notifier name: CloudWatch Slack Notifier description: Forwards AWS CloudWatch alarms to Slack channels with customized alert messages. triggers: - type: alert filters: - key: source value: cloudwatch - type: manual actions: - name: trigger-slack provider: type: slack config: " {{ providers.slack-prod }} " with: message: "Got alarm from aws cloudwatch! {{ alert.name }}" ================================================ FILE: examples/workflows/slack_basic_cel.yml ================================================ workflow: id: cloudwatch-slack-notifier-cel name: CloudWatch Slack Notifier (CEL) description: Forwards AWS CloudWatch alarms to Slack channels with customized alert messages using CEL filters. 
triggers: - type: alert cel: source.contains("cloudwatch") - type: manual actions: - name: trigger-slack provider: type: slack config: " {{ providers.slack-prod }} " with: message: "Got alarm from aws cloudwatch! {{ alert.name }}" ================================================ FILE: examples/workflows/slack_basic_interval.yml ================================================ workflow: id: scheduled-slack-notifier name: Scheduled Slack Notifier description: Sends periodic Slack messages at configurable intervals for regular status updates or reminders. triggers: - type: interval value: 15 actions: - name: trigger-slack provider: type: slack config: " {{ providers.slack-demo }} " with: message: "Send a slack message every 15 seconds!" ================================================ FILE: examples/workflows/slack_message_update.yml ================================================ workflow: id: zabbix-notification-lifecycle name: Slack Notification Lifecycle Manager description: Manages messages and updates as attachments in Slack with automatic updates on resolved alerts disabled: false triggers: - type: manual - type: alert cel: severity > 'info' && source.contains('zabbix') inputs: [] consts: {} owners: [] services: [] steps: [] actions: - name: slack-alert-resolved if: "'{{ alert.slack_timestamp }}' and '{{ alert.status }}' == 'resolved'" provider: type: slack config: "{{ providers.keephq }}" with: slack_timestamp: "{{alert.slack_timestamp}}" channel: C06PF9TCWUF attachments: - color: good title: "Resolved: {{alert.name}}" title_link: "{{alert.url}}" fields: - title: Host value: "{{alert.hostname}}" short: true - title: Severity value: "{{alert.severity}}" short: true - title: Description value: "{{alert.description}}" short: true - title: Time value: "{{alert.time}}" short: true - name: slack-alert if: not '{{ alert.slack_timestamp }}' or '{{alert.status}}' == 'firing' provider: type: slack config: "{{ providers.keephq }}" with: enrich_alert: - key: 
slack_timestamp value: results.slack_timestamp channel: C06PF9TCWUF attachments: - color: danger title: "{{alert.name}}" title_link: "{{alert.url}}" fields: - title: Host value: "{{alert.hostname}}" short: true - title: Severity value: "{{alert.severity}}" short: true - title: Description value: "{{alert.description}}" short: true - title: Time value: "{{alert.time}}" short: true ================================================ FILE: examples/workflows/squadcast_example.yml ================================================ workflow: id: squadcast-incident-creator name: SquadCast Incident Creator description: Creates SquadCast incidents from alerts with customizable messages and additional context data. triggers: - type: alert actions: - name: create-incident provider: config: "{{ providers.squadcast }}" type: squadcast with: additional_json: "{{ alert }}" description: TEST message: "{{ alert.name }}-test" notify_type: incident ================================================ FILE: examples/workflows/teams-adaptive-card-notifier.yaml ================================================ workflow: id: teams-adaptive-card-notifier name: Teams Adaptive Card Notifier description: Sends customized Microsoft Teams notifications using Adaptive Cards with dynamic alert information and formatted sections. 
disabled: false triggers: - type: manual - filters: - key: source value: r".*" type: alert consts: {} owners: [] services: [] steps: [] actions: - name: teams-action provider: config: "{{ providers.teams }}" type: teams with: message: "" sections: '[{"type": "TextBlock", "text": "{{alert.name}}"}, {"type": "TextBlock", "text": "Tal from Keep"}]' typeCard: message ================================================ FILE: examples/workflows/teams-adaptive-cards-with-mentions.yaml ================================================ workflow: id: teams-adaptive-card-with-mentions name: Teams Adaptive Card With Mentions description: Sends Microsoft Teams notifications using Adaptive Cards with user mentions to notify specific team members. disabled: false triggers: - type: manual - filters: - key: source value: r".*" type: alert consts: {} owners: [] services: [] steps: [] actions: - name: teams-action provider: config: "{{ providers.teams }}" type: teams with: typeCard: message sections: '[{"type": "TextBlock", "text": "Alert: {{alert.name}}"}, {"type": "TextBlock", "text": "Hello John Doe, please review this alert!"}, {"type": "TextBlock", "text": "Severity: {{alert.severity}}"}]' mentions: '[{"id": "john.doe@example.com", "name": "John Doe"}]' ================================================ FILE: examples/workflows/telegram_advanced.yml ================================================ workflow: id: telegram-message-topic-markup name: Telegram Message Sender with Topic Markup description: Send messages into Telegram topic with a message containing a reply markup. 
triggers: - type: manual actions: - name: telegram provider: type: telegram config: "{{ providers.telegram }}" with: message: "message with topic markup" chat_id: "-1001234567890" topic_id: "1234" reply_markup: 📌 Confluence 📖: url: "confluence.example.com" 📖 Documentation 📖: url: "docs.example.com" ================================================ FILE: examples/workflows/telegram_basic.yml ================================================ workflow: id: telegram-message-sender name: Telegram Message Sender description: Sends customized notifications to Telegram channels or users using environment-configured chat IDs. triggers: - type: manual actions: - name: telegram provider: type: telegram config: "{{ providers.telegram }}" with: message: "test" chat_id: "-1001234567890" image_url: "https://cdn.prod.website-files.com/66adeb018210ff2165886994/67aa1f6766f15cb7ec62e962_Keep%20With%20Name.svg" ================================================ FILE: examples/workflows/test_jira_create_with_custom_fields.yml ================================================ workflow: id: test-jira-create-custom-fields name: Test Jira Create with Custom Fields description: Test workflow to demonstrate CREATE operations with custom fields disabled: false triggers: - type: manual inputs: [] consts: {} owners: [] services: [] steps: [] actions: - name: jira-action provider: type: jira config: "{{ providers.jira }}" with: project_key: "TEST" board_name: "TEST" summary: "Create new issue with custom fields" description: "This is a test issue created with custom fields" issue_type: "Task" custom_fields: customfield_10696: "10" customfield_10201: "Critical" ================================================ FILE: examples/workflows/test_jira_custom_fields_fix.yml ================================================ workflow: id: test-jira-custom-fields-fix name: Test Jira Custom Fields Fix description: Test workflow to demonstrate the fix for Jira custom fields update issue disabled: false triggers: - 
type: manual inputs: [] consts: {} owners: [] services: [] steps: [] actions: - name: jira-action provider: type: jira config: "{{ providers.jira }}" with: issue_id: "{{ incident.ticket_id }}" project_key: "TEST" board_name: "TEST" summary: "Update summary of an issue" description: "Test description" issue_type: "Task" custom_fields: customfield_10696: "10" customfield_10201: "Critical" ================================================ FILE: examples/workflows/update-incident-grafana-incident.yaml ================================================ workflow: id: grafana-incident-enricher name: Grafana Incident AI Enricher description: Enriches Grafana incidents with AI-generated titles using OpenAI analysis of incident context. triggers: - type: incident events: - created consts: {} owners: [] services: [] steps: - name: get-enrichments provider: type: openai config: "{{ providers.openai }}" with: prompt: You received such an incident {{incident}}, generate title model: gpt-4o-mini structured_output_format: type: json_schema json_schema: name: missing_fields schema: type: object properties: title: type: string description: "Anaylse the {{incident}} carefully and give a suitable title" required: - "title" additionalProperties: false strict: true actions: - name: grafana_incident-action provider: type: grafana_incident config: "{{ providers.grafana }}" with: # Checkout https://docs.keephq.dev/providers/documentation/grafana_incident-provider for other available fields updateType: updateIncidentTitle operationType: update incident_id: "{{ incident.fingerprint }}" title: "{{ steps.get-enrichments.results.response.title }}" ================================================ FILE: examples/workflows/update-task-in-asana.yaml ================================================ workflow: id: update-task-in-asana name: Update task in asana description: asana disabled: false triggers: - type: manual consts: {} owners: [] services: [] steps: - name: asana-step provider: type: asana 
config: "{{ providers.asana }}" with: task_id: 1209749862246975 completed: true name: "done: updated the task" actions: [] ================================================ FILE: examples/workflows/update_jira_ticket.yml ================================================ workflow: id: jira-ticket-updater name: Jira Ticket Updater description: Updates existing Jira issues with new summaries and descriptions while maintaining issue relationships. triggers: - type: manual actions: - name: jira-action provider: config: "{{ providers.Jira }}" type: jira with: board_name: "" description: Update description of an issue issue_id: 10023 project_key: "" summary: Update summary of an issue ================================================ FILE: examples/workflows/update_service_now_tickets_status.yml ================================================ workflow: id: servicenow-ticket-sync name: ServiceNow Ticket Sync description: Synchronizes ServiceNow ticket statuses with Keep alerts and maintains bidirectional state tracking. triggers: - type: manual steps: # get the alerts from keep - name: get-alerts provider: type: keep # get all the alerts with sys_id (means that ticket exists for them) with: filters: - key: ticket_type value: servicenow actions: # update the tickets - name: update-ticket foreach: " {{ steps.get-alerts.results }} " provider: type: servicenow config: " {{ providers.servicenow }} " with: ticket_id: "{{ foreach.value.alert_enrichment.enrichments.ticket_id }}" table_name: "{{ foreach.value.alert_enrichment.enrichments.table_name }}" fingerprint: "{{ foreach.value.alert_fingerprint }}" enrich_alert: - key: ticket_status value: results.state ================================================ FILE: examples/workflows/update_workflows_from_http.yml ================================================ workflow: id: http-workflow-sync name: HTTP Workflow Sync description: Updates Keep workflows from remote HTTP sources, supporting GitHub raw content and other HTTP endpoints. 
triggers: - type: manual steps: - name: get-workflow provider: type: http with: method: GET url: "https://raw.githubusercontent.com/keephq/keep/refs/heads/main/examples/workflows/new_github_stars.yml" actions: - name: update provider: type: keep with: workflow_to_update_yaml: "raw_render_without_execution({{ steps.get-workflow.results.body }})" ================================================ FILE: examples/workflows/update_workflows_from_s3.yml ================================================ workflow: id: s3-workflow-sync name: S3 Workflow Sync description: Synchronizes Keep workflows from S3 bucket storage with optional full sync capabilities. triggers: - type: manual steps: - name: s3-dump provider: config: "{{ providers.s3 }}" type: s3 with: bucket: "keep-workflows" actions: # optional: delete all other workflows before updating for full sync # - name: delete-all-other-workflows # provider: # type: keep # with: # delete_all_other_workflows: true - name: update foreach: "{{ steps.s3-dump.results }}" provider: type: keep with: workflow_to_update_yaml: "raw_render_without_execution({{ foreach.value }})" ================================================ FILE: examples/workflows/webhook_example.yml ================================================ workflow: id: webhook-test-runner name: Webhook Test Runner description: Tests webhook functionality with console logging and customizable message payloads. debug: true triggers: - type: manual steps: - name: console-test provider: type: console with: message: "Hello world!" actions: - name: webhook-test provider: type: webhook config: "{{ providers.test }}" with: body: message: "Hello world" ================================================ FILE: examples/workflows/webhook_example_foreach.yml ================================================ workflow: id: webhook-batch-processor name: Webhook Batch Processor description: Processes multiple alerts through webhooks with conditional execution based on alert status. 
# Tiered escalation example.
# Tier 1 notifies while the alert has been firing for more than 15 and at most
# 30 minutes; tier 2 takes over once it has been firing for more than 30 minutes.
workflow:
  id: tiered-alert-escalator
  name: Tiered Alert Escalator
  description: Manages alert escalation through different tiers based on alert duration with targeted Slack notifications.
  triggers:
    - type: alert
      filters:
        - key: name
          value: "server-is-down"
  actions:
    - name: send-slack-message-tier-1
      # The windows must not overlap with tier 2: the previous condition used
      # "not ... < 30", which is "firing time >= 30" and made tier 1 fire
      # together with tier 2 instead of before it.
      if: "keep.get_firing_time('{{ alert }}', 'minutes') > 15 and keep.get_firing_time('{{ alert }}', 'minutes') <= 30"
      provider:
        type: slack
        config: "{{ providers.slack }}"
      with:
        message: |
          "Tier 1 Alert: {{ alert.name }} - {{ alert.description }}
          Alert details: {{ alert }}"
    - name: send-slack-message-tier-2
      if: "keep.get_firing_time('{{ alert }}', 'minutes') > 30"
      provider:
        type: slack
        config: "{{ providers.slack }}"
      with:
        message: |
          "Tier 2 Alert: {{ alert.name }} - {{ alert.description }}
          Alert details: {{ alert }}"
class ActionsCRUD:
    """CRUD for Action model that shares across CLI, API, ...

    Every method is a static wrapper around the database helpers imported
    from ``keep.api.core.db``. Inside a method body the bare names
    (``get_all_actions``, ``update_action``, ...) resolve to those module-level
    helpers, not to the static methods of the same name on this class.
    """

    @staticmethod
    def get_all_actions(tenant_id: str) -> List[ActionDTO]:
        """Return every action stored for ``tenant_id`` as DTOs."""
        action_models = get_all_actions(tenant_id)
        return ActionsCRUD._convert_models_to_dtos(action_models)

    @staticmethod
    def _convert_models_to_dtos(models: List[Action]) -> List[ActionDTO]:
        """Convert model to dto, ignore the result if one model is invalid"""
        results: List[ActionDTO] = []
        for model in models:
            try:
                dto = ActionDTO(
                    id=model.id,
                    use=model.use,
                    name=model.name,
                    details=cyaml.safe_load(StringIO(model.action_raw)),
                )
                results.append(dto)
            except ValidationError:
                # Skip the invalid row but keep converting the remaining ones.
                logger.warning(
                    "Unmatched Action model and the coresponding DTO",
                    exc_info=True,
                    extra={"data": model.dict()},
                )
        return results

    @staticmethod
    def add_actions(tenant_id: str, installed_by: str, action_dtos: List[dict]):
        """Create one Action row per dict in ``action_dtos``.

        Raises:
            ActionsCRUDException: (422) if building or persisting any action fails.
        """
        try:
            actions = []
            for action_dto in action_dtos:
                action = Action(
                    id=str(uuid4()),
                    tenant_id=tenant_id,
                    installed_by=installed_by,
                    installation_time=time.time(),
                    name=action_dto.get("name"),
                    # if there is no `use` tag, use `name` instead
                    use=action_dto.get("use") or action_dto.get("name"),
                    action_raw=cyaml.dump(action_dto),
                )
                actions.append(action)
            create_actions(actions)
        except Exception:
            logger.exception("Failed to create actions")
            raise ActionsCRUDException(status_code=422, detail="Unable to create the actions")

    @staticmethod
    def remove_action(tenant_id: str, action_id: str):
        """Delete the action and return whatever the database helper returns.

        Raises:
            ActionsCRUDException: (422) if the delete fails.
        """
        try:
            deleted_action = delete_action(tenant_id, action_id)
            return deleted_action
        except Exception:
            logger.exception("Unknown exception when delete action from database")
            raise ActionsCRUDException(status_code=422, detail="Unable to delete the requested action")

    @staticmethod
    def get_action(tenant_id: str, action_id: str) -> Union[Action, None]:
        """Fetch a single action.

        Raises:
            ActionsCRUDException: (400) if the lookup fails.
        """
        try:
            return get_action(tenant_id, action_id)
        except Exception:
            logger.exception("Unknown exception when getting action from database")
            raise ActionsCRUDException(status_code=400, detail="Unable to get an action")

    @staticmethod
    def update_action(tenant_id: str, action_id: str, payload: dict) -> Union[Action, None]:
        """Update an existing action from ``payload`` and return the updated row.

        Raises:
            ActionsCRUDException: (400) if building or persisting the update fails,
                (422) if no action matched ``action_id`` for this tenant.
        """
        try:
            action_payload = Action(
                name=payload.get("name"),
                use=payload.get("use") or payload.get("name"),
                action_raw=cyaml.dump(payload),
            )
            updated_action = update_action(tenant_id, action_id, action_payload)
        except Exception:
            logger.exception("Uknown exception when update an action on database")
            raise ActionsCRUDException(status_code=400, detail="Unable to update an action")

        # Raised outside the try block on purpose: inside it, the broad
        # `except Exception` would swallow this 422 and report a 400 instead.
        if not updated_action:
            raise ActionsCRUDException(status_code=422, detail="No action matched to be updated")
        # Return the updated row (previously the db helper function itself was returned).
        return updated_action
get_custom_deduplication_rule, get_deduplication_rule_by_id, get_last_alert_hashes_by_fingerprints, update_deduplication_rule, ) from keep.api.models.alert import ( AlertDto, DeduplicationRuleDto, DeduplicationRuleRequestDto, ) from keep.providers.providers_factory import ProvidersFactory DEFAULT_RULE_UUID = "00000000-0000-0000-0000-000000000000" class AlertDeduplicator: DEDUPLICATION_DISTRIBUTION_ENABLED = config( "KEEP_DEDUPLICATION_DISTRIBUTION_ENABLED", cast=bool, default=True ) CUSTOM_DEDUPLICATION_DISTRIBUTION_ENABLED = config( "KEEP_CUSTOM_DEDUPLICATION_ENABLED", cast=bool, default=True ) def __init__(self, tenant_id): self.logger = logging.getLogger(__name__) self.tenant_id = tenant_id def _apply_deduplication_rule( self, alert: AlertDto, rule: DeduplicationRuleDto, last_alert_fingerprint_to_hash: dict[str, str] | None = None, ) -> bool: """ Apply a deduplication rule to an alert. Gets an alert and a deduplication rule and apply the rule to the alert by: - removing the fields that should be ignored - calculating the hash - checking if the hash is already in the database - setting the isFullDuplicate or isPartialDuplicate flag """ # we don't want to remove fields from the original alert alert_copy = copy.deepcopy(alert) # remove the fields that should be ignored for field in rule.ignore_fields: alert_copy = self._remove_field(field, alert_copy) # calculate the hash alert_hash = hashlib.sha256( json.dumps(alert_copy.dict(), default=str, sort_keys=True).encode() ).hexdigest() alert.alert_hash = alert_hash # Check if the hash is already in the database. 
# If last_alert_fingerprint_to_hash is provided, use it # else, get the hash from the database last_alerts_hash_by_fingerprint = ( last_alert_fingerprint_to_hash or get_last_alert_hashes_by_fingerprints( self.tenant_id, [alert.fingerprint] ) ) # the hash is the same as the last alert hash by fingerprint - full deduplication if ( last_alerts_hash_by_fingerprint.get(alert.fingerprint) and last_alerts_hash_by_fingerprint.get(alert.fingerprint) == alert_hash ): self.logger.info( "Alert is deduplicated", extra={ "alert_id": alert.id, "rule_id": rule.id, "tenant_id": self.tenant_id, }, ) alert.isFullDuplicate = True # it means that there is another alert with the same fingerprint but different hash # so its a deduplication elif last_alerts_hash_by_fingerprint.get(alert.fingerprint): self.logger.info( "Alert is partially deduplicated", extra={ "alert_id": alert.id, "tenant_id": self.tenant_id, }, ) alert.isPartialDuplicate = True else: self.logger.debug( "Alert is not deduplicated", extra={ "alert_id": alert.id, "fingerprint": alert.fingerprint, "tenant_id": self.tenant_id, "last_alert_hash_by_fingerprint": last_alerts_hash_by_fingerprint, }, ) return alert def apply_deduplication( self, alert: AlertDto, rules: list["DeduplicationRuleDto"] | None = None, last_alert_fingerprint_to_hash: dict[str, str] | None = None, ) -> bool: # IMPOTRANT NOTE TO SOMEONE WORKING ON THIS CODE: # apply_deduplication runs AFTER _format_alert, so you can assume that alert fields are in the expected format. 
# you are also safe to assume that alert.fingerprint is set by the provider itself # get only relevant rules rules = rules or self.get_deduplication_rules( self.tenant_id, alert.providerId, alert.providerType ) for rule in rules: self.logger.debug( "Applying deduplication rule to alert", extra={ "rule_id": rule.id, "alert_id": alert.id, }, ) alert = self._apply_deduplication_rule( alert, rule, last_alert_fingerprint_to_hash ) self.logger.debug( "Alert after deduplication rule applied", extra={ "rule_id": rule.id, "alert_id": alert.id, "is_full_duplicate": alert.isFullDuplicate, "is_partial_duplicate": alert.isPartialDuplicate, }, ) if AlertDeduplicator.DEDUPLICATION_DISTRIBUTION_ENABLED: if alert.isFullDuplicate or alert.isPartialDuplicate: # create deduplication event create_deduplication_event( tenant_id=self.tenant_id, deduplication_rule_id=rule.id, deduplication_type=( "full" if alert.isFullDuplicate else "partial" ), provider_id=alert.providerId, provider_type=alert.providerType, ) # we don't need to check the other rules break else: # create none deduplication event, for statistics create_deduplication_event( tenant_id=self.tenant_id, deduplication_rule_id=rule.id, deduplication_type="none", provider_id=alert.providerId, provider_type=alert.providerType, ) return alert def _remove_field(self, field, alert: AlertDto) -> AlertDto: alert = copy.deepcopy(alert) field_parts = field.split(".") if len(field_parts) == 1: try: delattr(alert, field) except AttributeError: self.logger.warning(f"Failed to delete attribute {field} from alert") else: alert_attr = field_parts[0] d = copy.deepcopy(getattr(alert, alert_attr)) for part in field_parts[1:-1]: d = d[part] del d[field_parts[-1]] setattr(alert, field_parts[0], d) return alert def get_deduplication_rules( self, tenant_id, provider_id, provider_type ) -> list[DeduplicationRuleDto]: # if not provider_type, force it to be "keep" so custom deduplication rule can be used if not provider_type: provider_type = "keep" # try 
to get the rule from the database rule = ( get_custom_deduplication_rule(tenant_id, provider_id, provider_type) if AlertDeduplicator.CUSTOM_DEDUPLICATION_DISTRIBUTION_ENABLED else None ) if not rule: self.logger.debug( "No custom deduplication rule found, using deafult full deduplication rule", extra={ "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) rule = self._get_default_full_deduplication_rule(provider_id, provider_type) return [rule] # else, return the custom rules self.logger.debug( "Using custom deduplication rules", extra={ "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) # if full deduplication rule found, return the rules if rule.full_deduplication: return [rule] # if not, assign them the default full deduplication rule ignore fields self.logger.info( "No full deduplication rule found, assigning default full deduplication rule ignore fields" ) default_full_dedup_rule = self._get_default_full_deduplication_rule( provider_id=provider_id, provider_type=provider_type ) rule.ignore_fields = default_full_dedup_rule.ignore_fields return [rule] def _generate_uuid(self, provider_id, provider_type): # this is a way to generate a unique uuid for the default deduplication rule per (provider_id, provider_type) namespace_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, "keephq.dev") # this is a workaround for this - https://github.com/keephq/keep/issues/4273 if not provider_id and provider_type and provider_type.lower() == "keep": provider_type = None generated_uuid = str( uuid.uuid5(namespace_uuid, f"{provider_id}_{provider_type}") ) return generated_uuid def _get_default_full_deduplication_rule( self, provider_id, provider_type ) -> DeduplicationRuleDto: # this is a way to generate a unique uuid for the default deduplication rule per (provider_id, provider_type) generated_uuid = self._generate_uuid(provider_id, provider_type) # just return a default deduplication rule with lastReceived field if not 
provider_type: provider_type = "keep" return DeduplicationRuleDto( id=generated_uuid, name=f"{provider_type} default deduplication rule", description=f"{provider_type} default deduplication rule", default=True, distribution=[{"hour": i, "number": 0} for i in range(24)], fingerprint_fields=[], # ["fingerprint"], # this is fallback provider_type=provider_type or "keep", provider_id=provider_id, full_deduplication=True, ignore_fields=["lastReceived"], priority=0, last_updated=None, last_updated_by=None, created_at=None, created_by=None, ingested=0, dedup_ratio=0.0, enabled=True, is_provisioned=False, ) def get_deduplications(self) -> list[DeduplicationRuleDto]: # get all providers installed_providers = ProvidersFactory.get_installed_providers(self.tenant_id) installed_providers = [ provider for provider in installed_providers if "alert" in provider.tags ] # get all linked providers linked_providers = ProvidersFactory.get_linked_providers(self.tenant_id) providers = [*installed_providers, *linked_providers] # get default deduplication rules default_deduplications = ProvidersFactory.get_default_deduplication_rules() default_deduplications_dict = { dd.provider_type: dd for dd in default_deduplications } for dd in default_deduplications: provider_id, provider_type = dd.provider_id, dd.provider_type dd.id = self._generate_uuid(provider_id, provider_type) # get custom deduplication rules custom_deduplications = ( get_all_deduplication_rules(self.tenant_id) if AlertDeduplicator.CUSTOM_DEDUPLICATION_DISTRIBUTION_ENABLED else [] ) # cast to dto custom_deduplications_dto = [ DeduplicationRuleDto( id=str(rule.id), name=rule.name, description=rule.description, default=False, distribution=[{"hour": i, "number": 0} for i in range(24)], fingerprint_fields=rule.fingerprint_fields, provider_type=rule.provider_type, provider_id=rule.provider_id, full_deduplication=rule.full_deduplication, ignore_fields=rule.ignore_fields, priority=rule.priority, last_updated=str(rule.last_updated), 
last_updated_by=rule.last_updated_by, created_at=str(rule.created_at), created_by=rule.created_by, ingested=0, dedup_ratio=0.0, enabled=rule.enabled, is_provisioned=rule.is_provisioned, ) for rule in custom_deduplications ] custom_deduplications_dict = {} for rule in custom_deduplications_dto: key = f"{rule.provider_type}_{rule.provider_id}" # for linked providers without an id ("main") if "null" in key: key = key.replace("null", "None") if key not in custom_deduplications_dict: custom_deduplications_dict[key] = [] custom_deduplications_dict[key].append(rule) # get the "catch all" full deduplication rule catch_all_full_deduplication = self._get_default_full_deduplication_rule( provider_id=None, provider_type=None ) # calculate the deduplciations # if a provider has custom deduplication rule, use it # else, use the default deduplication rule of the provider if "keep_None" in custom_deduplications_dict: self.logger.info( "Using custom deduplication rule for default deduplication rule", extra={ "tenant_id": self.tenant_id, }, ) final_deduplications = custom_deduplications_dict["keep_None"] else: final_deduplications = [catch_all_full_deduplication] for provider in providers: # if the provider doesn't have a deduplication rule, use the default one key = f"{provider.type}_{provider.id}" if key not in custom_deduplications_dict: # no default deduplication rule found [if provider doesn't have FINGERPRINT_FIELDS] if provider.type not in default_deduplications_dict: self.logger.warning( f"Provider {provider.type} does not have a default deduplication" ) continue # create a copy of the default deduplication rule default_deduplication = copy.deepcopy( default_deduplications_dict[provider.type] ) default_deduplication.id = self._generate_uuid( provider.id, provider.type ) # copy the provider id to the description if provider.id: default_deduplication.description = ( f"{default_deduplication.description} - {provider.id}" ) default_deduplication.provider_id = provider.id # set 
the provider type final_deduplications.append(default_deduplication) # else, just use the custom deduplication rule else: final_deduplications += custom_deduplications_dict[key] # now calculate some statistics # alerts_by_provider_stats = get_all_alerts_by_providers(self.tenant_id) deduplication_stats = get_all_deduplication_stats(self.tenant_id) result = [] for dedup in final_deduplications: self.logger.debug( "Calculating deduplication stats", extra={ "deduplication_rule_id": dedup.id, "tenant_id": self.tenant_id, "deduplication_stats": deduplication_stats, }, ) key = dedup.id full_dedup = deduplication_stats.get(key, {"full_dedup_count": 0}).get( "full_dedup_count", 0 ) partial_dedup = deduplication_stats.get( key, {"partial_dedup_count": 0} ).get("partial_dedup_count", 0) none_dedup = deduplication_stats.get(key, {"none_dedup_count": 0}).get( "none_dedup_count", 0 ) dedup.ingested = full_dedup + partial_dedup + none_dedup # total dedup count is the sum of full and partial dedup count dedup_count = full_dedup + partial_dedup if dedup.ingested == 0: dedup.dedup_ratio = 0.0 # this shouldn't happen, only in backward compatibility or some bug that dedup events are not created elif key not in deduplication_stats: self.logger.warning(f"Provider {key} does not have deduplication stats") dedup.dedup_ratio = 0.0 elif dedup_count == 0: dedup.dedup_ratio = 0.0 else: dedup.dedup_ratio = (dedup_count / dedup.ingested) * 100 dedup.distribution = deduplication_stats[key].get( "alerts_last_24_hours" ) result.append(dedup) if AlertDeduplicator.DEDUPLICATION_DISTRIBUTION_ENABLED: for dedup in result: for pd, stats in deduplication_stats.items(): if pd == f"{dedup.provider_id}_{dedup.provider_type}": distribution = stats.get("alert_last_24_hours") dedup.distribution = distribution break # sort providers to have enabled first result = sorted(result, key=lambda x: x.default, reverse=True) # if the default is empty, remove it if len(result) == 1 and result[0].ingested == 0: # empty 
def get_deduplication_fields(self) -> dict[str, list[str]]:
    """
    Group the tenant's alert field names by provider.

    Returns:
        dict[str, list[str]]: Maps "<provider_type>_<provider_id>" to the field
        names returned by get_alerts_fields() for that provider, in the order
        they were returned. A missing or empty provider type / id is written as
        the literal string "null" in the key.
    """
    fields_per_provider: dict[str, list[str]] = {}
    for field in get_alerts_fields(self.tenant_id):
        # falsy values (None, "") collapse to "null" so the key is always well-formed
        provider_type = field.provider_type or "null"
        provider_id = field.provider_id or "null"
        fields_per_provider.setdefault(f"{provider_type}_{provider_id}", []).append(
            field.field_name
        )
    return fields_per_provider
def delete_deduplication_rule(self, rule_id: str) -> bool:
    """
    Remove one of this tenant's deduplication rules.

    Args:
        rule_id (str): Identifier of the rule to remove.

    Returns:
        bool: The result reported by the db-level delete (True when the rule
        was deleted, False otherwise).

    Raises:
        HTTPException 404: No rule with this id exists for the tenant.
        HTTPException 409: The rule is provisioned and must not be deleted.
    """
    existing_rule = get_deduplication_rule_by_id(self.tenant_id, rule_id)

    # guard clauses: unknown rule first, then the provisioned (read-only) case
    if not existing_rule:
        raise HTTPException(
            status_code=404,
            detail="Deduplication rule not found",
        )
    if existing_rule.is_provisioned:
        raise HTTPException(
            status_code=409,
            detail="Provisioned deduplication rule cannot be deleted",
        )

    # hand over to the db function that performs the actual delete
    return delete_deduplication_rule(rule_id=rule_id, tenant_id=self.tenant_id)
""" enrich_with_providers_info(deduplication_rules, tenant_id) all_deduplication_rules_from_db = db.get_all_deduplication_rules(tenant_id) provisioned_deduplication_rules = [ rule for rule in all_deduplication_rules_from_db if rule.is_provisioned ] provisioned_deduplication_rules_from_db_dict = { rule.name: rule for rule in provisioned_deduplication_rules } actor = "system" # delete rules that are not in the env for provisioned_deduplication_rule in provisioned_deduplication_rules: if str(provisioned_deduplication_rule.name) not in deduplication_rules: logger.info( "Deduplication rule with name '%s' is not in the env, deleting from DB", provisioned_deduplication_rule.name, ) db.delete_deduplication_rule( rule_id=str(provisioned_deduplication_rule.id), tenant_id=tenant_id ) for ( deduplication_rule_name, deduplication_rule_to_provision, ) in deduplication_rules.items(): if deduplication_rule_name in provisioned_deduplication_rules_from_db_dict: logger.info( "Deduplication rule with name '%s' already exists, updating in DB", deduplication_rule_name, ) db.update_deduplication_rule( tenant_id=tenant_id, rule_id=str( provisioned_deduplication_rules_from_db_dict.get( deduplication_rule_name ).id ), name=deduplication_rule_name, description=deduplication_rule_to_provision.get("description", ""), provider_id=deduplication_rule_to_provision.get("provider_id"), provider_type=deduplication_rule_to_provision["provider_type"], last_updated_by=actor, enabled=True, fingerprint_fields=deduplication_rule_to_provision.get( "fingerprint_fields", [] ), full_deduplication=deduplication_rule_to_provision.get( "full_deduplication", False ), ignore_fields=deduplication_rule_to_provision.get("ignore_fields") or [], priority=0, ) continue logger.info( "Deduplication rule with name '%s' does not exist, creating in DB", deduplication_rule_name, ) db.create_deduplication_rule( tenant_id=tenant_id, name=deduplication_rule_name, description=deduplication_rule_to_provision.get("description", ""), 
def enrich_with_providers_info(deduplication_rules: dict[str, any], tenant_id: str):
    """
    Enriches passed deduplication rules with provider ID and type information.

    Each rule's "provider_name" is looked up among the tenant's installed
    providers (matched on the provider's details["name"]) and the rule dict is
    updated in place with "provider_id" and "provider_type".

    Args:
        deduplication_rules (dict[str, any]): Rule name -> rule config dict. The
            dicts are mutated in place.
        tenant_id (str): The ID of the tenant for which deduplication rules are
            being provisioned.

    Raises:
        ValueError: If a rule refers to a provider name that is not installed
            for the tenant.
    """
    installed_providers = ProvidersFactory.get_installed_providers(tenant_id)
    installed_providers_dict = {
        provider.details.get("name"): provider for provider in installed_providers
    }

    for rule_name, rule in deduplication_rules.items():
        logger.info(f"Enriching deduplication rule: {rule_name}")
        provider_name = rule.get("provider_name")
        provider = installed_providers_dict.get(provider_name)
        if provider is None:
            # fail with an actionable message instead of an AttributeError on None
            raise ValueError(
                f"Deduplication rule '{rule_name}' refers to provider "
                f"'{provider_name}' which is not installed for tenant {tenant_id}"
            )
        rule["provider_id"] = provider.id
        rule["provider_type"] = provider.type
""" env_var_key = "KEEP_PROVIDERS" deduplication_rules_from_env_var = config(key=env_var_key, default=None) if not deduplication_rules_from_env_var: return None # check if env var is absolute or relative path to a deduplication rules json file if re.compile(r"^(\/|\.\/|\.\.\/).*\.json$").match( deduplication_rules_from_env_var ): with open( file=deduplication_rules_from_env_var, mode="r", encoding="utf8" ) as file: try: deduplication_rules_from_env_json: dict = json.loads(file.read()) except json.JSONDecodeError as e: raise Exception( f"Error parsing deduplication rules from file {deduplication_rules_from_env_var}: {e}" ) from e else: try: deduplication_rules_from_env_json = json.loads( deduplication_rules_from_env_var ) except json.JSONDecodeError as e: raise Exception( f"Error parsing deduplication rules from env var {env_var_key}: {e}" ) from e deduplication_rules_dict: dict[str, dict] = {} for provider_name, provider_config in deduplication_rules_from_env_json.items(): for rule_name, rule_config in provider_config.get( "deduplication_rules", {} ).items(): rule_config["name"] = rule_name rule_config["provider_name"] = provider_name rule_config["provider_type"] = provider_config.get("type") deduplication_rules_dict[rule_name] = rule_config if not deduplication_rules_dict: return None return deduplication_rules_dict ================================================ FILE: keep/api/api.py ================================================ import asyncio import logging import os import time from contextlib import asynccontextmanager from functools import wraps from importlib import metadata from typing import Awaitable, Callable from arq import ArqRedis import requests import uvicorn from dotenv import find_dotenv, load_dotenv from fastapi import FastAPI, Request from fastapi.middleware.gzip import GZipMiddleware from fastapi.responses import JSONResponse from prometheus_fastapi_instrumentator import Instrumentator from slowapi import _rate_limit_exceeded_handler from 
slowapi.errors import RateLimitExceeded from slowapi.middleware import SlowAPIMiddleware from starlette.middleware.cors import CORSMiddleware from starlette_context import plugins from starlette_context.middleware import RawContextMiddleware from keep.api.arq_pool import get_pool import keep.api.logging import keep.api.observability from keep.api.tasks import process_watcher_task import keep.api.utils.import_ee from keep.api.core.config import config from keep.api.core.db import dispose_session from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.api.core.limiter import limiter from keep.api.logging import CONFIG as logging_config from keep.api.middlewares import LoggingMiddleware from keep.api.routes import ( actions, ai, alerts, dashboard, deduplications, extraction, facets, cel, healthcheck, incidents, maintenance, mapping, metrics, preset, provider_images, providers, pusher, rules, settings, status, tags, topology, whoami, workflows, ) from keep.api.routes.auth import groups as auth_groups from keep.api.routes.auth import permissions, roles, users from keep.event_subscriber.event_subscriber import EventSubscriber from keep.identitymanager.identitymanagerfactory import ( IdentityManagerFactory, IdentityManagerTypes, ) from keep.topologies.topology_processor import TopologyProcessor from keep.api.consts import KEEP_ARQ_QUEUE_MAINTENANCE, MAINTENANCE_WINDOW_ALERT_STRATEGY, REDIS # load all providers into cache from keep.workflowmanager.workflowmanager import WorkflowManager load_dotenv(find_dotenv()) keep.api.logging.setup_logging() logger = logging.getLogger(__name__) HOST = config("KEEP_HOST", default="0.0.0.0") PORT = config("PORT", default=8080, cast=int) SCHEDULER = config("SCHEDULER", default="true", cast=bool) CONSUMER = config("CONSUMER", default="true", cast=bool) TOPOLOGY = config("KEEP_TOPOLOGY_PROCESSOR", default="false", cast=bool) WATCHER = config("WATCHER", default="false", cast=bool) KEEP_DEBUG_TASKS = config("KEEP_DEBUG_TASKS", 
# Strong references to fire-and-forget tasks created by startup(). The event loop
# only keeps weak references to tasks, so a task nobody references may be garbage
# collected before it finishes; each task removes itself from the set when done.
_STARTUP_TASKS: set = set()


async def startup():
    """
    This runs for every worker on startup.
    Read more about lifespan here: https://fastapi.tiangolo.com/advanced/events/#lifespan

    Disposes existing DB connections, then starts the services enabled through
    the module-level flags (SCHEDULER, CONSUMER, TOPOLOGY, WATCHER /
    MAINTENANCE_WINDOWS). A failure to start one service is logged and does not
    prevent the remaining services from starting.
    """
    logger.info("Disope existing DB connections")
    # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
    # https://stackoverflow.com/questions/43944787/sqlalchemy-celery-with-scoped-session-error/54751019#54751019
    dispose_session()

    logger.info("Starting the services")

    # Start the scheduler
    if SCHEDULER:
        try:
            logger.info("Starting the scheduler")
            wf_manager = WorkflowManager.get_instance()
            await wf_manager.start()
            logger.info("Scheduler started successfully")
        except Exception:
            logger.exception("Failed to start the scheduler")

    # Start the consumer
    if CONSUMER:
        try:
            logger.info("Starting the consumer")
            event_subscriber = EventSubscriber.get_instance()
            # TODO: there is some "race condition" since if the consumer starts before the server,
            #       and start getting events, it will fail since the server is not ready yet
            #       we should add a "wait" here to make sure the server is ready
            await event_subscriber.start()
            logger.info("Consumer started successfully")
        except Exception:
            logger.exception("Failed to start the consumer")

    # Start the topology processor
    if TOPOLOGY:
        try:
            logger.info("Starting the topology processor")
            topology_processor = TopologyProcessor.get_instance()
            await topology_processor.start()
            logger.info("Topology processor started successfully")
        except Exception:
            logger.exception("Failed to start the topology processor")

    # Start the watcher: through the ARQ queue when Redis is configured,
    # otherwise as an in-process asyncio task
    if WATCHER or (
        MAINTENANCE_WINDOWS
        and MAINTENANCE_WINDOW_ALERT_STRATEGY == "recover_previous_status"
    ):
        if REDIS:
            try:
                logger.info("Starting the watcher process")
                redis: ArqRedis = await get_pool()
                job = await redis.enqueue_job(
                    "async_process_watcher",
                    _queue_name=KEEP_ARQ_QUEUE_MAINTENANCE,
                )
                logger.info(
                    "Enqueued job",
                    extra={
                        # enqueue_job may return None instead of a Job
                        "job_id": job.job_id if job is not None else None,
                        "queue": KEEP_ARQ_QUEUE_MAINTENANCE,
                    },
                )
            except Exception:
                logger.exception("Failed to start the maintenance windows")
        else:
            watcher_task = asyncio.create_task(
                process_watcher_task.async_process_watcher()
            )
            # keep the task alive until it completes (see _STARTUP_TASKS)
            _STARTUP_TASKS.add(watcher_task)
            watcher_task.add_done_callback(_STARTUP_TASKS.discard)
            logger.info(
                "Added task",
                extra={
                    "task": "task",
                },
            )
    logger.info("Services started successfully")
def get_app(
    auth_type: IdentityManagerTypes = IdentityManagerTypes.NOAUTH.value,
) -> FastAPI:
    """
    Build and configure the Keep FastAPI application.

    Sets KEEP_API_URL in the environment when it is not configured, creates the
    app, registers middlewares, routers, the identity manager's endpoints and
    the catch-all exception handler, and optionally instruments metrics,
    OpenTelemetry and middleware timing.

    Args:
        auth_type: Kept for backward compatibility. It is not read by this
            function; the identity manager is created from the module-level
            AUTH_TYPE.

    Returns:
        FastAPI: The configured application.
    """
    keep_api_url = config("KEEP_API_URL", default=None)
    if not keep_api_url:
        keep_api_url = f"http://{HOST}:{PORT}"
        logger.info(
            "KEEP_API_URL is not set, setting it to default",
            extra={"keep_api_url": keep_api_url},
        )
        os.environ["KEEP_API_URL"] = keep_api_url
    logger.info(
        f"Starting Keep with {keep_api_url} as URL and version {KEEP_VERSION}",
        extra={
            "keep_version": KEEP_VERSION,
            # the resolved URL, so the default is reported instead of None
            "keep_api_url": keep_api_url,
        },
    )

    app = FastAPI(
        title="Keep API",
        description="Rest API powering https://platform.keephq.dev and friends 🏄‍♀️",
        version=KEEP_VERSION,
        lifespan=lifespan,
    )

    @app.get("/", include_in_schema=False)
    async def root():
        """
        App description and version.
        """
        return {"message": app.description, "version": KEEP_VERSION}

    app.add_middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),))
    app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
    app.add_middleware(
        GZipMiddleware, minimum_size=30 * 1024 * 1024
    )  # Approximately 30 MiB, https://cloud.google.com/run/quotas
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    app.include_router(providers.router, prefix="/providers", tags=["providers"])
    app.include_router(actions.router, prefix="/actions", tags=["actions"])
    app.include_router(ai.router, prefix="/ai", tags=["ai"])
    app.include_router(healthcheck.router, prefix="/healthcheck", tags=["healthcheck"])
    app.include_router(alerts.router, prefix="/alerts", tags=["alerts"])
    app.include_router(incidents.router, prefix="/incidents", tags=["incidents"])
    app.include_router(settings.router, prefix="/settings", tags=["settings"])
    app.include_router(
        workflows.router, prefix="/workflows", tags=["workflows", "alerts"]
    )
    app.include_router(whoami.router, prefix="/whoami", tags=["whoami"])
    app.include_router(pusher.router, prefix="/pusher", tags=["pusher"])
    app.include_router(status.router, prefix="/status", tags=["status"])
    app.include_router(rules.router, prefix="/rules", tags=["rules"])
    app.include_router(preset.router, prefix="/preset", tags=["preset"])
    app.include_router(
        mapping.router, prefix="/mapping", tags=["enrichment", "mapping"]
    )
    app.include_router(
        auth_groups.router, prefix="/auth/groups", tags=["auth", "groups"]
    )
    app.include_router(
        permissions.router, prefix="/auth/permissions", tags=["auth", "permissions"]
    )
    app.include_router(roles.router, prefix="/auth/roles", tags=["auth", "roles"])
    app.include_router(users.router, prefix="/auth/users", tags=["auth", "users"])
    app.include_router(metrics.router, prefix="/metrics", tags=["metrics"])
    app.include_router(
        extraction.router, prefix="/extraction", tags=["enrichment", "extraction"]
    )
    app.include_router(dashboard.router, prefix="/dashboard", tags=["dashboard"])
    app.include_router(tags.router, prefix="/tags", tags=["tags"])
    app.include_router(maintenance.router, prefix="/maintenance", tags=["maintenance"])
    app.include_router(topology.router, prefix="/topology", tags=["topology"])
    app.include_router(
        deduplications.router, prefix="/deduplications", tags=["deduplications"]
    )
    # registered exactly once: including the same router twice duplicates its routes
    app.include_router(facets.router, prefix="/{entity_name}/facets", tags=["facets"])
    app.include_router(cel.router, prefix="/cel", tags=["cel"])
    app.include_router(
        provider_images.router, prefix="/provider-images", tags=["provider-images"]
    )
    # if its single tenant with authentication, add signin endpoint
    logger.info(f"Starting Keep with authentication type: {AUTH_TYPE}")
    # If we run Keep with SINGLE_TENANT auth type, we want to add the signin endpoint
    identity_manager = IdentityManagerFactory.get_identity_manager(
        SINGLE_TENANT_UUID, None, AUTH_TYPE
    )
    # if any endpoints needed, add them on_start
    identity_manager.on_start(app)

    @app.exception_handler(Exception)
    async def catch_exception(request: Request, exc: Exception):
        # trace_id / tenant_id are not set in this function (presumably by a
        # middleware - confirm); default to None so the handler itself cannot
        # raise when the failure happened before they were populated
        trace_id = getattr(request.state, "trace_id", None)
        tenant_id = getattr(request.state, "tenant_id", None)
        logger.error(
            f"An unhandled exception occurred: {exc}, Trace ID: {trace_id}. Tenant ID: {tenant_id}"
        )
        return JSONResponse(
            status_code=500,
            content={
                "message": "An internal server error occurred.",
                "trace_id": trace_id,
                "error_msg": str(exc),
            },
        )

    app.add_middleware(LoggingMiddleware)

    if KEEP_USE_LIMITER:
        app.add_middleware(SlowAPIMiddleware)

    if config("KEEP_METRICS", default="true", cast=bool):
        Instrumentator(
            excluded_handlers=["/metrics", "/metrics/processing"],
            should_group_status_codes=False,
        ).instrument(app=app, metric_namespace="keep")

    if config("KEEP_OTEL_ENABLED", default="true", cast=bool):
        keep.api.observability.setup(app)

    # if debug middlewares are enabled, instrument them
    if KEEP_DEBUG_MIDDLEWARES:
        logger.info("Instrumenting middlewares")
        app = instrument_middleware(app)
        logger.info("Instrumented middlewares")
    return app
def wrap_call(middleware_cls, original_call): # if the call is already wrapped, return it if hasattr(original_call, "_timing_wrapped"): return original_call @wraps(original_call) async def timed_call( self, scope: dict, receive: Callable[[], Awaitable[dict]], send: Callable[[dict], Awaitable[None]], ): if scope["type"] != "http": return await original_call(self, scope, receive, send) start_time = time.time() try: response = await original_call(self, scope, receive, send) return response finally: process_time = (time.time() - start_time) * 1000 path = scope.get("path", "") method = scope.get("method", "") middleware_name = self.__class__.__name__ logger.info( f"⏱️ {middleware_name:<40} {method} {path} took {process_time:>8.2f}ms" ) timed_call._timing_wrapped = True return timed_call def instrument_middleware(app): # Get middleware from FastAPI app for middleware in app.user_middleware: if hasattr(middleware.cls, "__call__"): original_call = middleware.cls.__call__ middleware.cls.__call__ = wraps(original_call)( wrap_call(middleware.cls, original_call) ) return app def run(app: FastAPI): logger.info("Starting the uvicorn server") # call on starting to create the db and tables import keep.api.config keep.api.config.on_starting() uvicorn.run( "keep.api.api:get_app", host=HOST, port=PORT, log_config=logging_config, lifespan="on", workers=config("KEEP_WORKERS", default=None, cast=int), limit_concurrency=config("KEEP_LIMIT_CONCURRENCY", default=None, cast=int), ) ================================================ FILE: keep/api/arq_pool.py ================================================ from arq import create_pool from keep.api.redis_settings import get_redis_settings async def get_pool(): """Create and return an ARQ Redis pool using shared Redis settings.""" return await create_pool(get_redis_settings()) ================================================ FILE: keep/api/arq_worker.py ================================================ import asyncio import functools import 
logging from concurrent.futures import ThreadPoolExecutor from typing import Optional from uuid import uuid4 import redis from arq import Worker, cron from arq.worker import create_worker from dotenv import find_dotenv, load_dotenv from pydantic.utils import import_string from starlette.datastructures import CommaSeparatedStrings import keep.api.logging from keep.api.consts import ( KEEP_ARQ_QUEUE_BASIC, KEEP_ARQ_TASK_POOL, KEEP_ARQ_TASK_POOL_ALL, KEEP_ARQ_TASK_POOL_BASIC_PROCESSING, WATCHER_LAPSED_TIME, ) from keep.api.core.config import config from keep.api.redis_settings import get_redis_settings from keep.api.tasks.process_event_task import process_event # Load environment variables load_dotenv(find_dotenv()) keep.api.logging.setup_logging() logger = logging.getLogger(__name__) # Current worker will pick up tasks only according to its execution pool: all_tasks_for_the_worker = [] if KEEP_ARQ_TASK_POOL in [KEEP_ARQ_TASK_POOL_ALL, KEEP_ARQ_TASK_POOL_BASIC_PROCESSING]: logger.info( "Enabling basic processing tasks for the worker", extra={"task_pool": KEEP_ARQ_TASK_POOL}, ) all_tasks_for_the_worker += [ ("keep.api.tasks.process_event_task.async_process_event", KEEP_ARQ_QUEUE_BASIC), ( "keep.api.tasks.process_topology_task.async_process_topology", KEEP_ARQ_QUEUE_BASIC, ), ( "keep.api.tasks.process_incident_task.async_process_incident", KEEP_ARQ_QUEUE_BASIC, ), ] ARQ_BACKGROUND_FUNCTIONS: Optional[CommaSeparatedStrings] = config( "ARQ_BACKGROUND_FUNCTIONS", cast=CommaSeparatedStrings, default=[task for task, _ in all_tasks_for_the_worker], ) FUNCTIONS: list = ( [ import_string(background_function) for background_function in list(ARQ_BACKGROUND_FUNCTIONS) ] if ARQ_BACKGROUND_FUNCTIONS is not None else list() ) async def process_event_in_worker( ctx, tenant_id, provider_type, provider_id, fingerprint, api_key_name, trace_id, event, notify_client=True, timestamp_forced=None, ): logger.info( "Processing event in worker", extra={ "tenant_id": tenant_id, "provider_type": 
provider_type, "provider_id": provider_id, "fingerprint": fingerprint, "tract_id": trace_id, }, ) # Create a new context that includes both the arq ctx and any other parameters process_event_func_sync = functools.partial( process_event, ctx=ctx, # Pass ctx as a named parameter tenant_id=tenant_id, provider_type=provider_type, provider_id=provider_id, fingerprint=fingerprint, api_key_name=api_key_name, trace_id=trace_id, event=event, notify_client=notify_client, timestamp_forced=timestamp_forced, ) loop = asyncio.get_running_loop() # run the function in the thread pool resp = await loop.run_in_executor(ctx["pool"], process_event_func_sync) logger.info( "Event processed in worker", extra={ "tenant_id": tenant_id, "provider_type": provider_type, "provider_id": provider_id, "fingerprint": fingerprint, "tract_id": trace_id, }, ) return resp FUNCTIONS.append(process_event_in_worker) async def startup(ctx): """ARQ worker startup callback""" EVENT_WORKERS = int(config("KEEP_EVENT_WORKERS", default=5, cast=int)) # Create dedicated threadpool process_event_executor = ThreadPoolExecutor( max_workers=EVENT_WORKERS, thread_name_prefix="process_event_worker" ) ctx["pool"] = process_event_executor async def shutdown(ctx): """ARQ worker shutdown callback""" # Clean up any resources if needed if "pool" in ctx: ctx["pool"].shutdown(wait=True) def at_every_x_minutes(x: int, start: int = 0, end: int = 59): """Helper function to generate cron-like minute intervals""" return {*list(range(start, end, x))} # Redis settings are now imported from shared module class WorkerSettings: """ Settings for the ARQ worker. 
""" on_startup = startup on_shutdown = shutdown redis_settings = get_redis_settings() timeout = 30 functions: list = FUNCTIONS cron_jobs: list = [cron("keep.api.tasks.process_watcher_task.async_process_watcher", second=max(0, WATCHER_LAPSED_TIME-1))] queue_name: str health_check_interval: int = 10 health_check_key: str def __init__(self, queue_name: str): self.queue_name = queue_name def get_arq_worker(queue_name: str) -> Worker: """ Create and configure an ARQ worker for the specified queue. Args: queue_name: The name of the queue to which the worker will listen Returns: A configured ARQ worker """ keep_result = config( "ARQ_KEEP_RESULT", cast=int, default=3600 ) # duration to keep job results for expires = config( "ARQ_EXPIRES", cast=int, default=3600 ) # the default length of time from when a job is expected to start after which the job expires, making it shorter to avoid clogging # generate a worker id so each worker will have a different health check key worker_id = str(uuid4()).replace("-", "") worker = create_worker( WorkerSettings, keep_result=keep_result, expires_extra_ms=expires, queue_name=queue_name, health_check_key=f"{queue_name}:{worker_id}:health-check", ) return worker async def safe_run_worker(worker: Worker, number_of_errors_before_restart=0): """ Run a worker with automatic reconnection in case of Redis connection errors. Args: worker: The ARQ worker to run """ try: number_of_errors = 0 while True: try: await worker.async_run() except asyncio.CancelledError: # pragma: no cover # happens on shutdown, fine pass except redis.exceptions.ConnectionError: number_of_errors += 1 # we want to raise an exception if we have too many errors if ( number_of_errors_before_restart and number_of_errors >= number_of_errors_before_restart ): logger.error( f"Worker encountered {number_of_errors} errors, restarting..." ) raise logger.exception("Failed to connect to Redis... 
Retry in 3 seconds")
                await asyncio.sleep(3)
                continue
            except Exception:
                number_of_errors += 1
                # we want to raise an exception if we have too many errors
                if (
                    number_of_errors_before_restart
                    and number_of_errors >= number_of_errors_before_restart
                ):
                    logger.error(
                        f"Worker encountered {number_of_errors} errors, restarting..."
                    )
                    raise
                # o.w: log the error and continue
                logger.exception("Worker error")
                await asyncio.sleep(3)
                continue

            # Reached only when async_run() returned normally or was cancelled.
            break
    finally:
        # Always release the worker, whichever way the loop ended.
        await worker.close()


================================================
FILE: keep/api/arq_worker_debug_patch.py
================================================
import functools
import logging
import time
from typing import Optional

from arq.worker import Worker

# Set up detailed logging
logging_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(level=logging.DEBUG, format=logging_format)
debug_logger = logging.getLogger("arq.debug")

# Original methods we'll patch (captured at import time, before any patching)
original_run_job = Worker.run_job
original_finish_job = Worker.finish_job
original_start_jobs = Worker.start_jobs

# Tracking in-progress jobs for additional context
# job_id -> {"start_time", "score", "attempts"}; filled and cleared by patched_run_job
in_progress_jobs = {}


def log_function_call(func):
    """Decorate an async ``Worker`` method with enter/exit/error logging.

    The first positional argument (if any) is logged as the job id.
    Exceptions are logged and re-raised unchanged.
    """

    @functools.wraps(func)
    async def wrapper(self, *args, **kwargs):
        job_id = args[0] if args else None
        debug_logger.info(f"ENTER: {func.__name__} for job {job_id}")

        # Log arguments
        debug_logger.debug(f"ARGS: {func.__name__} - {args}")
        debug_logger.debug(f"KWARGS: {func.__name__} - {kwargs}")

        start_time = time.time()
        try:
            result = await func(self, *args, **kwargs)
            debug_logger.info(
                f"EXIT: {func.__name__} for job {job_id} in {time.time() - start_time:.4f}s"
            )
            return result
        except Exception as e:
            debug_logger.exception(f"ERROR in {func.__name__} for job {job_id}: {e}")
            raise

    return wrapper


# Patch run_job method to add extensive logging
async def patched_run_job(self, job_id: str, score: int) -> None:
    """Replacement for ``Worker.run_job`` that logs Redis state around the job.

    All Redis look-ups are best effort: failures are logged as warnings and
    never stop the original ``run_job`` from executing.
    """
    debug_logger.info(f"🔍 JOB START: {job_id} with score {score}")

    # Record job start time and info
    in_progress_jobs[job_id] = {
        "start_time": time.time(),
        "score": score,
        "attempts": 0,
    }

    # Get redis retry counter
    retry_key = "arq:retry:" + job_id
    try:
        retry_count = await self.pool.get(retry_key)
        debug_logger.info(f"🔢 Current retry count for {job_id}: {retry_count}")
    except Exception as e:
        debug_logger.warning(f"Could not get retry count for {job_id}: {e}")

    # Log any existing in-progress markers
    in_progress_key = "arq:in-progress:" + job_id
    try:
        in_progress_exists = await self.pool.exists(in_progress_key)
        debug_logger.info(
            f"🏁 In-progress key exists for {job_id}: {in_progress_exists}"
        )
        if in_progress_exists:
            ttl = await self.pool.pttl(in_progress_key)
            debug_logger.info(f"⏱️ In-progress TTL for {job_id}: {ttl}ms")
    except Exception as e:
        debug_logger.warning(f"Could not check in-progress for {job_id}: {e}")

    # Run the original method
    try:
        await original_run_job(self, job_id, score)
    finally:
        # Log the duration and drop the tracking entry even if the job raised.
        if job_id in in_progress_jobs:
            duration = time.time() - in_progress_jobs[job_id]["start_time"]
            debug_logger.info(f"🏁 JOB END: {job_id} took {duration:.4f}s")
            in_progress_jobs.pop(job_id, None)


# Patch finish_job to track job completion
async def patched_finish_job(
    self,
    job_id: str,
    finish: bool,
    result_data: Optional[bytes],
    result_timeout_s: Optional[float],
    keep_result_forever: bool,
    incr_score: Optional[int],
    keep_in_progress: Optional[float],
) -> None:
    """Replacement for ``Worker.finish_job`` that logs Redis state before and after.

    All arguments are forwarded unchanged to the original ``finish_job``.
    """
    debug_logger.info(
        f"💾 FINISH JOB {job_id}: finish={finish}, incr_score={incr_score}, "
        f"keep_in_progress={keep_in_progress}"
    )

    # Inspect transaction before it happens
    in_progress_key = "arq:in-progress:" + job_id
    retry_key = "arq:retry:" + job_id
    queue_key = self.queue_name

    # Log Redis state before transaction
    debug_logger.info(f"📊 REDIS STATE BEFORE FINISH for {job_id}:")
    try:
        exists_progress = await self.pool.exists(in_progress_key)
        exists_retry = await self.pool.exists(retry_key)
        job_in_queue = await self.pool.zscore(queue_key, job_id)

        debug_logger.info(f" - In-progress exists: {exists_progress}")
        debug_logger.info(f" - Retry key exists: 
{exists_retry}") debug_logger.info(f" - Job in queue score: {job_in_queue}") if exists_retry: retry_value = await self.pool.get(retry_key) debug_logger.info(f" - Retry count: {retry_value}") except Exception as e: debug_logger.exception(f"Error checking Redis state: {e}") try: await original_finish_job( self, job_id, finish, result_data, result_timeout_s, keep_result_forever, incr_score, keep_in_progress, ) finally: # Log Redis state after transaction debug_logger.info(f"📊 REDIS STATE AFTER FINISH for {job_id}:") try: exists_progress = await self.pool.exists(in_progress_key) exists_retry = await self.pool.exists(retry_key) job_in_queue = await self.pool.zscore(queue_key, job_id) debug_logger.info(f" - In-progress exists: {exists_progress}") debug_logger.info(f" - Retry key exists: {exists_retry}") debug_logger.info(f" - Job in queue score: {job_in_queue}") except Exception as e: debug_logger.exception(f"Error checking Redis state: {e}") # Patch start_jobs to monitor job pickup async def patched_start_jobs(self, job_ids: list) -> None: if job_ids: debug_logger.info(f"🔍 STARTING JOBS: Found {len(job_ids)} jobs to process") for job_id_bytes in job_ids: job_id = job_id_bytes.decode() debug_logger.info(f"🔍 JOB PICKUP: {job_id}") await original_start_jobs(self, job_ids) # Patch the pipeline to capture Redis watch errors original_pipeline_execute = None async def patched_pipeline_execute(self, *args, **kwargs): try: result = await original_pipeline_execute(self, *args, **kwargs) debug_logger.debug(f"Pipeline executed successfully: {result}") return result except Exception as e: debug_logger.warning(f"Pipeline execution failed: {e}") debug_logger.warning(f"Pipeline commands: {self.command_stack}") raise # Apply the patches def apply_arq_debug_patches(): debug_logger.info("🛠️ Applying ARQ debug patches") # Apply basic logging to key methods for method_name in ["_poll_iteration", "heart_beat", "main"]: original = getattr(Worker, method_name) setattr(Worker, method_name, 
log_function_call(original)) # Apply our custom patches Worker.run_job = patched_run_job Worker.finish_job = patched_finish_job Worker.start_jobs = patched_start_jobs # Patch the Redis pipeline when the worker starts up original_main = Worker.main async def patched_main(self): global original_pipeline_execute # Now we can safely patch the pipeline execute method from redis import asyncio as aioredis pipeline_cls = aioredis.client.Pipeline original_pipeline_execute = pipeline_cls.execute pipeline_cls.execute = patched_pipeline_execute # Add patches for watch errors original_watch = aioredis.client.Redis.watch async def patched_watch(self, *keys): debug_logger.info(f"👀 REDIS WATCH: watching keys {keys}") return await original_watch(self, *keys) aioredis.client.Redis.watch = patched_watch debug_logger.info("✅ Redis pipeline and watch methods patched") # Call the original main method return await original_main(self) Worker.main = patched_main debug_logger.info("✅ ARQ debug patches applied") # Patch process_event_task.py to track possible Retry exceptions def patch_process_event(): try: from keep.api.tasks.process_event_task import process_event original_process_event = process_event def patched_process_event(*args, **kwargs): debug_logger.info( f"🔄 PROCESS_EVENT called with args={args}, kwargs={kwargs}" ) try: result = original_process_event(*args, **kwargs) debug_logger.info(f"✅ PROCESS_EVENT completed successfully: {result}") return result except Exception as e: debug_logger.exception(f"❌ PROCESS_EVENT failed: {e}") raise from keep.api.tasks import process_event_task process_event_task.process_event = patched_process_event debug_logger.info("✅ Patched process_event function") except ImportError: debug_logger.warning("⚠️ Could not patch process_event (import failed)") # Add a helper function to dump Redis state for a job async def dump_job_state(redis_pool, job_id: str): """Dump all Redis keys related to a specific job""" debug_logger.info(f"📊 DUMPING STATE FOR JOB 
{job_id}") # Define key prefixes prefixes = [ "arq:job:", "arq:result:", "arq:retry:", "arq:in-progress:", "arq:abort-jobs", ] # Check queue queues = await redis_pool.keys("arq:queue:*") for queue in queues: score = await redis_pool.zscore(queue, job_id) if score: debug_logger.info(f"Job {job_id} found in queue {queue} with score {score}") # Check all relevant keys for prefix in prefixes: key = prefix + job_id exists = await redis_pool.exists(key) if exists: value = await redis_pool.get(key) debug_logger.info(f"Key {key} exists with value: {value}") ttl = await redis_pool.ttl(key) debug_logger.info(f"TTL for {key}: {ttl}s") ================================================ FILE: keep/api/arq_worker_gunicorn.py ================================================ import asyncio import logging import os import signal import sys import threading import time from dotenv import find_dotenv, load_dotenv from fastapi import FastAPI from fastapi.responses import JSONResponse from gunicorn.workers.base import Worker import keep.api.logging import keep.api.observability from keep.api.arq_worker import get_arq_worker, safe_run_worker from keep.api.consts import ( KEEP_ARQ_QUEUE_BASIC, KEEP_ARQ_TASK_POOL, KEEP_ARQ_TASK_POOL_ALL, KEEP_ARQ_TASK_POOL_BASIC_PROCESSING, ) from keep.api.core.config import config from keep.api.core.db import dispose_session from keep.workflowmanager.workflowmanager import WorkflowManager # Load environment variables load_dotenv(find_dotenv()) keep.api.logging.setup_logging() logger = logging.getLogger(__name__) def determine_queue_name(): """Determine the queue name based on task pool configuration""" # this is the same behavior as in the original arq_worker.py # but from some reason if returns None so we "duplicate the logic here" if not KEEP_ARQ_TASK_POOL: return KEEP_ARQ_TASK_POOL_ALL elif KEEP_ARQ_TASK_POOL in [ KEEP_ARQ_TASK_POOL_ALL, KEEP_ARQ_TASK_POOL_BASIC_PROCESSING, ]: return KEEP_ARQ_QUEUE_BASIC else: raise ValueError(f"Invalid task pool: 
{KEEP_ARQ_TASK_POOL}")


async def run_arq_worker(worker_id, number_of_errors_before_restart=0):
    """Run an ARQ worker

    Optionally applies the debug patches (when LOG_LEVEL is "DEBUG"), starts
    the workflow manager and then runs the ARQ worker until it finishes.
    Fatal conditions end the whole process with ``os._exit(1)``.

    Args:
        worker_id: Identifier used in log messages only.
        number_of_errors_before_restart: Forwarded to ``safe_run_worker``.
    """
    logger.info(f"Starting ARQ Worker {worker_id} (PID: {os.getpid()})")

    try:
        queue_name = determine_queue_name()
    except ValueError as e:
        # gunicorn will restart the worker if it exits with a non-zero code
        logger.exception(f"Invalid task pool configuration: {e}")
        os._exit(1)

    if not queue_name:
        # let gunicorn restart the worker
        logger.info("No task pools configured to run - exiting")
        os._exit(1)

    # Apply debug patches if needed
    if config("LOG_LEVEL", default="INFO") == "DEBUG":
        logger.info("Applying ARQ debug patches")
        try:
            # Build the sibling module's import path from this module's package.
            module_name = __name__.rsplit(".", 1)[0] if "." in __name__ else ""
            import_path = (
                f"{module_name}.arq_worker_debug_patch"
                if module_name
                else "arq_worker_debug_patch"
            )
            debug_module = __import__(
                import_path, fromlist=["apply_arq_debug_patches", "patch_process_event"]
            )
            debug_module.apply_arq_debug_patches()
            debug_module.patch_process_event()
            logger.info("ARQ debug patches applied")
        except ImportError:
            logger.warning(
                "Could not import ARQ debug patches, continuing without them"
            )

    # Start the workflow manager
    logger.info("Starting Workflow Manager")
    wf_manager = WorkflowManager.get_instance()
    await wf_manager.start()
    logger.info("Workflow Manager started")

    # Get and run the ARQ worker
    worker = get_arq_worker(queue_name)
    try:
        await safe_run_worker(
            worker, number_of_errors_before_restart=number_of_errors_before_restart
        )
    except Exception as e:
        logger.exception(f"ARQ worker failed: {e}")
        # let GUnicorn restart the worker
        os._exit(1)

    logger.info(f"ARQ Worker {worker_id} finished")


class ARQGunicornWorker(Worker):
    """
    Custom Gunicorn worker that runs an ARQ worker.
    This worker properly integrates with Gunicorn's request handling model.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the worker"""
        super().__init__(*args, **kwargs)
        self.worker_id = self.age
        self.arq_running = False
        self.loop = None
        self.heartbeat_file = None
        self.last_heartbeat = 0
        self.stop_heartbeat = False
        self.heartbeat_thread = None
        # NOTE(review): the class uses both ``self.logger`` (set here) and
        # ``self.log`` (presumably provided by the gunicorn base Worker).
        self.logger = logging.getLogger(__name__)
        self.number_of_errors_before_restart = config(
            "ARQ_NUMBER_OF_ERRORS_BEFORE_RESTART", cast=int, default=5
        )

        # Setup heartbeat directory
        self.heartbeat_dir = os.environ.get("ARQ_HEARTBEAT_DIR", "/tmp/arq_heartbeats")
        os.makedirs(self.heartbeat_dir, exist_ok=True)

        # Initialize heartbeat file
        # NOTE(review): the file name uses the PID of the process that runs
        # __init__ - confirm this is the worker process and not the parent.
        self.heartbeat_file = os.path.join(
            self.heartbeat_dir, f"arq_worker_{os.getpid()}.heartbeat"
        )
        self.max_heartbeat_age = int(os.environ.get("ARQ_MAX_HEARTBEAT_AGE", "30"))

        # Store ARQ task
        self.arq_task = None

    def update_heartbeat(self):
        """Update the heartbeat file to indicate the worker is alive

        Writes the current ``time.time()`` into the heartbeat file; failures
        are logged as warnings and otherwise ignored.
        """
        try:
            self.logger.info(f"Updating heartbeat: {self.heartbeat_file}")
            self.last_heartbeat = time.time()
            with open(self.heartbeat_file, "w") as f:
                f.write(str(self.last_heartbeat))
        except Exception as e:
            self.logger.warning(f"Failed to update heartbeat: {e}")

    def start_heartbeat_thread(self):
        """Start a background thread to update the heartbeat file"""
        self.stop_heartbeat = False

        def heartbeat_loop():
            """Periodic heartbeat updates"""
            # Runs until ``self.stop_heartbeat`` is set.
            while not self.stop_heartbeat:
                self.update_heartbeat()
                time.sleep(5)  # Update heartbeat every 5 seconds

        self.logger.info("Starting heartbeat thread")
        self.heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
        self.heartbeat_thread.start()

    def check_heartbeat(self):
        """Check if heartbeat is still being updated, return True if healthy"""
        try:
            if os.path.exists(self.heartbeat_file):
                with open(self.heartbeat_file, "r") as f:
                    try:
                        last_heartbeat = float(f.read().strip())
                        # Check if heartbeat is too old
                        heartbeat_age = time.time() - last_heartbeat
                        if heartbeat_age > self.max_heartbeat_age:
                            self.log.error(
                                f"Heartbeat is too old: 
{heartbeat_age:.1f}s > {self.max_heartbeat_age}s"
                            )
                            return False
                        return True
                    except ValueError:
                        # File content was not a float.
                        self.log.error("Invalid heartbeat value")
                        return False
            else:
                self.log.error(f"Heartbeat file not found: {self.heartbeat_file}")
                return False
        except Exception as e:
            self.log.exception(f"Error checking heartbeat: {e}")
            return False

    async def handle_http_request(self, reader, writer):
        """Handle HTTP health check requests

        Answers 200 when the heartbeat is fresh and the ARQ task is running,
        503 otherwise, and 500 if building the answer fails. The connection is
        always closed afterwards.
        """
        try:
            # Read the request (but we don't really care about the content)
            # We just need to read enough to clear the buffer
            await reader.read(1024)

            # Check worker health
            if self.check_heartbeat() and self.arq_running:
                response = f"HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\nARQ Worker {self.worker_id} Running\n"
            else:
                response = f"HTTP/1.1 503 Service Unavailable\r\nContent-Type: text/plain\r\n\r\nARQ Worker {self.worker_id} Heartbeat Failed\n"

            # Send the response
            writer.write(response.encode())
            await writer.drain()
        except Exception as e:
            self.log.exception(f"Error handling HTTP request: {e}")
            try:
                error_response = "HTTP/1.1 500 Internal Server Error\r\nContent-Type: text/plain\r\n\r\nError processing request\n"
                writer.write(error_response.encode())
                await writer.drain()
            except Exception as e:
                # Best effort: failing to send the 500 answer is ignored.
                pass
        finally:
            # Close the connection
            try:
                writer.close()
                await writer.wait_closed()
            except Exception:
                pass

    async def _run(self):
        """Run the ARQ worker and handle requests from Gunicorn

        Any exception raised by the ARQ task ends the process with
        ``os._exit(1)``.
        """
        self.log.info(f"Starting ARQ worker {self.worker_id} in process {os.getpid()}")

        # Start the ARQ worker
        self.arq_running = True
        # This replaces the task stored in ``self.arq_task`` by ``run()``.
        self.arq_task = asyncio.create_task(
            run_arq_worker(
                self.worker_id,
                number_of_errors_before_restart=self.number_of_errors_before_restart,
            )
        )

        # Wait for the ARQ worker to complete
        try:
            await self.arq_task
        except Exception as e:
            self.log.exception(f"ARQ worker failed: {e}")
            # let GUnicorn restart the worker
            os._exit(1)
        finally:
            self.arq_running = False
            self.log.info(f"ARQ worker {self.worker_id} finished")

    def init_process(self):
        """Initialize the worker process - required Gunicorn Worker method"""
        # Start heartbeat
        self.update_heartbeat()
        self.start_heartbeat_thread()
        self.logger.info("Init process")
        # Initialize the base worker
        # NOTE(review): the base init_process presumably calls run() and only
        # returns at shutdown, so the cleanup below would run late - confirm.
        super().init_process()
        # Clean up any existing DB connections
        dispose_session()

    def run(self):
        """Run the worker - required Gunicorn Worker method

        Creates a fresh event loop, schedules the ARQ worker on it, serves the
        health-check handler on every socket in ``self.sockets`` and blocks in
        ``run_forever()``. On exit the heartbeat thread is stopped and the
        loop is closed.
        """
        self.log.info(f"ARQGunicornWorker running in process {os.getpid()}")

        # Create and set the event loop
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)

        # Set up signal handlers in the main thread
        for sig in [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]:
            # ``s=sig`` binds the current signal; a bare closure would see the last one.
            self.loop.add_signal_handler(
                sig, lambda s=sig: asyncio.create_task(self.handle_signal(s))
            )

        # Run the ARQ worker
        try:
            self.arq_task = self.loop.create_task(self._run())

            # This is the key part: we use Gunicorn's socket to handle requests
            # The sockets are already set up by Gunicorn's master process
            for sock in self.sockets:
                # Create server for each socket passed by Gunicorn
                server = asyncio.start_server(
                    self.handle_http_request,
                    sock=sock,
                )
                self.loop.run_until_complete(server)
                self.log.info(f"Started HTTP server on socket {sock}")

            # Run the event loop
            self.loop.run_forever()
        except Exception as e:
            self.log.exception(f"Error in main event loop: {e}")
        finally:
            self.logger.info("Shutting down ARQGunicornWorker")
            self.stop_heartbeat = True
            if self.heartbeat_thread and self.heartbeat_thread.is_alive():
                self.heartbeat_thread.join(timeout=5)
                self.logger.info("Heartbeat thread stopped")

            # Clean up the event loop
            try:
                # Cancel any pending tasks
                for task in asyncio.all_tasks(self.loop):
                    task.cancel()

                # Run the loop until tasks are cancelled
                self.loop.run_until_complete(asyncio.sleep(0.1))

                # Close the loop
                self.loop.close()
            except Exception as e:
                self.log.exception(f"Error closing event loop: {e}")

    async def handle_signal(self, sig):
        """Handle signals asynchronously"""
        self.log.info(f"Received signal {sig}, shutting down")
        self.arq_running = False

        # Cancel the ARQ task if it's 
running if self.arq_task and not self.arq_task.done(): self.arq_task.cancel() try: await self.arq_task except asyncio.CancelledError: self.log.info("ARQ task cancelled") # Stop the event loop self.loop.stop() def create_app(): """ Create a simple WSGI app for Gunicorn. This is just a placeholder as our custom worker handles all the logic. """ logger.info("Creating ARQ worker WSGI app") # Verify task pool if not KEEP_ARQ_TASK_POOL: logger.warning("No task pools configured to run") # Simple FastAPI app that just returns a status message app = FastAPI( title="Keep ARQ Worker", description="Rest API powering https://platform.keephq.dev and friends 🏄‍♀️", ) @app.get("/") def get_status(body: dict = None): data = b"ARQ Worker Running\n" return JSONResponse( content=data, status_code=200, headers={"Content-Type": "text/plain"}, ) if config("KEEP_OTEL_ENABLED", default="true", cast=bool): keep.api.observability.setup(app) return app # If this module is run directly, it will act as a standalone entry point if __name__ == "__main__": logger.info("Running ARQ worker standalone (without Gunicorn)") try: # Set a default worker ID for standalone execution app = create_app() worker_id = 0 asyncio.run(run_arq_worker(worker_id)) except KeyboardInterrupt: logger.info("Worker interrupted") sys.exit(0) except Exception as e: logger.exception(f"Worker failed with exception: {e}") sys.exit(1) ================================================ FILE: keep/api/bl/ai_suggestion_bl.py ================================================ import hashlib import json import logging import uuid from typing import Dict, List, Optional, Set, Tuple from uuid import UUID from fastapi import HTTPException from openai import OpenAI, OpenAIError from sqlmodel import Session from keep.api.bl.incidents_bl import IncidentBl from keep.api.consts import OPENAI_MODEL_NAME from keep.api.core.db import get_session_sync from keep.api.models.alert import AlertDto from keep.api.models.db.ai_suggestion import AIFeedback, 
AISuggestion, AISuggestionType
from keep.api.models.db.topology import TopologyServiceDtoOut
from keep.api.models.incident import (
    IncidentCandidate,
    IncidentClustering,
    IncidentDto,
    IncidentsClusteringSuggestion,
)


class AISuggestionBl:
    # Business logic for AI suggestions and the feedback given on them,
    # scoped to a single tenant.

    def __init__(self, tenant_id: str, session: Session | None = None) -> None:
        """Bind the helper to a tenant and create the OpenAI client.

        Args:
            tenant_id: Tenant whose suggestions are read and written.
            session: Database session to use; a new one is obtained from
                ``get_session_sync()`` when omitted.

        Raises:
            HTTPException: 400 when the OpenAI client cannot be initialized.
        """
        self.logger = logging.getLogger(__name__)
        self.tenant_id = tenant_id
        self.session = session if session else get_session_sync()

        # Todo: interface it with any model
        #       https://github.com/keephq/keep/issues/2373
        # Todo: per-tenant keys
        #       https://github.com/keephq/keep/issues/2365
        # Todo: also goes with settings page
        #       https://github.com/keephq/keep/issues/2365
        try:
            self._client = OpenAI()
        except OpenAIError as e:
            # if it's an API key error, we should raise 400
            self.logger.error(f"Failed to initialize OpenAI client: {e}")
            raise HTTPException(
                status_code=400, detail="AI service is not enabled for the client."
            )

    def get_suggestion_by_input(self, suggestion_input: Dict) -> Optional[AISuggestion]:
        """
        Retrieve an AI suggestion by its input.

        The lookup is done on the hash of the input and restricted to this
        tenant.

        Args:
        - suggestion_input (Dict): The input of the suggestion.

        Returns:
        - Optional[AISuggestion]: The suggestion object if found, otherwise None.
        """
        suggestion_input_hash = self.hash_suggestion_input(suggestion_input)
        return (
            self.session.query(AISuggestion)
            .filter(
                AISuggestion.tenant_id == self.tenant_id,
                AISuggestion.suggestion_input_hash == suggestion_input_hash,
            )
            .first()
        )

    def hash_suggestion_input(self, suggestion_input: Dict) -> str:
        """
        Hash the suggestion input so that suggestions created from the same
        input can be found again.

        Args:
        - suggestion_input (Dict): The input of the suggestion.

        Returns:
        - str: The hash of the suggestion input.
        """
        # sort_keys makes the hash independent of dict key order.
        json_input = json.dumps(suggestion_input, sort_keys=True)
        return hashlib.sha256(json_input.encode()).hexdigest()

    def add_suggestion(
        self,
        user_id: str,
        suggestion_input: Dict,
        suggestion_type: AISuggestionType,
        suggestion_content: Dict,
        model: str,
    ) -> AISuggestion:
        """
        Add a new AI suggestion to the database.

        Args:
        - user_id (str): The ID of the user the suggestion is stored for.
        - suggestion_input (Dict): The input of the suggestion; its hash is
          stored alongside it.
        - suggestion_type (AISuggestionType): The type of suggestion.
        - suggestion_content (Dict): The content of the suggestion.
        - model (str): The model used for the suggestion.

        Returns:
        - AISuggestion: The created suggestion object.

        Raises:
        - Exception: any error while saving is logged, the session is rolled
          back and the error is re-raised.
        """
        self.logger.info(
            "Adding new AI suggestion",
            extra={
                "tenant_id": self.tenant_id,
                "suggestion_type": suggestion_type,
            },
        )

        try:
            suggestion_input_hash = self.hash_suggestion_input(suggestion_input)
            suggestion = AISuggestion(
                tenant_id=self.tenant_id,
                user_id=user_id,
                suggestion_input=suggestion_input,
                suggestion_input_hash=suggestion_input_hash,
                suggestion_type=suggestion_type,
                suggestion_content=suggestion_content,
                model=model,
            )
            self.session.add(suggestion)
            self.session.commit()
            self.logger.info(
                "AI suggestion added successfully",
                extra={
                    "tenant_id": self.tenant_id,
                    "suggestion_id": suggestion.id,
                },
            )
            return suggestion
        except Exception as e:
            self.logger.error(
                "Failed to add AI suggestion",
                extra={
                    "tenant_id": self.tenant_id,
                    "error": str(e),
                },
            )
            self.session.rollback()
            raise

    def add_feedback(
        self,
        suggestion_id: UUID,
        user_id: str,
        feedback_content: str,
        rating: Optional[int] = None,
        comment: Optional[str] = None,
    ) -> AIFeedback:
        """
        Add AI feedback to the database.

        Args:
        - suggestion_id (UUID): The ID of the suggestion the feedback is about.
        - user_id (str): The ID of the user providing feedback.
        - feedback_content (str): The feedback content.
        - rating (Optional[int]): The user's rating of the AI suggestion.
        - comment (Optional[str]): Any additional comments from the user.

        Returns:
        - AIFeedback: The created feedback object.

        Raises:
        - Exception: any error while saving is logged, the session is rolled
          back and the error is re-raised.
        """
        self.logger.info(
            "Saving AI feedback",
            extra={
                "tenant_id": self.tenant_id,
                "suggestion_id": suggestion_id,
            },
        )

        try:
            feedback = AIFeedback(
                suggestion_id=suggestion_id,
                user_id=user_id,
                feedback_content=feedback_content,
                rating=rating,
                comment=comment,
            )
            self.session.add(feedback)
            self.session.commit()
            self.logger.info(
                "AI feedback saved successfully",
                extra={
                    "tenant_id": self.tenant_id,
                    "feedback_id": feedback.id,
                },
            )
            return feedback
        except Exception as e:
            self.logger.error(
                "Failed to save AI feedback",
                extra={
                    "tenant_id": self.tenant_id,
                    "error": str(e),
                },
            )
            self.session.rollback()
            raise

    def get_feedback(
        self, suggestion_type: AISuggestionType | None = None
    ) -> List[AIFeedback]:
        """
        Retrieve AI feedback from the database.

        Only feedback on suggestions of this tenant is returned.

        Args:
        - suggestion_type (AISuggestionType | None): Optional filter for suggestion type.

        Returns:
        - List[AIFeedback]: List of feedback objects.
        """
        query = (
            self.session.query(AIFeedback)
            .join(AISuggestion)
            .filter(AISuggestion.tenant_id == self.tenant_id)
        )

        if suggestion_type:
            query = query.filter(AISuggestion.suggestion_type == suggestion_type)

        feedback_list = query.all()

        self.logger.info(
            "Retrieved AI feedback",
            extra={
                "tenant_id": self.tenant_id,
                "feedback_count": len(feedback_list),
                "suggestion_type": suggestion_type,
            },
        )

        return feedback_list

    def suggest_incidents(
        self,
        alerts_dto: List[AlertDto],
        topology_data: List[TopologyServiceDtoOut],
        user_id: str,
    ) -> IncidentsClusteringSuggestion:
        """Create incident suggestions using AI.

        A suggestion already stored for the same list of alert fingerprints
        is reused instead of calling the model again.

        Raises:
            HTTPException: 400 when more than 50 alerts are given, 500 when
                creating a new suggestion fails for any reason.
        """
        if len(alerts_dto) > 50:
            raise HTTPException(status_code=400, detail="Too many alerts to process")

        # Check for existing suggestion
        alerts_fingerprints = [alert.fingerprint for alert in alerts_dto]
        suggestion_input = {"alerts_fingerprints": alerts_fingerprints}
        existing_suggestion = self.get_suggestion_by_input(suggestion_input)

        if existing_suggestion:
            self.logger.info("Retrieving existing suggestion from DB")
            incident_clustering = IncidentClustering.parse_obj(
existing_suggestion.suggestion_content ) processed_incidents = self._process_incidents( incident_clustering.incidents, alerts_dto ) return IncidentsClusteringSuggestion( incident_suggestion=processed_incidents, suggestion_id=str(existing_suggestion.id), ) try: # Prepare prompts system_prompt, user_prompt = self._prepare_prompts( alerts_dto, topology_data ) # Get completion from OpenAI completion = self._get_ai_completion(system_prompt, user_prompt) # Parse and process response incident_clustering = IncidentClustering.parse_raw( completion.choices[0].message.content ) # Save suggestion suggestion = self.add_suggestion( user_id=user_id, suggestion_input=suggestion_input, suggestion_type=AISuggestionType.INCIDENT_SUGGESTION, suggestion_content=incident_clustering.dict(), model=OPENAI_MODEL_NAME, ) # Process incidents processed_incidents = self._process_incidents( incident_clustering.incidents, alerts_dto ) return IncidentsClusteringSuggestion( incident_suggestion=processed_incidents, suggestion_id=str(suggestion.id), ) except Exception as e: self.logger.error(f"AI incident creation failed: {e}") raise HTTPException(status_code=500, detail="AI service is unavailable.") async def commit_incidents( self, suggestion_id: UUID, incidents_with_feedback: List[Dict], user_id: str, incident_bl: IncidentBl, ) -> List[IncidentDto]: """Commit incidents with user feedback.""" committed_incidents = [] # Add feedback to the database changes = { incident_commit["incident"]["id"]: incident_commit["changes"] for incident_commit in incidents_with_feedback } self.add_feedback( suggestion_id=suggestion_id, user_id=user_id, feedback_content=changes, ) for incident_with_feedback in incidents_with_feedback: if not incident_with_feedback["accepted"]: self.logger.info( f"Incident {incident_with_feedback['incident']['name']} rejected by user, skipping creation" ) continue try: # Create the incident incident_dto = IncidentDto.parse_obj(incident_with_feedback["incident"]) created_incident = 
incident_bl.create_incident( incident_dto, generated_from_ai=True ) # Add alerts to the created incident alert_ids = [ alert["fingerprint"] for alert in incident_with_feedback["incident"]["alerts"] ] await incident_bl.add_alerts_to_incident(created_incident.id, alert_ids) committed_incidents.append(created_incident) self.logger.info( f"Incident {incident_with_feedback['incident']['name']} created successfully" ) except Exception as e: self.logger.error( f"Failed to create incident {incident_with_feedback['incident']['name']}: {str(e)}" ) return committed_incidents def _prepare_prompts( self, alerts_dto: List[AlertDto], topology_data: List[TopologyServiceDtoOut] ) -> Tuple[str, str]: """Prepare system and user prompts for AI.""" alert_descriptions = "\n".join( [ f"Alert {idx+1}: {json.dumps(alert.dict())}" for idx, alert in enumerate(alerts_dto) ] ) topology_text = "\n".join( [ f"Topology {idx+1}: {json.dumps(topology.dict(), default=str)}" for idx, topology in enumerate(topology_data) ] ) system_prompt = """ You are an advanced AI system specializing in IT operations and incident management. Your task is to analyze the provided IT operations alerts and topology data, and cluster them into meaningful incidents. Consider factors such as: 1. Alert description and content 2. Potential temporal proximity 3. Affected systems or services 4. Type of IT issue (e.g., performance degradation, service outage, resource utilization) 5. Potential root causes 6. Relationships and dependencies between services in the topology data Group related alerts into distinct incidents and provide a detailed analysis for each incident. For each incident: 1. Assess its severity 2. Recommend initial actions for the IT operations team 3. Provide a confidence score (0.0 to 1.0) for the incident clustering 4. 
Explain how the confidence score was calculated, considering factors like alert similarity, topology relationships, and the strength of the correlation between alerts Use the topology data to improve your incident clustering by considering service dependencies and relationships. """ user_prompt = f""" Analyze the following IT operations alerts and topology data, then group the alerts into incidents: Alerts: {alert_descriptions} Topology data: {topology_text} Provide your analysis and clustering in the specified JSON format. """ return system_prompt, user_prompt def _get_ai_completion(self, system_prompt: str, user_prompt: str): """Get completion from OpenAI.""" return self._client.chat.completions.create( model=OPENAI_MODEL_NAME, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ], response_format={ "type": "json_schema", "json_schema": { "name": "incident_clustering", "schema": { "type": "object", "properties": { "incidents": { "type": "array", "items": { "type": "object", "properties": { "incident_name": {"type": "string"}, "alerts": { "type": "array", "items": {"type": "integer"}, "description": "List of alert numbers (1-based index)", }, "reasoning": {"type": "string"}, "severity": { "type": "string", "enum": [ "critical", "high", "warning", "info", "low", ], }, "recommended_actions": { "type": "array", "items": {"type": "string"}, }, "confidence_score": {"type": "number"}, "confidence_explanation": {"type": "string"}, }, "required": [ "incident_name", "alerts", "reasoning", "severity", "recommended_actions", "confidence_score", "confidence_explanation", ], }, } }, "required": ["incidents"], }, }, }, temperature=0.2, ) def _process_incidents( self, incidents: List[IncidentCandidate], alerts_dto: List[AlertDto] ) -> List[IncidentDto]: """Process incidents and create DTOs.""" processed_incidents = [] for incident in incidents: alert_sources: Set[str] = set() alert_services: Set[str] = set() for alert_index in 
incident.alerts: alert = alerts_dto[alert_index - 1] if alert.source: alert_sources.add(alert.source[0]) if alert.service: alert_services.add(alert.service) incident_alerts = [alerts_dto[i - 1] for i in incident.alerts] start_time = min(alert.lastReceived for alert in incident_alerts) last_seen_time = max(alert.lastReceived for alert in incident_alerts) incident_dto = IncidentDto( id=uuid.uuid4(), name=incident.incident_name, start_time=start_time, last_seen_time=last_seen_time, description=incident.reasoning, confidence_score=incident.confidence_score, confidence_explanation=incident.confidence_explanation, severity=incident.severity, alert_ids=[alerts_dto[i - 1].id for i in incident.alerts], recommended_actions=incident.recommended_actions, is_predicted=True, is_candidate=True, is_visible=True, alerts_count=len(incident.alerts), alert_sources=list(alert_sources), alerts=incident_alerts, services=list(alert_services), ) processed_incidents.append(incident_dto) return processed_incidents ================================================ FILE: keep/api/bl/dismissal_expiry_bl.py ================================================ """ Business logic for handling dismissal expiry. This module provides functionality to automatically expire alert dismissals when their dismissedUntil timestamp has passed. """ import datetime import logging from typing import List, Optional from sqlmodel import Session, select from keep.api.core.db import get_session_sync from keep.api.core.db_utils import get_json_extract_field from keep.api.core.elastic import ElasticClient from keep.api.core.dependencies import get_pusher_client from keep.api.models.action_type import ActionType from keep.api.models.alert import AlertDto from keep.api.models.db.alert import Alert, AlertAudit, AlertEnrichment class DismissalExpiryBl: @staticmethod def get_alerts_with_expired_dismissals(session: Session) -> List[AlertEnrichment]: """ Get all AlertEnrichment records that have expired dismissedUntil timestamps. 
Returns enrichment records where: 1. dismissed = true 2. dismissedUntil is not null and not "forever" 3. dismissedUntil timestamp is in the past Args: session: Database session Returns: List of AlertEnrichment objects with expired dismissals """ logger = logging.getLogger(__name__) now = datetime.datetime.now(datetime.timezone.utc) logger.info("Searching for enrichments with expired dismissals") # Query for enrichments with dismissed=true and dismissedUntil set # Use the proper helper function for cross-database compatibility dismissed_field = get_json_extract_field(session, AlertEnrichment.enrichments, "dismissed") dismissed_until_field = get_json_extract_field(session, AlertEnrichment.enrichments, "dismissUntil") # Build cross-database compatible boolean comparison # Different databases store/extract JSON booleans differently: # - SQLite: json_extract can return 1/0 for true/false OR "True"/"False"/"true"/"false" strings depending on how data was stored # - MySQL: JSON_UNQUOTE(JSON_EXTRACT()) returns "true"/"false" strings (lowercase) # - PostgreSQL: json_extract_path_text() returns "true"/"false" strings (lowercase) OR "True"/"False" (depending on input) if session.bind.dialect.name == "sqlite": # Handle both integer and string representations in SQLite dismissed_condition = (dismissed_field == 1) | (dismissed_field == "True") | (dismissed_field == "true") elif session.bind.dialect.name == "postgresql": # PostgreSQL can return both "true"/"false" and "True"/"False" depending on how the data was stored dismissed_condition = (dismissed_field == "true") | (dismissed_field == "True") else: # For MySQL, compare with lowercase string "true" dismissed_condition = dismissed_field == "true" query = session.exec( select(AlertEnrichment).where( dismissed_condition, # dismissedUntil is not null dismissed_until_field.isnot(None), # dismissedUntil is not "forever" dismissed_until_field != "forever", ) ) candidate_enrichments = query.all() logger.info(f"Found 
{len(candidate_enrichments)} candidate enrichments with dismissals") # Filter in Python for safety and clarity (parsing ISO timestamps) expired_enrichments = [] for enrichment in candidate_enrichments: dismiss_until_str = enrichment.enrichments.get("dismissUntil") if not dismiss_until_str or dismiss_until_str == "forever": continue try: # Parse the dismissedUntil timestamp dismiss_until = datetime.datetime.strptime( dismiss_until_str, "%Y-%m-%dT%H:%M:%S.%fZ" ).replace(tzinfo=datetime.timezone.utc) # Check if it's expired (current time > dismissedUntil) if now > dismiss_until: logger.info( f"Found expired dismissal for fingerprint {enrichment.alert_fingerprint}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint, "dismissed_until": dismiss_until_str, "expired_by_seconds": (now - dismiss_until).total_seconds() } ) expired_enrichments.append(enrichment) except (ValueError, TypeError) as e: # Log invalid timestamp but don't fail logger.warning( f"Invalid dismissedUntil timestamp for fingerprint {enrichment.alert_fingerprint}: {dismiss_until_str}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint, "error": str(e) } ) continue logger.info(f"Found {len(expired_enrichments)} enrichments with expired dismissals") return expired_enrichments @staticmethod def check_dismissal_expiry(logger: logging.Logger, session: Optional[Session] = None): """ Check for alerts with expired dismissedUntil and restore them. This function: 1. Finds AlertEnrichment records with expired dismissedUntil timestamps 2. Updates their enrichments to set dismissed=false and dismissedUntil=null 3. Cleans up disposable fields 4. Updates Elasticsearch indexes 5. Notifies UI of changes 6. 
Adds audit trail Args: logger: Logger instance for detailed logging session: Optional database session (creates new if None) """ logger.info("Starting dismissal expiry check") if session is None: session = get_session_sync() try: # Find enrichments with expired dismissedUntil expired_enrichments = DismissalExpiryBl.get_alerts_with_expired_dismissals(session) if not expired_enrichments: logger.info("No enrichments with expired dismissals found") return logger.info(f"Processing {len(expired_enrichments)} expired dismissal enrichments") # Process each expired enrichment for enrichment in expired_enrichments: logger.info( f"Processing expired dismissal for fingerprint {enrichment.alert_fingerprint}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint, "dismissed_until": enrichment.enrichments.get("dismissedUntil") } ) # Store original values for audit original_dismissed = enrichment.enrichments.get("dismissed", False) original_dismissed_until = enrichment.enrichments.get("dismissedUntil") # Update enrichment - set back to not dismissed new_enrichments = enrichment.enrichments.copy() new_enrichments["dismissed"] = False new_enrichments["dismissUntil"] = None # Clear the original field # Reset status if it was set to suppressed during dismissal enrichment_status = enrichment.enrichments.get("status") if enrichment_status == "suppressed": # Remove the suppressed status entirely - let the system use the original alert status # The AlertDto will get the status from the original alert event data new_enrichments.pop("status", None) logger.info( f"Removed suppressed status for fingerprint {enrichment.alert_fingerprint} - will use original alert status", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint, "removed_status": enrichment_status } ) # Clean up ALL disposable fields (use pattern matching instead of hardcoded list) cleaned_fields = [] keys_to_remove = [] for field_name in new_enrichments.keys(): 
if field_name.startswith("disposable_"): keys_to_remove.append(field_name) cleaned_fields.append(field_name) # Remove the disposable fields for field_name in keys_to_remove: new_enrichments.pop(field_name) if cleaned_fields: logger.info( f"Cleaned up disposable fields: {cleaned_fields}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) # Update the enrichment record enrichment.enrichments = new_enrichments session.add(enrichment) # Add audit trail try: audit = AlertAudit( tenant_id=enrichment.tenant_id, fingerprint=enrichment.alert_fingerprint, user_id="system", action=ActionType.DISMISSAL_EXPIRED.value, # Use .value to get the string description=( f"Dismissal expired at {original_dismissed_until}, " f"enrichment updated from dismissed={original_dismissed} to dismissed=False" ) ) session.add(audit) logger.info( "Added audit trail for expired dismissal", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) except Exception as e: logger.error( f"Failed to add audit trail for fingerprint {enrichment.alert_fingerprint}: {e}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) # Update Elasticsearch index try: # Get the latest alert for this fingerprint to create AlertDto latest_alert = session.exec( select(Alert) .where(Alert.tenant_id == enrichment.tenant_id) .where(Alert.fingerprint == enrichment.alert_fingerprint) .order_by(Alert.timestamp.desc()) .limit(1) ).first() if latest_alert: # Create AlertDto with updated enrichments alert_data = latest_alert.event.copy() # Only update specific enrichment fields, don't override alert event data with None values enrichment_fields = ['dismissed', 'dismissUntil', 'note', 'assignee', 'status'] for field in enrichment_fields: if field in new_enrichments and new_enrichments[field] is not None: alert_data[field] = new_enrichments[field] elif field in new_enrichments and new_enrichments[field] is None and field in 
['dismissed', 'dismissUntil']: # For dismissal fields, None is a valid value (means not dismissed) alert_data[field] = new_enrichments[field] alert_dto = AlertDto(**alert_data) elastic_client = ElasticClient(enrichment.tenant_id) elastic_client.index_alert(alert_dto) logger.info( f"Updated Elasticsearch index for fingerprint {enrichment.alert_fingerprint}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) else: logger.warning( f"No alert found for fingerprint {enrichment.alert_fingerprint}, skipping Elasticsearch update", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) except Exception as e: logger.error( f"Failed to update Elasticsearch for fingerprint {enrichment.alert_fingerprint}: {e}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) # Notify UI of change try: pusher_client = get_pusher_client() if pusher_client: pusher_client.trigger( f"private-{enrichment.tenant_id}", "alert-update", { "fingerprint": enrichment.alert_fingerprint, "action": "dismissal_expired" } ) logger.info( f"Sent UI notification for fingerprint {enrichment.alert_fingerprint}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) except Exception as e: logger.error( f"Failed to send UI notification for fingerprint {enrichment.alert_fingerprint}: {e}", extra={ "tenant_id": enrichment.tenant_id, "fingerprint": enrichment.alert_fingerprint } ) # Commit all changes session.commit() logger.info( f"Successfully processed {len(expired_enrichments)} expired dismissal enrichments", extra={"processed_count": len(expired_enrichments)} ) except Exception as e: logger.error(f"Error during dismissal expiry check: {e}", exc_info=True) session.rollback() raise finally: logger.info("Dismissal expiry check completed") ================================================ FILE: keep/api/bl/enrichments_bl.py 
================================================ import datetime import html import json import logging import re import uuid from uuid import UUID import celpy import chevron import json5 from elasticsearch import NotFoundError from fastapi import HTTPException from sqlalchemy import func from sqlalchemy_utils import UUIDType from sqlmodel import Session, select from keep.api.core.config import config from keep.api.core.db import batch_enrich from keep.api.core.db import enrich_entity as enrich_alert_db from keep.api.core.db import ( get_alert_by_event_id, get_enrichment_with_session, get_extraction_rule_by_id, get_incidents_by_alert_fingerprint, get_last_alert_by_fingerprint, get_mapping_rule_by_id, get_session_sync, get_topology_data_by_dynamic_matcher, is_all_alerts_resolved, ) from keep.api.core.elastic import ElasticClient from keep.api.models.action_type import ActionType from keep.api.models.alert import AlertDto from keep.api.models.db.alert import Alert from keep.api.models.db.enrichment_event import ( EnrichmentEvent, EnrichmentLog, EnrichmentStatus, EnrichmentType, ) from keep.api.models.db.extraction import ExtractionRule from keep.api.models.db.incident import IncidentStatus from keep.api.models.db.mapping import MappingRule from keep.api.models.db.rule import ResolveOn from keep.identitymanager.authenticatedentity import AuthenticatedEntity def is_valid_uuid(uuid_str): if isinstance(uuid_str, UUID): return True try: # UUID() will convert string to UUID object if valid uuid.UUID(uuid_str) return True except ValueError: return False def get_nested_attribute(obj: AlertDto, attr_path: str): """ Recursively get a nested attribute """ # Special case for source, since it's a list if attr_path == "source" and obj.source is not None and len(obj.source) > 0: return obj.source[0] if isinstance(attr_path, list): return ( all(get_nested_attribute(obj, attr) is not None for attr in attr_path) or None ) attributes = attr_path.split(".") for attr in attributes: # @@ 
is used as a placeholder for . in cases where the attribute name has a . # For example, we have {"results": {"some.attribute": "value"}} # We can access it by using "results.some@@attribute" so we won't think its a nested attribute if attr is not None and "@@" in attr: attr = attr.replace("@@", ".") obj = getattr( obj, attr, obj.get(attr, None) if isinstance(obj, dict) else None, ) if obj is None: return None return obj class EnrichmentsBl: ENRICHMENT_DISABLED = config("KEEP_ENRICHMENT_DISABLED", default="false", cast=bool) def __init__(self, tenant_id: str, db: Session | None = None): self.logger = logging.getLogger(__name__) self.tenant_id = tenant_id self.__logs: list[EnrichmentLog] = [] self.enrichment_event_id: UUID | None = None if not EnrichmentsBl.ENRICHMENT_DISABLED: self.db_session = db or get_session_sync() self.elastic_client = ElasticClient(tenant_id=tenant_id) else: self.db_session = None self.elastic_client = None def run_mapping_rule_by_id(self, rule_id: int, alert_id: UUID) -> AlertDto: rule = get_mapping_rule_by_id(self.tenant_id, rule_id, session=self.db_session) if not rule: raise HTTPException(status_code=404, detail="Mapping rule not found") alert = get_alert_by_event_id( self.tenant_id, str(alert_id), session=self.db_session ) if not alert: raise HTTPException(status_code=404, detail="Alert not found") return self.check_if_match_and_enrich(alert, rule) def run_extraction_rule_by_id(self, rule_id: int, alert: Alert) -> AlertDto: rule = get_extraction_rule_by_id( self.tenant_id, rule_id, session=self.db_session ) # so we can track the enrichment event alert.event["event_id"] = alert.id if not rule: raise HTTPException(status_code=404, detail="Extraction rule not found") return self.run_extraction_rules(alert.event, pre=False, rules=[rule]) def run_extraction_rules( self, event: AlertDto | dict, pre=False, rules: list[ExtractionRule] = None ) -> AlertDto | dict: """ Run the extraction rules for the event """ if EnrichmentsBl.ENRICHMENT_DISABLED: 
self.logger.debug("Enrichment is disabled, skipping extraction rules") return event fingerprint = ( event.get("fingerprint") if isinstance(event, dict) else getattr(event, "fingerprint", None) ) event_id = ( event.get("event_id") if isinstance(event, dict) else getattr(event, "id", None) ) self._add_enrichment_log( "Running extraction rules for incoming event", "info", { "tenant_id": self.tenant_id, "fingerprint": fingerprint, "event_id": event_id, "pre": pre, }, ) rules: list[ExtractionRule] = rules or ( self.db_session.query(ExtractionRule) .filter(ExtractionRule.tenant_id == self.tenant_id) .filter(ExtractionRule.disabled == False) .filter(ExtractionRule.pre == pre) .order_by(ExtractionRule.priority.desc()) .all() ) if not rules: self._add_enrichment_log( f"No extraction rules found (pre: {pre})", "debug", { "tenant_id": self.tenant_id, "fingerprint": fingerprint, "event_id": event_id, "pre": pre, }, ) self._track_enrichment_event( event_id, EnrichmentStatus.SKIPPED, EnrichmentType.EXTRACTION, 0, {} ) return event is_alert_dto = False if isinstance(event, AlertDto): is_alert_dto = True event = json.loads(json.dumps(event.dict(), default=str)) for rule in rules: attribute = rule.attribute if ( attribute.startswith("{{") is False and attribute.endswith("}}") is False ): # Wrap the attribute in {{ }} to make it a valid chevron template attribute = f"{{{{ {attribute} }}}}" attribute_value = chevron.render(attribute, event) attribute_value = html.unescape(attribute_value) if not attribute_value: self._add_enrichment_log( f"Attribute ({rule.attribute}) value is empty, skipping extraction", "info", {"rule_id": rule.id}, ) self._track_enrichment_event( event_id, EnrichmentStatus.SKIPPED, EnrichmentType.EXTRACTION, rule.id, {}, ) continue if rule.condition is None or rule.condition == "*" or rule.condition == "": self._add_enrichment_log( f"No condition specified for rule {rule.name}, enriching...", "info", { "rule_id": rule.id, "tenant_id": self.tenant_id, 
"fingerprint": fingerprint, }, ) else: env = celpy.Environment() ast = env.compile(rule.condition) prgm = env.program(ast) activation = celpy.json_to_cel(event) relevant = prgm.evaluate(activation) if not relevant: self._add_enrichment_log( f"Condition did not match, skipping extraction for rule {rule.name} with condition {rule.condition}", "debug", {"rule_id": rule.id}, ) self._track_enrichment_event( event_id, EnrichmentStatus.SKIPPED, EnrichmentType.EXTRACTION, rule.id, {}, ) continue match_result = re.search(rule.regex, attribute_value) if match_result: match_dict = match_result.groupdict() # we don't override source match_dict.pop("source", None) event.update(match_dict) self.enrich_entity( fingerprint, match_dict, action_type=ActionType.EXTRACTION_RULE_ENRICH, action_callee="system", action_description=f"Alert enriched with extraction from rule `{rule.name}`", should_exist=False, ) self._add_enrichment_log( "Event enriched with extraction rule", "info", { "rule_id": rule.id, "tenant_id": self.tenant_id, "fingerprint": fingerprint, }, ) self._track_enrichment_event( event_id, EnrichmentStatus.SUCCESS, EnrichmentType.EXTRACTION, rule.id, match_dict, ) else: self._add_enrichment_log( "Regex did not match, skipping extraction", "info", { "rule_id": rule.id, "tenant_id": self.tenant_id, "fingerprint": fingerprint, }, ) self._track_enrichment_event( event_id, EnrichmentStatus.SKIPPED, EnrichmentType.EXTRACTION, rule.id, {}, ) return AlertDto(**event) if is_alert_dto else event def run_mapping_rules(self, alert: AlertDto) -> AlertDto: """ Run the mapping rules for the alert. Args: - alert (AlertDto): The incoming alert to be processed and enriched. Returns: - AlertDto: The enriched alert after applying mapping rules. 
""" if EnrichmentsBl.ENRICHMENT_DISABLED: self.logger.debug("Enrichment is disabled, skipping mapping rules") return alert self._add_enrichment_log( "Running mapping rules for incoming alert", "info", {"fingerprint": alert.fingerprint, "tenant_id": self.tenant_id}, ) # Retrieve all active mapping rules for the current tenant, ordered by priority rules: list[MappingRule] = ( self.db_session.query(MappingRule) .filter(MappingRule.tenant_id == self.tenant_id) .filter(MappingRule.disabled == False) .order_by(MappingRule.priority.desc()) .all() ) if not rules: # If no mapping rules are found for the tenant, log and return the original alert self._add_enrichment_log( "No mapping rules found for tenant", "debug", {"fingerprint": alert.fingerprint, "tenant_id": self.tenant_id}, ) return alert for rule in rules: self.check_if_match_and_enrich(alert, rule) return alert def check_if_match_and_enrich(self, alert: AlertDto, rule: MappingRule) -> bool: """ Check if the alert matches the conditions specified in the mapping rule. If a match is found, enrich the alert and log the enrichment. Args: - alert (AlertDto): The incoming alert to be processed. - rule (MappingRule): The mapping rule to be checked against. Returns: - bool: True if alert matches the rule, False otherwise. 
""" self._add_enrichment_log( "Checking alert against mapping rule", "debug", {"fingerprint": alert.fingerprint, "rule_id": rule.id}, ) # Check if the alert has any of the attributes defined in matchers match = False for matcher in rule.matchers: if matcher and get_nested_attribute(alert, matcher) is not None: self._add_enrichment_log( f"Alert matched a mapping rule for matcher: {matcher}", "debug", { "fingerprint": alert.fingerprint, "rule_id": rule.id, "matcher": matcher, }, ) match = True break if not match: self._add_enrichment_log( "Alert does not match any of the conditions for the rule", "debug", { "fingerprint": alert.fingerprint, "rule_id": rule.id, "matchers": rule.matchers, "alert": str(alert), }, ) self._track_enrichment_event( alert.id, EnrichmentStatus.SKIPPED, EnrichmentType.MAPPING, rule.id, {} ) return False self._add_enrichment_log( "Alert matched a mapping rule, enriching...", "info", {"fingerprint": alert.fingerprint, "rule_id": rule.id}, ) # Apply enrichment to the alert enrichments = {} if rule.type == "topology": matcher_value = {} for matcher in rule.matchers: # [0] because topology is always 1 matcher matcher_value[matcher[0]] = get_nested_attribute(alert, matcher[0]) topology_service = get_topology_data_by_dynamic_matcher( self.tenant_id, matcher_value ) if not topology_service: self._add_enrichment_log( "No topology service found to match on", "debug", {"matcher_value": matcher_value}, ) else: enrichments = topology_service.dict(exclude_none=True) # repository could be taken from application too if not topology_service.repository and topology_service.applications: for application in topology_service.applications: if application.repository: enrichments["repository"] = application.repository # Remove redundant fields enrichments.pop("tenant_id", None) enrichments.pop("id", None) elif rule.type == "csv": if not rule.is_multi_level: for row in rule.rows: if any( self._check_matcher(alert, row, matcher) for matcher in rule.matchers ): # 
Extract enrichments from the matched row enrichments = {} for key, value in row.items(): if value is not None: is_matcher = False for matcher in rule.matchers: if key in matcher: is_matcher = True break if not is_matcher: # If the key has . (dot) in it, it'll be added as is while it needs to be nested. # @tb: fix when somebody will be complaining about this. if isinstance(value, str): value = value.strip() enrichments[key.strip()] = value break else: # Multi-level mapping # We can assume that the matcher is only a single key. i.e., [['customers']] key = rule.matchers[0][0] # this should be a list of values we need to try and match, and enrich matcher_values = get_nested_attribute(alert, key) if not matcher_values: self._add_enrichment_log("WTF, should not happen?", "error") else: if isinstance(matcher_values, str): matcher_values = json5.loads(matcher_values) for matcher in matcher_values: if rule.prefix_to_remove: matcher = matcher.replace(rule.prefix_to_remove, "") for row in rule.rows: if self._check_explicit_match(row, key, matcher): if rule.new_property_name not in enrichments: enrichments[rule.new_property_name] = {} if matcher not in enrichments[rule.new_property_name]: enrichments[rule.new_property_name][matcher] = {} for enrichment_key, enrichment_value in row.items(): if enrichment_value is not None: enrichments[rule.new_property_name][matcher][ enrichment_key.strip() ] = enrichment_value.strip() break if enrichments: # Enrich the alert with the matched data from the row for key, matcher in enrichments.items(): # It's not relevant to enrich if the value if empty if matcher is not None: if isinstance(matcher, str): matcher = matcher.strip() setattr(alert, key.strip(), matcher) # Save the enrichments to the database # SHAHAR: since when running this enrich_alert, the alert is not in elastic yet (its indexed after), # enrich alert will fail to update the alert in elastic. 
# hence should_exist = False self.enrich_entity( alert.fingerprint, enrichments, action_type=ActionType.MAPPING_RULE_ENRICH, action_callee="system", action_description=f"Alert enriched with mapping from rule `{rule.name}`", should_exist=False, ) self._add_enrichment_log( "Alert enriched", "info", {"fingerprint": alert.fingerprint, "rule_id": rule.id}, ) self._track_enrichment_event( alert.id, EnrichmentStatus.SUCCESS, EnrichmentType.MAPPING, rule.id, enrichments, ) return True # Exit on first successful enrichment (assuming single match) self._add_enrichment_log( "Alert was not enriched by mapping rule", "info", {"rule_id": rule.id, "alert_fingerprint": alert.fingerprint}, ) self._track_enrichment_event( alert.id, EnrichmentStatus.FAILURE, EnrichmentType.MAPPING, rule.id, {}, ) return False @staticmethod def _is_match(value, pattern): if value is None or pattern is None: return False return re.search(pattern, value) is not None def _check_explicit_match( self, row: dict, matcher: str, explicit_value: str ) -> bool: """ Check if the row matches the explicit given value, for example, in multi-level-mapping Args: row (dict): The row from the mapping rule data to compare against. matcher (str): The matcher string specifying conditions. explicit_value (str): The explicit value to compare against. Returns: bool: True if the row matches the explicit given value, False otherwise. """ return row.get(matcher.strip()) == explicit_value.strip() def _check_matcher( self, alert: AlertDto, row: dict, matcher: list, ) -> bool: """ Check if the alert matches the conditions specified by a matcher. Args: - alert (AlertDto): The incoming alert to be processed. - row (dict): The row from the mapping rule data to compare against. - matcher (str): The matcher string specifying conditions. Returns: - bool: True if alert matches the matcher, False otherwise. 
""" try: return all( self._is_match( get_nested_attribute(alert, attribute.strip()), row.get(attribute.strip()), ) or get_nested_attribute(alert, attribute.strip()) == row.get(attribute.strip()) or row.get(attribute.strip()) == "*" # Wildcard match for attribute in matcher ) except TypeError: self._add_enrichment_log( "Error while checking matcher", "error", { "fingerprint": alert.fingerprint, "matcher": matcher, }, ) return False @staticmethod def get_enrichment_metadata( enrichments: dict, authenticated_entity: AuthenticatedEntity ) -> tuple[ActionType, str, bool, bool]: """ Get the metadata for the enrichment Args: enrichments (dict): The enrichments to get the metadata for authenticated_entity (AuthenticatedEntity): The authenticated entity that performed the enrichment Returns: tuple[ActionType, str, bool, bool]: action_type, action_description, should_run_workflow, should_check_incidents_resolution """ should_run_workflow = False should_check_incidents_resolution = False action_type = ActionType.GENERIC_ENRICH action_description = ( f"Alert enriched by {authenticated_entity.email} - {enrichments}" ) # Shahar: TODO, change to the specific action type, good enough for now if "status" in enrichments and authenticated_entity.api_key_name is None: action_type = ( ActionType.MANUAL_RESOLVE if enrichments["status"] == "resolved" else ActionType.MANUAL_STATUS_CHANGE ) action_description = f"Alert status was changed to {enrichments['status']} by {authenticated_entity.email}" should_run_workflow = True if enrichments["status"] == "resolved": should_check_incidents_resolution = True elif "status" in enrichments and authenticated_entity.api_key_name: action_type = ( ActionType.API_AUTOMATIC_RESOLVE if enrichments["status"] == "resolved" else ActionType.API_STATUS_CHANGE ) action_description = f"Alert status was changed to {enrichments['status']} by API `{authenticated_entity.api_key_name}`" should_run_workflow = True if enrichments["status"] == "resolved": 
def disposable_enrich_entity(
    self,
    fingerprint: str,
    enrichments: dict,
    action_type: ActionType,
    action_callee: str,
    action_description: str,
    should_exist=True,
    force=False,
    audit_enabled=True,
):
    """
    Enrich an alert with disposable enrichments and keep a per-instance copy.

    Two `enrich_entity` calls are made with the same `enrichments` object:

    1. keyed by `fingerprint`, with `dispose_on_new_alert=True` and the
       caller's `should_exist` / `audit_enabled` values;
    2. keyed by the id of the last alert found for `fingerprint`, with
       `should_exist=False` and auditing turned off.
    """
    self.enrich_entity(
        fingerprint=fingerprint,
        dispose_on_new_alert=True,
        audit_enabled=audit_enabled,
        enrichments=enrichments,
        action_type=action_type,
        action_callee=action_callee,
        action_description=action_description,
        should_exist=should_exist,
        force=force,
    )

    latest_alert = get_last_alert_by_fingerprint(
        self.tenant_id, fingerprint, session=self.db_session
    )

    # Create instance-wide enrichment for history.
    # The id goes through UUIDType for better database-native UUID support.
    uuid_binder = UUIDType(binary=False)
    instance_key = uuid_binder.process_bind_param(
        latest_alert.alert_id, self.db_session.bind.dialect
    )

    # For elastic we do not save instance-level enrichments, hence
    # should_exist=False on this second call.
    self.enrich_entity(
        fingerprint=instance_key,
        audit_enabled=False,
        enrichments=enrichments,
        action_type=action_type,
        action_callee=action_callee,
        action_description=action_description,
        should_exist=False,
        force=force,
    )
def get_enrichment_event(self, enrichment_event_id: UUID) -> EnrichmentEvent:
    """
    Fetch a single enrichment event that belongs to this tenant.

    Args:
        enrichment_event_id (UUID): The id of the enrichment event.

    Returns:
        EnrichmentEvent: The matching event.

    Raises:
        HTTPException: 404 when no event with this id exists for the tenant.
    """
    query = select(EnrichmentEvent).where(
        EnrichmentEvent.id == enrichment_event_id,
        EnrichmentEvent.tenant_id == self.tenant_id,
    )
    # `.first()` returns None for an empty result. The previous `.one()`
    # raised on an empty result, so the 404 below could never be reached.
    enrichment_event = self.db_session.exec(query).first()
    if not enrichment_event:
        raise HTTPException(status_code=404, detail="Enrichment event not found")
    return enrichment_event
def dispose_enrichments(self, fingerprint: str):
    """
    Remove the disposable enrichments stored for `fingerprint`.

    A key is dropped when it starts with `disposable_` or when a matching
    `disposable_<key>` entry exists next to it. Nothing is written when the
    stored enrichments contain no `disposable_` key at all.
    """
    if EnrichmentsBl.ENRICHMENT_DISABLED:
        self.logger.debug("Enrichment is disabled, skipping dispose enrichments")
        return

    self.logger.debug("disposing enrichments", extra={"fingerprint": fingerprint})
    record = get_enrichment_with_session(
        self.db_session, self.tenant_id, fingerprint
    )
    stored = record.enrichments if record else None
    if not stored:
        self.logger.debug(
            "no enrichments to dispose", extra={"fingerprint": fingerprint}
        )
        return

    # Only update the alert if there are disposable enrichments to dispose
    has_disposable = any(name.startswith("disposable_") for name in stored)
    if not has_disposable:
        return

    # Keep what is neither disposable itself nor shadowed by a disposable twin
    kept = {
        name: value
        for name, value in stored.items()
        if not name.startswith("disposable_")
        and f"disposable_{name}" not in stored
    }
    disposed_keys = set(stored.keys()) - set(kept.keys())

    enrich_alert_db(
        self.tenant_id,
        fingerprint,
        kept,
        session=self.db_session,
        action_callee="system",
        action_type=ActionType.DISPOSE_ENRICHED_ALERT,
        action_description=f"Disposing enrichments from alert - {disposed_keys}",
        force=True,
    )
    self.elastic_client.enrich_alert(fingerprint, kept)
    self.logger.debug("enrichments disposed", extra={"fingerprint": fingerprint})
enrichment_event = EnrichmentEvent( tenant_id=self.tenant_id, status=status.value, enrichment_type=enrichment_type.value, rule_id=rule_id, alert_id=alert_id, enriched_fields=enriched_fields, ) self.db_session.add(enrichment_event) self.db_session.flush() if self.__logs: for log in self.__logs: log.enrichment_event_id = enrichment_event.id self.db_session.add(log) self.db_session.commit() self.__logs = [] self.enrichment_event_id = enrichment_event.id except Exception: self.__logs = [] self.logger.exception( "Failed to track enrichment event", extra={ "tenant_id": self.tenant_id, "alert_id": alert_id, "enrichment_type": enrichment_type.value, "rule_id": rule_id, }, ) def _add_enrichment_log( self, message: str, level: str, details: dict | None = None, ) -> None: """ Add a log entry for an enrichment event """ try: getattr(self.logger, level)(message, extra=details) log_entry = EnrichmentLog( tenant_id=self.tenant_id, message=message, ) self.__logs.append(log_entry) except Exception: self.logger.exception( "Failed to add enrichment log", extra={ "tenant_id": self.tenant_id, "message": message, }, ) def check_incident_resolution(self, alert: Alert | AlertDto): incidents = get_incidents_by_alert_fingerprint( self.tenant_id, alert.fingerprint, self.db_session ) self.db_session.expire_on_commit = False for incident in incidents: if incident.resolve_on == ResolveOn.ALL.value and is_all_alerts_resolved( incident=incident, session=self.db_session ): incident.status = IncidentStatus.RESOLVED.value self.db_session.add(incident) self.db_session.commit() ================================================ FILE: keep/api/bl/incident_reports.py ================================================ import json import logging import math import os from typing import Optional from uuid import UUID from openai import OpenAI from pydantic import BaseModel from keep.api.bl.incidents_bl import IncidentBl from keep.api.consts import OPENAI_MODEL_NAME from keep.api.models.db.incident import 
class IncidentReport(BaseModel):
    """
    Aggregated incident statistics produced by
    `IncidentReportsBl.get_incident_reports`.

    Every field is optional and defaults to None.
    """

    # Service name -> number of incidents that list that service.
    services_affected_metrics: Optional[dict[str, int]] = None
    # Severity -> the incidents (name and id) that have that severity.
    severity_metrics: Optional[dict[str, list[IncidentReportDto]]] = None
    # Shortest and longest durations among the resolved incidents.
    incident_durations: Optional[IncidentDurations] = None
    # Average of (creation_time - start_time) in seconds, rounded up.
    mean_time_to_detect_seconds: Optional[int] = None
    # Average of (end_time - start_time, falling back to creation_time) over
    # resolved incidents, in seconds, rounded up.
    mean_time_to_resolve_seconds: Optional[int] = None
    # Reason -> ids of the incidents attributed to it (filled from the
    # OpenAI response).
    most_frequent_reasons: Optional[dict[str, list[str]]] = None
    # One entry per root incident of a recurrence chain.
    recurring_incidents: Optional[list[ReoccuringIncidentReportDto]] = None
class IncidentReportsBl:
    """
    Builds aggregated incident reports for a single tenant.

    Incidents are fetched through `IncidentBl.query_incidents`. The
    "most frequent reasons" section is delegated to OpenAI when an
    `OPENAI_API_KEY` is configured; every other metric is computed locally.
    """

    # Upper bound on the incidents sent to OpenAI, which is either slow
    # (timeouts) or has token limits.
    _MAX_INCIDENTS_FOR_OPENAI = 40

    __open_ai_client = None

    @property
    def open_ai_client(self):
        """Lazily created OpenAI client; None while `OPENAI_API_KEY` is unset."""
        if not self.__open_ai_client and os.environ.get("OPENAI_API_KEY"):
            self.__open_ai_client = OpenAI()
        return self.__open_ai_client

    def __init__(self, tenant_id: str):
        self.tenant_id = tenant_id
        self.incidents_bl = IncidentBl(
            tenant_id=tenant_id, session=None, pusher_client=None, user=None
        )

    def get_incident_reports(
        self, incidents_query_cel: str, allowed_incident_ids: list[str]
    ) -> IncidentReport:
        """
        Build the full report for the incidents matching `incidents_query_cel`.

        Args:
            incidents_query_cel (str): CEL filter applied on top of the built-in
                "not deleted" condition. May be empty.
            allowed_incident_ids (list[str]): Forwarded to the incidents query.

        Returns:
            IncidentReport: The populated report.
        """
        incidents = self.__get_incidents(incidents_query_cel, allowed_incident_ids)
        open_ai_report_part = self.__calculate_report_in_openai(incidents)
        report = IncidentReport(
            most_frequent_reasons=open_ai_report_part.most_frequent_reasons
        )
        incidents_dict = {incident.id: incident for incident in incidents}
        resolved_incidents = [
            incident
            for incident in incidents
            if incident.status == IncidentStatus.RESOLVED
        ]

        report.mean_time_to_detect_seconds = self.__calculate_mttd(incidents)
        report.mean_time_to_resolve_seconds = self.__calculate_mttr(resolved_incidents)
        report.incident_durations = self.__calculate_durations(resolved_incidents)
        report.recurring_incidents = self.__calculate_recurring_incidents(
            incidents_dict
        )
        report.severity_metrics = self.__calculate_severity_metrics(incidents)
        report.services_affected_metrics = self.__calculate_top_services_affected(
            incidents
        )
        return report

    def __calculate_report_in_openai(
        self, incidents: list[IncidentDto]
    ) -> OpenAIReportPart:
        """
        Ask OpenAI for the most frequent incident reasons.

        Returns an empty `OpenAIReportPart` when no OpenAI client is available.
        Re-raises whatever goes wrong while parsing the model response.
        """
        client = self.open_ai_client
        if client is None:
            # Same shape as a real answer, just without any reasons.
            return OpenAIReportPart()

        # Most recent incidents first, capped to keep the request small.
        recent_incidents = sorted(
            incidents, key=lambda x: x.creation_time, reverse=True
        )[: self._MAX_INCIDENTS_FOR_OPENAI]

        incidents_minified: list[dict] = [
            {
                "incident_id": str(item.id),
                "incident_name": "\n".join(
                    filter(None, [item.user_generated_name, item.ai_generated_name])
                ),
                "incident_summary": "\n".join(
                    filter(None, [item.user_summary, item.generated_summary])
                ),
                "severity": item.severity,
                "services": item.services,
            }
            for item in recent_incidents
        ]
        incidents_json = json.dumps(incidents_minified, default=str)

        response = client.chat.completions.create(
            model=OPENAI_MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": incidents_json},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "OpenAIReportPart",
                    "schema": OpenAIReportPart.schema(),
                },
            },
            seed=1239,
            temperature=0.2,
        )
        model_response = response.choices[0].message.content

        try:
            return OpenAIReportPart(**json.loads(model_response))
        except Exception as e:
            logger.error(
                f"""Failed to parse OpenAI response: {e}
                Response: {model_response}
                """
            )
            raise

    def __calculate_top_services_affected(
        self, incidents: list[IncidentDto]
    ) -> dict[str, int]:
        """Count, per service name, the incidents that list that service."""
        top_services_affected: dict[str, int] = {}
        for incident in incidents:
            # An incident without services contributes nothing.
            for service in incident.services or []:
                if service == "null":
                    continue
                top_services_affected[service] = (
                    top_services_affected.get(service, 0) + 1
                )
        return top_services_affected

    def __calculate_severity_metrics(
        self, incidents: list[IncidentDto]
    ) -> dict[str, list[IncidentReportDto]]:
        """Group the incidents (name and id) by severity."""
        severity_metrics: dict[str, list[IncidentReportDto]] = {}
        for incident in incidents:
            severity_metrics.setdefault(incident.severity, []).append(
                IncidentReportDto(
                    incident_name=incident.user_generated_name
                    or incident.ai_generated_name,
                    incident_id=str(incident.id),
                )
            )
        return severity_metrics

    def __calculate_mttd(self, incidents: list[IncidentDto]) -> int:
        """
        Mean of (creation_time - start_time) in seconds, rounded up.

        Incidents without a start_time are ignored; 0 when none qualifies.
        """
        duration_sum = 0
        incidents_count = 0
        for incident in incidents:
            if not incident.start_time:
                continue
            duration_sum += (
                incident.creation_time - incident.start_time
            ).total_seconds()
            incidents_count += 1

        if incidents_count == 0:
            return 0
        return math.ceil(duration_sum / incidents_count)

    def __calculate_mttr(self, resolved_incidents: list[IncidentDto]) -> int:
        """
        Mean time from start (or creation) to end in seconds, rounded up.

        Incidents without an end_time are ignored; 0 when none qualifies.
        """
        filtered_incidents = [
            incident for incident in resolved_incidents if incident.end_time
        ]
        if len(filtered_incidents) == 0:
            return 0

        duration_sum = 0
        for incident in filtered_incidents:
            start_time = incident.start_time or incident.creation_time
            duration_sum += (incident.end_time - start_time).total_seconds()
        return math.ceil(duration_sum / len(filtered_incidents))

    def __calculate_durations(
        self, resolved_incidents: list[IncidentDto]
    ) -> Optional[IncidentDurations]:
        """
        Find the shortest and longest resolved incidents.

        Returns None when no incident has both a start (or creation) time and
        an end time, so callers never receive placeholder ids.
        """
        shortest = None  # (duration in seconds, incident id)
        longest = None
        for incident in resolved_incidents:
            start_time = incident.start_time or incident.creation_time
            if not start_time or not incident.end_time:
                continue
            duration = (incident.end_time - start_time).total_seconds()
            # Compare against None explicitly: a 0-second duration is a valid
            # minimum and must not be mistaken for "nothing recorded yet".
            if shortest is None or duration < shortest[0]:
                shortest = (duration, incident.id)
            if longest is None or duration > longest[0]:
                longest = (duration, incident.id)

        if shortest is None or longest is None:
            return None

        return IncidentDurations(
            shortest_duration_seconds=int(shortest[0]),
            shortest_duration_incident_id=str(shortest[1]),
            longest_duration_seconds=int(longest[0]),
            longest_duration_incident_id=str(longest[1]),
        )

    def __calculate_recurring_incidents(
        self, incidents_dict: dict[UUID, IncidentDto]
    ) -> list[ReoccuringIncidentReportDto]:
        """
        Group incidents into recurrence chains keyed by their root incident.

        Each incident's `same_incident_in_the_past_id` links are followed until
        an incident without such a link (the root) is reached. Chains that
        leave `incidents_dict` or that loop back on themselves are ignored.
        """
        chains: dict[UUID, set[UUID]] = {}
        for incident in incidents_dict.values():
            path = [incident.id]
            visited = {incident.id}
            past_id = incident.same_incident_in_the_past_id
            while past_id:
                if past_id in visited:
                    # Cyclic links would otherwise never terminate.
                    break
                visited.add(past_id)
                path.append(past_id)

                past_incident = incidents_dict.get(past_id, None)
                if not past_incident:
                    break

                next_past_id = past_incident.same_incident_in_the_past_id
                if not next_past_id:
                    # `past_id` is the root of this chain.
                    chains.setdefault(past_id, set()).update(path)
                    break
                past_id = next_past_id

        return [
            ReoccuringIncidentReportDto(
                incident_name=incidents_dict[root_incident_id].user_generated_name
                or incidents_dict[root_incident_id].ai_generated_name,
                incident_id=str(root_incident_id),
                occurrence_count=len(chain_members),
            )
            for root_incident_id, chain_members in chains.items()
        ]

    def __get_incidents(
        self, incidents_query_cel: str, allowed_incident_ids: list[str]
    ) -> list[IncidentDto]:
        """Query up to 100 non-deleted, non-candidate incidents matching the CEL."""
        cel = "status != 'deleted'"
        if incidents_query_cel and incidents_query_cel.strip():
            # Parenthesise the caller's filter so an `||` inside it cannot
            # bypass the "not deleted" condition through operator precedence.
            cel = f"{cel} && ({incidents_query_cel})"

        query_result = self.incidents_bl.query_incidents(
            tenant_id=self.tenant_id,
            cel=cel,
            limit=100,
            offset=0,
            allowed_incident_ids=allowed_incident_ids,
            is_candidate=False,
        )
        return query_result.items
def create_incident(
    self,
    incident_dto: IncidentDtoIn | IncidentDto,
    generated_from_ai: bool = False,
) -> IncidentDto:
    """
    Creates a new incident.

    After the incident is stored, connected clients are notified through
    `update_client_on_incident_change` and a "created" workflow event is sent.

    Args:
        incident_dto (IncidentDtoIn | IncidentDto): The data transfer object
            containing the details of the incident to be created.
        generated_from_ai (bool, optional): Indicates if the incident was
            generated by Keep's AI. Defaults to False.

    Returns:
        IncidentDto: The newly created incident object, containing details of
        the incident.
    """
    self.logger.info(
        "Creating incident",
        extra={"incident_dto": incident_dto.dict(), "tenant_id": self.tenant_id},
    )
    incident = create_incident_from_dto(
        self.tenant_id,
        incident_dto,
        generated_from_ai=generated_from_ai,
        session=self.session,
    )
    self.logger.info(
        "Incident created",
        extra={"incident_id": incident.id, "tenant_id": self.tenant_id},
    )

    new_incident_dto = IncidentDto.from_db_incident(incident)
    self.logger.info(
        "Incident DTO created",
        extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id},
    )

    self.update_client_on_incident_change()
    self.logger.info(
        "Client updated on incident change",
        extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id},
    )

    self.send_workflow_event(new_incident_dto, "created")
    self.logger.info(
        "Workflows run on incident",
        extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id},
    )
    return new_incident_dto
""" asyncio.run(self.add_alerts_to_incident(*args, **kwargs)) async def add_alerts_to_incident( self, incident_id: UUID, alert_fingerprints: List[str], is_created_by_ai: bool = False, override_count: bool = False, ) -> None: self.logger.info( "Adding alerts to incident", extra={ "incident_id": incident_id, "alert_fingerprints": alert_fingerprints, }, ) incident = get_incident_by_id( tenant_id=self.tenant_id, incident_id=incident_id, session=self.session ) if not incident: raise HTTPException(status_code=404, detail="Incident not found") add_alerts_to_incident( self.tenant_id, incident, alert_fingerprints, is_created_by_ai, session=self.session, override_count=override_count, ) self.logger.info( "Alerts added to incident", extra={ "incident_id": incident_id, "alert_fingerprints": alert_fingerprints, }, ) self.__postprocess_alerts_change(incident, alert_fingerprints) await self.__generate_summary(incident_id, incident) self.logger.info( "Summary generated", extra={ "incident_id": incident_id, "alert_fingerprints": alert_fingerprints, }, ) def __update_elastic(self, alert_fingerprints: List[str]): try: elastic_client = ElasticClient(self.tenant_id) if elastic_client.enabled: db_alerts = get_all_alerts_by_fingerprints( tenant_id=self.tenant_id, fingerprints=alert_fingerprints, session=self.session, ) db_alerts = enrich_alerts_with_incidents( self.tenant_id, db_alerts, session=self.session ) enriched_alerts_dto = convert_db_alerts_to_dto_alerts( db_alerts, with_incidents=True ) elastic_client.index_alerts(alerts=enriched_alerts_dto) except Exception: self.logger.exception("Failed to push alert to elasticsearch") raise def update_client_on_incident_change(self, incident_id: Optional[UUID] = None): if self.pusher_client is not None: self.logger.info( "Pushing incident change to client", extra={"incident_id": incident_id, "tenant_id": self.tenant_id}, ) try: self.pusher_client.trigger( f"private-{self.tenant_id}", "incident-change", {"incident_id": str(incident_id) if 
def send_workflow_event(self, incident_dto: IncidentDto, action: str) -> None:
    """
    Hand an incident event over to the workflow manager.

    Args:
        incident_dto (IncidentDto): The incident the event is about.
        action (str): The event name passed on to the workflow manager
            (callers in this class use "created", "updated" and "deleted").

    Any failure is logged and swallowed so the caller's flow is not interrupted.
    """
    try:
        WorkflowManager.get_instance().insert_incident(
            self.tenant_id, incident_dto, action
        )
    except Exception:
        failure_context = {
            "incident_id": incident_dto.id,
            "tenant_id": self.tenant_id,
        }
        self.logger.exception(
            "Failed to run workflows based on incident", extra=failure_context
        )
def __postprocess_alerts_change(self, incident, alert_fingerprints):
    """
    Run the follow-up steps after an incident's alerts were changed.

    In order: re-index the affected alerts in Elastic, notify connected
    clients, and send an "updated" workflow event for the incident. Each step
    is followed by an info log entry.
    """

    def _log_step(message):
        # Every step logs the same context: the incident and the alerts.
        self.logger.info(
            message,
            extra={
                "incident_id": incident.id,
                "alert_fingerprints": alert_fingerprints,
            },
        )

    self.__update_elastic(alert_fingerprints)
    _log_step("Alerts pushed to elastic")

    self.update_client_on_incident_change(incident.id)
    _log_step("Client updated on incident change")

    self.send_workflow_event(IncidentDto.from_db_incident(incident), "updated")
    _log_step("Workflows run on incident")
def resolve_incident_if_require(
    self, incident: Incident, max_retries=3, handle_workflow_event: bool = True
) -> Incident:
    """
    Mark the incident as resolved when its `resolve_on` condition is met.

    The incident is resolved when `resolve_on` is ALL, FIRST or LAST and the
    matching `is_all_alerts_resolved` / `is_first_incident_alert_resolved` /
    `is_last_incident_alert_resolved` check returns a truthy value.

    Args:
        incident (Incident): The incident to check and, if needed, update.
        max_retries (int): How many times the commit is attempted when a
            phantom read is detected.
        handle_workflow_event (bool): Send an "updated" workflow event after a
            successful commit.

    Returns:
        Incident: The same incident object.

    Raises:
        StaleDataError: When the commit fails with a stale-data error that is
            not the "expected to update" phantom-read case.
    """
    resolve_on = incident.resolve_on
    if resolve_on == ResolveOn.ALL.value:
        should_resolve = is_all_alerts_resolved(
            incident=incident, session=self.session
        )
    elif resolve_on == ResolveOn.FIRST.value:
        should_resolve = is_first_incident_alert_resolved(
            incident, session=self.session
        )
    elif resolve_on == ResolveOn.LAST.value:
        should_resolve = is_last_incident_alert_resolved(
            incident, session=self.session
        )
    else:
        should_resolve = False

    if not should_resolve:
        return incident

    # Read the id up front so it is available for logging after a rollback.
    incident_id = incident.id
    for attempt in range(max_retries):
        try:
            incident.status = IncidentStatus.RESOLVED.value
            self.session.add(incident)
            self.session.commit()
            if handle_workflow_event:
                self.send_workflow_event(
                    IncidentDto.from_db_incident(incident), "updated"
                )
            break
        except StaleDataError as ex:
            if "expected to update" not in str(ex):
                # Not a phantom read: retrying cannot help, and swallowing the
                # error would hide a failed commit from the caller.
                raise
            self.logger.info(
                f"Phantom read detected while updating incident `{incident_id}`, retry #{attempt}"
            )
            self.session.rollback()
            continue
    else:
        # Every attempt hit a phantom read; the status change was not saved.
        self.logger.warning(
            f"Failed to resolve incident `{incident_id}` after {max_retries} attempts"
        )
    return incident
def __init__(self, tenant_id: str, session: Session | None) -> None:
    """
    Load the maintenance window rules that are active right now for a tenant.

    Args:
        tenant_id: tenant whose rules are loaded.
        session: DB session to use; when falsy a new one is obtained from
            ``get_session_sync()``.
    """
    self.logger = logging.getLogger(__name__)
    self.tenant_id = tenant_id
    self.session = session if session else get_session_sync()

    # Take a single snapshot of "now" so the start and end bounds are compared
    # against the same instant instead of two slightly different clock reads.
    now = datetime.datetime.now(datetime.UTC)

    self.maintenance_rules: list[MaintenanceWindowRule] = (
        self.session.query(MaintenanceWindowRule)
        .filter(MaintenanceWindowRule.tenant_id == tenant_id)
        # `== True` is deliberate: it builds a SQL expression, not a Python bool.
        .filter(MaintenanceWindowRule.enabled == True)  # noqa: E712
        .filter(MaintenanceWindowRule.end_time >= now)
        .filter(MaintenanceWindowRule.start_time <= now)
        .all()
    )
@staticmethod
def evaluate_cel(
    maintenance_window: MaintenanceWindowRule,
    alert: AlertDto | Alert,
    environment: celpy.Environment,
    logger,
    logger_extra_info: dict,
) -> bool:
    """
    Evaluate a maintenance window's CEL query against an alert.

    Args:
        maintenance_window: rule whose ``cel_query`` is evaluated.
        alert: either an ``AlertDto`` (its ``dict()`` is used) or a DB ``Alert``
            (its ``event`` payload is used).
        environment: celpy environment used to compile and run the expression.
        logger: logger used for evaluation diagnostics.
        logger_extra_info: extra fields attached to every log record emitted here.

    Returns:
        True when the expression evaluates to a truthy value, False otherwise,
        including when evaluation raises ``CELEvalError``.
    """
    cel = preprocess_cel_expression(maintenance_window.cel_query)
    # NOTE(review): compilation happens outside the try block below, so a CEL
    # expression that fails to compile still propagates to the caller.
    ast = environment.compile(cel)
    prgm = environment.program(ast)

    if isinstance(alert, AlertDto):
        payload = alert.dict()
    else:
        # Shallow-copy so that flattening `source` below does not rewrite the
        # caller's `alert.event` in place.
        payload = dict(alert.event)

    # todo: fix this in the future
    # The expression sees `source` as a single value, so a list is reduced to
    # its first entry. Anything that is not a list is left untouched, and an
    # empty list becomes None instead of raising IndexError.
    source = payload.get("source")
    if isinstance(source, list):
        payload["source"] = source[0] if source else None

    activation = celpy.json_to_cel(json.loads(json.dumps(payload, default=str)))
    log_extra = {**logger_extra_info, "maintenance_rule_id": maintenance_window.id}

    try:
        return bool(prgm.evaluate(activation))
    except celpy.evaluation.CELEvalError as e:
        error_msg = str(e).lower()
        if "no such member" in error_msg or "undeclared reference" in error_msg:
            logger.debug(
                f"Skipping maintenance window rule due to missing field: {str(e)}",
                extra=log_extra,
            )
            return False
        # Log unexpected CEL errors but don't fail the entire event processing
        logger.error(
            f"Unexpected CEL evaluation error: {str(e)}",
            extra=log_extra,
        )
        return False
""" logger.info("Starting recover strategy for maintenance windows review.") env = celpy.Environment() if session is None: session = get_session_sync() windows = get_maintenance_windows_started(session) alerts_in_maint = get_alerts_by_status(AlertStatus.MAINTENANCE, session) fingerprints_to_check: set = set() for alert in alerts_in_maint: active = False for window in windows: w_start = window.start_time w_end = window.end_time is_enable = window.enabled if window.tenant_id != alert.tenant_id: continue # Check active windows if ( w_start < alert.timestamp and alert.timestamp < w_end and w_end > datetime.datetime.utcnow() and is_enable ): logger.info("Checking alert %s in maintenance window %s", alert.id, window.id) is_in_cel = MaintenanceWindowsBl.evaluate_cel( window, alert, env, logger, {"tenant_id": alert.tenant_id, "alert_id": alert.id} ) # Recover source structure if not isinstance(alert.event.get("source"), list): alert.event["source"] = [alert.event["source"]] if is_in_cel: active = True set_maintenance_windows_trace(alert, window, session) logger.info("Alert %s is blocked due to the maintenance window: %s.", alert.id, window.id) break if not active: recover_prev_alert_status(alert, session) fingerprints_to_check.add((alert.tenant_id, alert.fingerprint)) add_audit( tenant_id=alert.tenant_id, fingerprint=alert.fingerprint, user_id="system", action=ActionType.MAINTENANCE_EXPIRED, description=( f"Alert {alert.id} has recover its previous status, " f"from {alert.event.get('previous_status')} to {alert.event.get('status')}" ), ) for (tenant, fp) in fingerprints_to_check: last_alert = get_last_alert_by_fingerprint(tenant, fp, session) alert = get_alert_by_event_id(tenant, str(last_alert.alert_id), session) if "previous_status" not in alert.event: logger.info( f"Alert {alert.id} does not have previous status, cannot proceed with recover strategy", extra={"tenant_id": tenant, "fingerprint": fp, "alert_id": alert.id, "alert.status": alert.event.get("status")}, ) 
continue if not isinstance(alert.event.get("source"), list): alert.event["source"] = [alert.event["source"]] alert_dto = AlertDto(**alert.event) with tracer.start_as_current_span("mw_recover_strategy_push_to_workflows"): try: # Now run any workflow that should run based on this alert # TODO: this should publish event workflow_manager = WorkflowManager.get_instance() # insert the events to the workflow manager process queue logger.info("Adding event to the workflow manager queue") workflow_manager.insert_events(tenant, [alert_dto]) logger.info("Added event to the workflow manager queue") except Exception: logger.exception( "Failed to run workflows based on alerts", extra={ "provider_type": alert_dto.providerType, "provider_id": alert_dto.providerId, "tenant_id": tenant, }, ) with tracer.start_as_current_span("mw_recover_strategy_run_rules_engine"): # Now we need to run the rules engine if KEEP_CORRELATION_ENABLED: incidents = [] try: rules_engine = RulesEngine(tenant_id=tenant) # handle incidents, also handle workflow execution as incidents = rules_engine.run_rules( [alert_dto], session=session ) except Exception: logger.exception( "Failed to run rules engine", extra={ "provider_type": alert_dto.providerType, "provider_id": alert_dto.providerId, "tenant_id": tenant, }, ) pusher_cache = get_notification_cache() if incidents and pusher_cache.should_notify(tenant, "incident-change"): pusher_client = get_pusher_client() try: pusher_client.trigger( f"private-{tenant}", "incident-change", {}, ) except Exception: logger.exception("Failed to tell the client to pull incidents") try: presets = get_all_presets_dtos(tenant) rules_engine = RulesEngine(tenant_id=tenant) presets_do_update = [] for preset_dto in presets: # filter the alerts based on the search query filtered_alerts = rules_engine.filter_alerts( [alert_dto], preset_dto.cel_query ) # if not related alerts, no need to update if not filtered_alerts: continue presets_do_update.append(preset_dto) if 
def provision_resources():
    """
    Provision providers, workflows, dashboards and deduplication rules for the
    single tenant, unless the PROVISION_RESOURCES flag is off.

    When the flag is off only an informational log line is written.
    """
    if not PROVISION_RESOURCES:
        logger.info("Provisioning resources is disabled")
        return

    tenant = SINGLE_TENANT_UUID

    logger.info("Loading providers into cache")
    # provision providers from env. relevant only on single tenant.
    logger.info("Provisioning providers and workflows")
    ProvidersService.provision_providers(tenant)
    logger.info("Providers loaded successfully")

    WorkflowStore.provision_workflows(tenant)
    logger.info("Workflows provisioned successfully")

    provision_dashboards(tenant)
    logger.info("Dashboards provisioned successfully")

    logger.info("Provisioning deduplication rules")
    provision_deduplication_rules_from_env(tenant)
    logger.info("Deduplication rules provisioned successfully")
def post_worker_init(worker):
    """
    Re-initialise logging inside a worker.

    Logging has to be set up again in every worker because gunicorn forks the
    worker processes. The root logger's handler list is replaced with an empty
    one before ``keep.api.logging.setup_logging()`` runs.

    Args:
        worker: the worker object passed in by the caller; it is not used here.
    """
    print("Init logging in worker")
    root_logger = logging.getLogger()
    root_logger.handlers = []  # noqa
    keep.api.logging.setup_logging()  # noqa
    print("Logging initialized in worker")
# Task-pool selectors
KEEP_ARQ_TASK_POOL_ALL = "all"  # All arq workers enabled for this service
KEEP_ARQ_TASK_POOL_BASIC_PROCESSING = "basic_processing"  # Everything except AI

# Queue names, one per task type
KEEP_ARQ_QUEUE_BASIC = "basic_processing"
KEEP_ARQ_QUEUE_WORKFLOWS = "workflows"
KEEP_ARQ_QUEUE_MAINTENANCE = "maintenance"

# Only the exact string "true" switches Redis on.
REDIS = os.environ.get("REDIS", "false") == "true"

# The pool falls back to "all" when Redis is enabled and to None when it is not;
# an explicit KEEP_ARQ_TASK_POOL always wins.
KEEP_ARQ_TASK_POOL = os.environ.get(
    "KEEP_ARQ_TASK_POOL", KEEP_ARQ_TASK_POOL_ALL if REDIS else None
)

OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME", "gpt-4o-2024-08-06")

# Correlation is on unless the variable is set to anything other than "true".
KEEP_CORRELATION_ENABLED = os.environ.get("KEEP_CORRELATION_ENABLED", "true") == "true"
50000)) alert_field_configurations = [ FieldMappingConfiguration( map_from_pattern="id", map_to="lastalert.alert_id", data_type=DataType.UUID ), FieldMappingConfiguration( map_from_pattern="source", map_to="alert.provider_type", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="providerId", map_to="alert.provider_id", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="providerType", map_to="alert.provider_type", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="timestamp", map_to="lastalert.timestamp", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="fingerprint", map_to="lastalert.fingerprint", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="startedAt", map_to="lastalert.first_timestamp", data_type=DataType.DATETIME ), FieldMappingConfiguration( map_from_pattern="incident.id", map_to=[ "incident.id", ], data_type=DataType.UUID, ), FieldMappingConfiguration( map_from_pattern="incident.is_visible", map_to=[ "incident.is_visible", ], data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="incident.name", map_to=[ "incident.user_generated_name", "incident.ai_generated_name", ], data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="severity", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], enum_values=[ severity.value for severity in sorted( [severity for _, severity in enumerate(AlertSeverity)], key=lambda s: s.order, ) ], data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="lastReceived", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="status", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], enum_values=list(reversed([item.value for _, item in enumerate(AlertStatus)])), data_type=DataType.STRING, ), FieldMappingConfiguration( 
map_from_pattern="dismissed", map_to=["JSON(alertenrichment.enrichments).*"], data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="firingCounter", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], data_type=DataType.INTEGER, ), FieldMappingConfiguration( map_from_pattern="unresolvedCounter", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], data_type=DataType.INTEGER, ), FieldMappingConfiguration( map_from_pattern="*", map_to=[ "JSON(alertenrichment.enrichments).*", "JSON(alert.event).*", ], data_type=DataType.STRING, ), ] # Copies the same configuration as above, but adds the "alert." prefix to each entry in map_from_pattern. # This allows users to write queries using dictionary-style field access, like: # alert['some_attribute'] == 'value' field_configurations_with_alert_prefix = [] for item in alert_field_configurations: field_configurations_with_alert_prefix.append( FieldMappingConfiguration( map_from_pattern=f"alert.{item.map_from_pattern}", map_to=item.map_to, data_type=item.data_type, enum_values=item.enum_values, ) ) alert_field_configurations = ( field_configurations_with_alert_prefix + alert_field_configurations ) properties_metadata = PropertiesMetadata(alert_field_configurations) static_facets = [ FacetDto( id="f8a91ac7-4916-4ad0-9b46-a5ddb85bfbb8", property_path="severity", name="Severity", is_static=True, type=FacetType.str, ), FacetDto( id="5dd1519c-6277-4109-ad95-c19d2f4f15e3", property_path="status", name="Status", is_static=True, type=FacetType.str, ), FacetDto( id="461bef05-fc20-4363-b427-9d26fe064e7f", property_path="source", name="Source", is_static=True, type=FacetType.str, ), FacetDto( id="6afa12d7-21df-4694-8566-fd56d5ee2266", property_path="incident.name", name="Incident", is_static=True, type=FacetType.str, ), FacetDto( id="77b8a6d4-3b8d-4b6a-9f8e-2c8e4b8f8e4c", property_path="dismissed", name="Dismissed", is_static=True, type=FacetType.str, ), ] static_facets_dict = 
def get_threeshold_query(tenant_id: str):
    """
    Build the SQL expression for the oldest timestamp still inside the tenant's
    hard alert limit.

    The subquery orders the tenant's ``LastAlert`` rows newest first and picks
    the row at position ``alerts_hard_limit`` (offset ``alerts_hard_limit - 1``,
    limit 1). When that row does not exist the expression falls back to
    ``datetime.datetime.min`` via COALESCE.

    Args:
        tenant_id: tenant whose alerts are considered.

    Returns:
        A SQLAlchemy ``coalesce`` expression usable inside a filter.
    """
    newest_first = (
        select(LastAlert.timestamp)
        .select_from(LastAlert)
        .where(LastAlert.tenant_id == tenant_id)
        .order_by(LastAlert.timestamp.desc())
    )
    cutoff_row = newest_first.limit(1).offset(alerts_hard_limit - 1)
    return func.coalesce(cutoff_row.scalar_subquery(), datetime.datetime.min)
def build_total_alerts_query(tenant_id, query: QueryDto):
    """
    Build the query that counts the alerts matching ``query.cel``.

    Args:
        tenant_id: tenant whose alerts are counted.
        query: query DTO; only ``cel`` and ``limit`` are read here.

    Returns:
        The SQLAlchemy select produced by ``__build_query_for_filtering`` with a
        single COUNT column.
    """
    # The previous check (`cel is not None or cel != ""`) was true for every
    # input, so the alert/enrichment joins were never skipped. A filter exists
    # only when `cel` is a non-empty string.
    has_cel = bool(query.cel)
    fetch_incidents = has_cel and "incident." in query.cel

    # When the filter touches incidents, count distinct alert ids; presumably
    # the incident joins can yield several rows per alert (verify against
    # __build_query_for_filtering).
    count_funct = (
        func.count(func.distinct(LastAlert.alert_id))
        if fetch_incidents
        else func.count(1)
    )

    built_query_result = __build_query_for_filtering(
        tenant_id=tenant_id,
        cel=query.cel,
        select_args=[count_funct],
        limit=query.limit,
        # Without a CEL filter nothing references alert or enrichment columns.
        fetch_alerts_data=has_cel,
    )

    return built_query_result["query"]
def get_alert_facets_data(
    tenant_id: str,
    facet_options_query: FacetOptionsQueryDto,
) -> dict[str, list[FacetOptionDto]]:
    """
    Compute the facet options for a tenant's alerts.

    The facets are those named by the keys of ``facet_options_query.facet_queries``
    when that is provided and non-empty; otherwise the static facets are used.
    The result comes from ``get_facet_options``, which receives a factory that
    builds the base alert query for each facet.

    Args:
        tenant_id: tenant whose alerts are inspected.
        facet_options_query: requested facet queries; may be falsy.
    """
    facets = static_facets
    if facet_options_query and facet_options_query.facet_queries:
        facets = get_alert_facets(tenant_id, facet_options_query.facet_queries.keys())

    def base_query_factory(
        facet_property_path: str,
        involved_fields: PropertyMetadataInfo,
        select_statement,
    ):
        # Incident tables are joined only when the facet itself or one of the
        # involved fields lives under "incident.".
        needs_incidents = "incident." in facet_property_path or any(
            "incident." in item.field_name for item in involved_fields
        )
        built = __build_query_for_filtering(
            tenant_id=tenant_id,
            select_args=select_statement,
            force_fetch=False,
            fetch_incidents=needs_incidents,
        )
        return built["query"]

    return get_facet_options(
        base_query_factory=base_query_factory,
        entity_id_column=LastAlert.alert_id,
        facets=facets,
        facet_options_query=facet_options_query,
        properties_metadata=properties_metadata,
    )
class ConstantNode(Node):
    """
    A node representing a constant value in CEL abstract syntax tree.

    Example:
        1, 'text', true

    Attributes:
        value (Any): The constant value represented by this node.

    Methods:
        __str__(): Returns the string form of the constant value.
    """

    node_type: str = Field(default="ConstantNode", const=True)
    value: Any = Field()

    def __str__(self):
        # __str__ must return a str. `value` may be any constant (int, bool,
        # None, ...), and returning it unchanged made str(node) raise TypeError
        # for every non-string constant.
        return str(self.value)
""" node_type: str = Field(default="LogicalNode", const=True) left: Node = Field() operator: LogicalNodeOperator = Field() right: Node = Field() def __str__(self): return f"{self.left} {self.operator} {self.right}" class ComparisonNodeOperator(Enum): LT = "<" LE = "<=" GT = ">" GE = ">=" EQ = "==" NE = "!=" IN = "in" CONTAINS = "contains" STARTS_WITH = "startsWith" ENDS_WITH = "endsWith" class ComparisonNode(Node): """ A class representing a comparison operation in CEL abstract syntax tree (AST). Examples: alert.severity == 'high' alert.count > 10 alert.status != 'closed' Args: first_operand (Node): The left-hand side operand of the comparison. operator (str): The comparison operator. second_operand (Node): The right-hand side operand of the comparison. Methods: __str__(): Returns a string representation of the comparison operation. """ node_type: str = Field(default="ComparisonNode", const=True) first_operand: Optional[Node] = Field() operator: ComparisonNodeOperator = Field() second_operand: Optional[Node | Any] = Field() def __str__(self): return f"{self.first_operand} {self.operator} {self.second_operand}" class UnaryNodeOperator(Enum): NOT = "!" NEG = "-" HAS = "has" class UnaryNode(Node): """ Represents a unary operation node in CEL abstract syntax tree (AST). Examples: !alert.active -alert.threshold Attributes: operator (str): The operator for the unary operation. operand (Any): The operand for the unary operation. Methods: __init__(operator: str, operand: Any): Initializes a UnaryNode with the given operator and operand. __str__() -> str: Returns a string representation of the unary operation. 
""" node_type: str = Field(default="UnaryNode", const=True) operator: UnaryNodeOperator = Field() operand: Optional[Node] = Field() def __str__(self): if self.operator == UnaryNodeOperator.HAS: return f"{self.operand}({self.operator})" return f"{self.operator}{self.operand}" # TODO: To remove this class as it's not needed anymore class MemberAccessNode(Node): """ A node representing member access in CEL abstract syntax tree (AST). Attributes: member_name (str): The name of the member being accessed. Methods: __str__(): Returns the member name as a string. """ node_type: str = Field(default="MemberAccessNode", const=True) member_name: Optional[str] # TODO: to remove def __str__(self): return self.member_name # TODO: To remove this class as it's not needed anymore class MethodAccessNode(MemberAccessNode): """ Represents a method access node in CEL abstract syntax tree (AST). Examples: alert.name.contains('error') alert.name.startsWith('sys') alert.name.endsWith('log') Inherits from: MemberAccessNode Attributes: member_name (str): The name of the member being accessed. args (List[str], optional): A list of arguments for the method. Defaults to None. Methods: copy() -> MethodAccessNode: Creates a copy of the current MethodAccessNode instance. __str__() -> str: Returns a string representation of the method access node in the format: "member_name(arg1, arg2, ...)". """ node_type: str = Field(default="MethodAccessNode", const=True) member_name: str args: List[ConstantNode] = None def copy(self): return MethodAccessNode( member_name=self.member_name, args=self.args.copy() if self.args else None ) def __str__(self): args = [] for arg_node in self.args or []: args.append(str(arg_node)) return f"{self.member_name}({', '.join(args)})" class DataType(Enum): """ An enumeration representing various data types. Attributes: STRING (str): Represents a string data type. UUID (str): Represents a universally unique identifier (UUID) data type. 
INTEGER (str): Represents an integer data type. FLOAT (str): Represents a floating-point number data type. DATETIME (str): Represents a datetime data type. BOOLEAN (str): Represents a boolean data type. OBJECT (str): Represents an object data type. ARRAY (str): Represents an array data type. """ STRING = "string" UUID = "uuid" INTEGER = "integer" FLOAT = "float" DATETIME = "datetime" BOOLEAN = "boolean" OBJECT = "object" ARRAY = "array" NULL = "null" def from_type_to_data_type(_type: type) -> DataType: if _type is str: return DataType.STRING elif _type is int: return DataType.INTEGER elif _type is float: return DataType.FLOAT elif _type is bool: return DataType.BOOLEAN elif _type is NoneType: return DataType.NULL elif _type is dict: return DataType.OBJECT elif _type is list: return DataType.ARRAY elif _type is datetime.datetime: return DataType.DATETIME raise ValueError( f"There is no DataType corresponding to the provided type: {_type}" ) class PropertyAccessNode(MemberAccessNode): """ Represents a node in CEL abstract syntax tree (AST) that accesses a property of an object. Examples: alert.name alert.status Attributes: path (str): The property path being accessed. value (Any): The value associated with the member access, which can be another node. Methods: __init__(member_name, value: Any): Initializes the PropertyAccessNode with the given member name and value. is_function_call() -> bool: Determines if the member access represents a function call. get_property_path() -> str: Constructs and returns the property path as a string. get_method_access_node() -> MethodAccessNode: Retrieves the MethodAccessNode if the value represents a method access. __str__() -> str: Returns a string representation of the PropertyAccessNode. 
""" node_type: str = Field(default="PropertyAccessNode", const=True) path: list[str] = Field(default=None) data_type: DataType = Field(default=None) def is_function_call(self) -> bool: member_access_node = self.get_method_access_node() return member_access_node is not None # TODO: To remove this method as it's not needed anymore def get_property_path(self) -> list[str]: return self.path # TODO: To remove this method as it's not needed anymore def get_method_access_node(self) -> MethodAccessNode: if isinstance(self.value, MethodAccessNode): return self.value if isinstance(self.value, PropertyAccessNode): return self.value.get_method_access_node() return None def __str__(self): if self.value: return f"{self.member_name}.{self.value}" return self.member_name ================================================ FILE: keep/api/core/cel_to_sql/cel_ast_converter.py ================================================ import logging import re from typing import Any import celpy.celparser import lark import celpy from typing import List, cast from dateutil.parser import parse from keep.api.core.cel_to_sql.ast_nodes import ( ComparisonNode, ComparisonNodeOperator, ConstantNode, LogicalNode, LogicalNodeOperator, Node, ParenthesisNode, PropertyAccessNode, UnaryNode, UnaryNodeOperator, ) # Matches such strings: # '2025-03-23T15:42:00' # '2025-03-23T15:42:00Z' # '2025-03-23T15:42:00.123Z' # '2025-03-23T15:42:00+02:00' # '2025-03-23T15:42:00.456-05:30' iso_regex = re.compile( r"^(\d{4})-(\d{2})-(\d{2})" # Date: YYYY-MM-DD r"T" # T separator r"(\d{2}):(\d{2}):(\d{2})" # Time: hh:mm:ss r"(?:\.(\d+))?" 
    # Optional fractional seconds
    r"(?:Z|[+-]\d{2}:\d{2})?$"  # Optional timezone (Z or ±hh:mm)
)

# Matches such strings:
# '2025-03-23 15:42:00'
# '1999-01-01 00:00:00'
# '2025-01-20'
datetime_regex = re.compile(
    r"^(\d{4})-(\d{2})-(\d{2})"  # Date: YYYY-MM-DD
    r"(?:\s(\d{2}):(\d{2}):(\d{2}))?$"  # Optional time: HH:MM:SS
)


logger = logging.getLogger(__name__)


class CelToAstConverter(lark.visitors.Visitor_Recursive):
    """
    Converts a CEL expression into the AST node classes defined in ast_nodes.

    The celpy parse tree is walked with lark's Visitor_Recursive; each visitor
    method pops the already-converted operands from ``self.stack`` and pushes
    the node built from them, so after the visit ``self.stack[0]`` holds the
    root of the converted AST.
    """

    @classmethod
    def convert_to_ast(cls_, cel: str) -> Node:
        """
        Compile the CEL string and return the root node of the converted AST.

        Any exception raised while compiling or visiting is logged as a warning
        and re-raised unchanged.
        """
        d = cls_()
        try:
            celpy_ast = d.celpy_env.compile(cel)
            d.visit(celpy_ast)
            return d.stack[0]
        except Exception as e:
            logger.warning('Error converting "%s" CEL to AST. Error: %s', cel, e)
            raise e

    def __init__(self) -> None:
        self.celpy_env = celpy.Environment()
        # Operand/result stack shared by all visitor methods.
        self.stack: List[Any] = []
        # Holds the PropertyAccessNode currently being extended by member_dot /
        # member_index (the List[str] annotation does not reflect that).
        self.member_access_stack: List[str] = []

    def expr(self, tree: lark.Tree) -> None:
        # Ternary "cond ? left : right": pushed as a plain string, not as a Node.
        if len(tree.children) == 1:
            return
        else:
            right = self.stack.pop()
            left = self.stack.pop()
            cond = self.stack.pop()
            self.stack.append(
                f"{cond} ? {left} : {right}"
            )

    def conditionalor(self, tree: lark.Tree) -> None:
        # "left || right"
        if len(tree.children) == 1:
            return
        else:
            right = self.stack.pop()
            left = self.stack.pop()
            self.stack.append(
                LogicalNode(left=left, operator=LogicalNodeOperator.OR, right=right)
            )

    def conditionaland(self, tree: lark.Tree) -> None:
        # "left && right"
        if len(tree.children) == 1:
            return
        else:
            right = self.stack.pop()
            left = self.stack.pop()
            self.stack.append(
                LogicalNode(left=left, operator=LogicalNodeOperator.AND, right=right)
            )

    def relation(self, tree: lark.Tree) -> None:
        # self.member_access_stack.clear()
        # The relation_* visitors below push a ComparisonNode with only the first
        # operand set; here the second operand is attached to it.
        if len(tree.children) == 1:
            return
        else:
            second_operand = self.stack.pop()
            comparison_node: ComparisonNode = self.stack.pop()
            comparison_node.second_operand = second_operand
            self.stack.append(comparison_node)

    def relation_lt(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.LT,
                second_operand=None,
            )
        )

    def relation_le(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.LE,
                second_operand=None,
            )
        )

    def relation_gt(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.GT,
                second_operand=None,
            )
        )

    def relation_ge(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.GE,
                second_operand=None,
            )
        )

    def relation_eq(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.EQ,
                second_operand=None,
            )
        )

    def relation_ne(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.NE,
                second_operand=None,
            )
        )

    def relation_in(self, tree: lark.Tree) -> None:
        self.stack.append(
            ComparisonNode(
                first_operand=self.stack.pop(),
                operator=ComparisonNodeOperator.IN,
                second_operand=None,
) ) def addition(self, tree: lark.Tree) -> None: if len(tree.children) == 1: return else: right = self.stack.pop() left: dict = self.stack.pop() left['right'] = right self.stack.append(left) def addition_add(self, tree: lark.Tree) -> None: left = self.stack.pop() self.stack.append({ 'left': left, 'operator': 'ADD' }) def addition_sub(self, tree: lark.Tree) -> None: left = self.stack.pop() self.stack.append({ 'left': left, 'operator': 'SUB' }) def multiplication(self, tree: lark.Tree) -> None: if len(tree.children) == 1: return else: right = self.stack.pop() left: dict = self.stack.pop() left['right'] = right self.stack.append(left) def multiplication_mul(self, tree: lark.Tree) -> None: left = self.stack.pop() self.stack.append({ 'left': left, 'operator': 'MUL' }) def multiplication_div(self, tree: lark.Tree) -> None: left = self.stack.pop() self.stack.append({ 'left': left, 'operator': 'DIV' }) def multiplication_mod(self, tree: lark.Tree) -> None: left = self.stack.pop() self.stack.append({ 'left': left, 'operator': 'MOD' }) def unary(self, tree: lark.Tree) -> None: if len(tree.children) == 1: return else: operand = self.stack.pop() unaryNode: UnaryNode = self.stack.pop() unaryNode.operand = operand self.stack.append(unaryNode) def unary_not(self, tree: lark.Tree) -> None: self.stack.append(UnaryNode(operator=UnaryNodeOperator.NOT, operand=None)) def unary_neg(self, tree: lark.Tree) -> None: self.stack.append(UnaryNode(operator=UnaryNodeOperator.NEG, operand=None)) def member_dot(self, tree: lark.Tree) -> None: right = cast(lark.Token, tree.children[1]).value if self.member_access_stack: property_member: PropertyAccessNode = self.member_access_stack.pop() new_property_access_node = PropertyAccessNode( path=property_member.path + [right] ) self.stack.pop() self.stack.append(new_property_access_node) self.member_access_stack.append(new_property_access_node) def member_dot_arg(self, tree: lark.Tree) -> None: if len(tree.children) == 3: exprlist = self.stack.pop() 
else: exprlist = [] right = cast(lark.Token, tree.children[1]).value if self.member_access_stack: if right.lower() in [ ComparisonNodeOperator.CONTAINS.value.lower(), ComparisonNodeOperator.STARTS_WITH.value.lower(), ComparisonNodeOperator.ENDS_WITH.value.lower(), ]: self.stack.append( ComparisonNode( first_operand=self.stack.pop(), operator=right, second_operand=exprlist[0], ) ) return raise NotImplementedError(f"Method '{right}' not implemented") else: raise ValueError("No member access stack") def member_index(self, tree: lark.Tree) -> None: right = self.stack.pop() left = self.stack.pop() if isinstance(right, ConstantNode): right = right.value prop_access_node: PropertyAccessNode = left new_property_access_node = PropertyAccessNode( path=prop_access_node.path + [str(right)] ) self.stack.append(new_property_access_node) self.member_access_stack.append(new_property_access_node) def member_object(self, tree: lark.Tree) -> None: raise NotImplementedError("Member object not implemented") def dot_ident_arg(self, tree: lark.Tree) -> None: raise NotImplementedError("Dot ident arg not implemented") def dot_ident(self, tree: lark.Tree) -> None: raise NotImplementedError("Dot ident not implemented") def ident_arg(self, tree: lark.Tree) -> None: token_value = tree.children[0].value if token_value == UnaryNodeOperator.HAS.value: self.stack.append( UnaryNode(operator=UnaryNodeOperator.HAS, operand=self.stack.pop()[0]) ) return raise NotImplementedError( "Ident arg not implemented for token_value:" + token_value ) def ident(self, tree: lark.Tree) -> None: property_member = PropertyAccessNode( path=[cast(lark.Token, tree.children[0]).value] ) self.member_access_stack.clear() self.stack.append(property_member) self.member_access_stack.append(property_member) def paren_expr(self, tree: lark.Tree) -> None: if not self.stack: raise ValueError("Cannot handle parenthesis expression without stack") self.stack.append(ParenthesisNode(expression=self.stack.pop())) def list_lit(self, 
tree: lark.Tree) -> None: if self.stack: left = self.stack.pop() self.stack.append([item for item in reversed(left)]) def map_lit(self, tree: lark.Tree) -> None: raise NotImplementedError("Map literal not implemented") def exprlist(self, tree: lark.Tree) -> None: list_items = list(self.stack.pop() for _ in tree.children) self.stack.append(list_items) def fieldinits(self, tree: lark.Tree) -> None: raise NotImplementedError("Fieldinits not implemented") def mapinits(self, tree: lark.Tree) -> None: raise NotImplementedError("Mapinits not implemented") def literal(self, tree: lark.Tree) -> None: if tree.children: value = cast(lark.Token, tree.children[0]).value constant_node = self.to_constant_node(value) self.stack.append(constant_node) def to_constant_node(self, value: str) -> ConstantNode: if value in ['null', 'NULL']: value = None elif (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): value = value[1:-1] if not self.is_number(value) and self.is_date(value): value = parse(value) else: # this code is to handle the case when string literal contains escaped single/double quotes value = re.sub(r'\\(["\'])', r"\1", value) elif value == 'true' or value == 'false': value = value == 'true' elif '.' 
in value and self.is_float(value): value = float(value) elif self.is_number(value): value = int(value) else: raise ValueError(f"Unknown literal type: {value}") return ConstantNode(value=value) def is_number(self, value: str) -> bool: try: int(value) return True except ValueError: return False def is_float(self, value: str) -> bool: try: float(value) return True except ValueError: return False def is_date(self, value: str) -> bool: return iso_regex.match(value) or datetime_regex.match(value) ================================================ FILE: keep/api/core/cel_to_sql/properties_mapper.py ================================================ from typing import Optional from keep.api.core.cel_to_sql.ast_nodes import ( ComparisonNode, ComparisonNodeOperator, ConstantNode, DataType, LogicalNode, LogicalNodeOperator, MemberAccessNode, MethodAccessNode, Node, ParenthesisNode, PropertyAccessNode, UnaryNode, UnaryNodeOperator, ) from keep.api.core.cel_to_sql.properties_metadata import ( JsonFieldMapping, PropertiesMetadata, PropertyMetadataInfo, SimpleFieldMapping, ) class JsonPropertyAccessNode(PropertyAccessNode): """ A node representing access to a property within a JSON object. This class extends PropertyAccessNode to allow for the extraction of a specific property from a JSON object using a method access node. Attributes: json_property_name (str): The name of the JSON property to access. property_to_extract (str): The specific property to extract from the JSON object. method_access_node (MethodAccessNode): The method access node used for extraction. 
            (*.contains, *.startsWith, etc)
    """

    def __init__(
        self,
        json_property_name: str,
        property_to_extract: list[str],
        data_type: DataType,
    ):
        # member_name is only a human-readable label built from the JSON column
        # and the (list) path inside it.
        super().__init__(
            member_name=f"JSON({json_property_name}).{property_to_extract}",
        )
        self.json_property_name = json_property_name
        self.property_to_extract = property_to_extract
        self.data_type = data_type

    # Name of the JSON column/property that holds the value.
    json_property_name: Optional[str]
    # Path components of the value inside the JSON document.
    property_to_extract: Optional[list[str]]
    # Declared but never assigned by __init__ in this class.
    method_access_node: Optional[MethodAccessNode]
    data_type: Optional[DataType]


class MultipleFieldsNode(Node):
    """
    A node representing multiple fields in a property access structure.
    It's used when for example one being queried field refers to multiple fields in the database.

    Attributes:
        fields (list[PropertyAccessNode]): A list of PropertyAccessNode instances representing the fields.
        data_type (Optional[DataType]): The data type shared by the fields, if known.

    Args:
        fields (list[PropertyAccessNode]): A list of PropertyAccessNode instances to initialize the node with.
    """

    fields: list[PropertyAccessNode]
    data_type: Optional[DataType]


class PropertiesMappingException(Exception):
    """
    Exception raised for errors in the properties mapping process.

    Attributes:
        message (str): Explanation of the error.
    """

    pass


class PropertiesMapper:
    """
    A class to map properties in an abstract syntax tree (AST) based on provided metadata.

    Attributes:
        properties_metadata (PropertiesMetadata): Metadata containing property mappings.

    Methods:
        __init__(properties_metadata: PropertiesMetadata):
            Initializes the PropertiesMapper with the given properties metadata.
        map_props_in_ast(abstract_node: Node) -> tuple[Node, list[PropertyMetadataInfo]]:
            Maps properties in the given AST node based on the properties metadata.
        __visit_nodes(abstract_node: Node, involved_fields: list[PropertyMetadataInfo]) -> Node:
            Recursively visits and processes nodes in the AST, mapping properties as needed.
        __visit_comparison_node(comparison_node: ComparisonNode, involved_fields: list[PropertyMetadataInfo]) -> Node:
            Visits and processes a comparison node, mapping properties as needed.
        _visit_member_access_node(member_access_node: MemberAccessNode, involved_fields: list[PropertyMetadataInfo]) -> Node:
            Visits and processes a member access node, mapping properties as needed.
        _modify_comparison_node_based_on_mapping(comparison_node: ComparisonNode, mapping: PropertyMetadataInfo) -> Node:
            Modifies a comparison node based on the provided property metadata mapping.
        _create_property_access_node(mapping, method_access_node: MethodAccessNode) -> Node:
            Creates a property access node based on the given mapping and method access node.
        _map_property(property_access_node: PropertyAccessNode) -> tuple[MultipleFieldsNode, PropertyMetadataInfo]:
            Maps a property access node to its corresponding database fields based on the metadata.
    """

    def __init__(self, properties_metadata: PropertiesMetadata):
        self.properties_metadata = properties_metadata

    def map_props_in_ast(
        self, abstract_node: Node
    ) -> tuple[Node, list[PropertyMetadataInfo]]:
        """
        Map every property in the AST to its configured field mapping.

        Returns:
            tuple: The mapped AST and the metadata of the involved fields,
            de-duplicated by field_name (the last occurrence wins, first-seen
            order is kept).
        """
        involved_fields = list[PropertyMetadataInfo]()
        mapped_ast = self.__visit_nodes(abstract_node, involved_fields)

        # A dict keyed by field_name removes duplicates collected during the visit.
        distinct_involved_fields = {
            field.field_name: field for field in involved_fields
        }
        involved_fields = [value for _, value in distinct_involved_fields.items()]

        return mapped_ast, involved_fields

    def __visit_nodes(
        self, abstract_node: Node, involved_fields: list[PropertyMetadataInfo]
    ) -> Node:
        """
        Recursively map the given node, appending the metadata of every mapped
        property to involved_fields.

        May return None when a sub-expression was reduced to "no filtering
        needed" by the comparison mapping.

        Raises:
            NotImplementedError: If the node type is not supported.
        """
        if isinstance(abstract_node, ParenthesisNode):
            # The parenthesis wrapper itself is dropped; only its content is mapped.
            return self.__visit_nodes(abstract_node.expression, involved_fields)

        if isinstance(abstract_node, LogicalNode):
            left = self.__visit_nodes(abstract_node.left, involved_fields)
            right = self.__visit_nodes(abstract_node.right, involved_fields)

            # A None side means that side needs no filtering: collapse the
            # logical node to the other side.
            if left is None:
                return right

            if right is None:
                return left

            return LogicalNode(
                left=left,
                operator=abstract_node.operator,
                right=right,
            )

        if isinstance(abstract_node, ComparisonNode):
            return self.__visit_comparison_node(abstract_node, involved_fields)

        if isinstance(abstract_node, MemberAccessNode):
            return self._visit_member_access_node(abstract_node,
involved_fields) if isinstance(abstract_node, UnaryNode): return self.__visit_unary_node(abstract_node, involved_fields) if isinstance(abstract_node, ConstantNode): return abstract_node raise NotImplementedError( f"{type(abstract_node).__name__} node type is not supported yet" ) def __visit_unary_node( self, abstract_node: UnaryNode, involved_fields: list[PropertyMetadataInfo] ): if abstract_node.operator == UnaryNodeOperator.HAS and isinstance( abstract_node.operand, PropertyAccessNode ): mapped_property, property_metadata = self._map_property( property_access_node=abstract_node.operand, throw_mapping_error=False ) involved_fields.append(property_metadata) return UnaryNode(operator=UnaryNodeOperator.HAS, operand=mapped_property) operand = self.__visit_nodes(abstract_node.operand, involved_fields) if operand is None: return UnaryNode( operator=abstract_node.operator, operand=ConstantNode(value=True) ) return UnaryNode( operator=abstract_node.operator, operand=self.__visit_nodes(abstract_node.operand, involved_fields), ) def __visit_comparison_node( self, comparison_node: ComparisonNode, involved_fields: list[PropertyMetadataInfo], ) -> Node: if not isinstance(comparison_node.first_operand, PropertyAccessNode): return comparison_node first_operand, property_metadata = self._map_property( comparison_node.first_operand ) involved_fields.append(property_metadata) comparison_node = ComparisonNode( first_operand=first_operand, operator=comparison_node.operator, second_operand=comparison_node.second_operand, ) return self._modify_comparison_node_based_on_mapping( comparison_node, property_metadata ) def _visit_member_access_node( self, member_access_node: MemberAccessNode, involved_fields: list[PropertyMetadataInfo], ) -> Node: # in case expression is just property access node # it will behave like !!property in JS # converting queried property to boolean and evaluate as boolean mapped_prop, property_metadata = self._map_property(member_access_node) 
involved_fields.append(property_metadata) return LogicalNode( left=ComparisonNode( first_operand=mapped_prop, operator=ComparisonNodeOperator.NE, second_operand=ConstantNode(value=None), ), operator=LogicalNodeOperator.AND, right=LogicalNode( left=ComparisonNode( first_operand=mapped_prop, operator=ComparisonNodeOperator.NE, second_operand=ConstantNode(value="0"), ), operator=LogicalNodeOperator.AND, right=LogicalNode( left=ComparisonNode( first_operand=mapped_prop, operator=ComparisonNodeOperator.NE, second_operand=ConstantNode(value=False), ), operator=LogicalNodeOperator.AND, right=ComparisonNode( first_operand=mapped_prop, operator=ComparisonNodeOperator.NE, second_operand=ConstantNode(value=""), ), ), ), ) return member_access_node def _modify_comparison_node_based_on_mapping( self, comparison_node: ComparisonNode, mapping: PropertyMetadataInfo ): """ Modifies a comparison node based on the provided property metadata mapping. This method adjusts the comparison node if the property being compared has enumerated values. Specifically, it handles cases where the comparison operator is one of the following: GE (greater than or equal to), GT (greater than), LE (less than or equal to), or LT (less than). If the second operand of the comparison node is not in the enumerated values, it modifies the comparison to use the IN operator with the enumerated values. Additionally, it handles ranges based on the comparison operator and the index of the second operand in the enumerated values. Args: comparison_node (ComparisonNode): The comparison node to be modified. mapping (PropertyMetadataInfo): The property metadata information that includes enumerated values. Returns: ComparisonNode: The modified comparison node, or the original comparison node if no modifications are necessary. 
""" if not isinstance(comparison_node.second_operand, ConstantNode): return comparison_node if mapping.enum_values: if comparison_node.operator in [ ComparisonNodeOperator.GE, ComparisonNodeOperator.GT, ComparisonNodeOperator.LE, ComparisonNodeOperator.LT, ]: if comparison_node.second_operand.value not in mapping.enum_values: if comparison_node.operator in [ ComparisonNodeOperator.LT, ComparisonNodeOperator.LE, ]: return UnaryNode( operator=UnaryNodeOperator.NOT, operand=ComparisonNode( first_operand=comparison_node.first_operand, operator=ComparisonNodeOperator.IN, second_operand=[ ConstantNode(value=item) for item in mapping.enum_values ], ), ) else: return ComparisonNode( first_operand=comparison_node.first_operand, operator=ComparisonNodeOperator.IN, second_operand=[ ConstantNode(value=item) for item in mapping.enum_values ], ) index = mapping.enum_values.index(comparison_node.second_operand.value) ranges = { ComparisonNodeOperator.GT: [index + 1, None], ComparisonNodeOperator.GE: [index, None], ComparisonNodeOperator.LT: [index, None], ComparisonNodeOperator.LE: [index + 1, None], } start_index, end_index = ranges[comparison_node.operator] if ( comparison_node.operator == ComparisonNodeOperator.LE and start_index >= len(mapping.enum_values) ): # it handles the case when queried value is the last in enum # and hence any value is applicable # and there is no need to even do filtering return None if ( comparison_node.operator == ComparisonNodeOperator.GT and start_index >= len(mapping.enum_values) ): # nothig could be greater than the last value in enum # so it will always return False return ConstantNode(value=False) result = ComparisonNode( first_operand=comparison_node.first_operand, operator=ComparisonNodeOperator.IN, second_operand=[ ConstantNode(value=item) for item in mapping.enum_values[start_index:end_index] ], ) if comparison_node.operator in [ ComparisonNodeOperator.LT, ComparisonNodeOperator.LE, ]: result = UnaryNode(operator=UnaryNodeOperator.NOT, 
                                       operand=result)

                return result

        return comparison_node

    def _create_property_access_node(
        self, mapping, data_type: type, method_access_node: MethodAccessNode
    ) -> Node:
        """
        Build the access node for a single field mapping.

        A JsonFieldMapping produces a JsonPropertyAccessNode, a
        SimpleFieldMapping a PropertyAccessNode with a one-element path.
        The method_access_node argument is not used by this method.

        Raises:
            NotImplementedError: If the mapping type is not supported.
        """
        if isinstance(mapping, JsonFieldMapping):
            return JsonPropertyAccessNode(
                json_property_name=mapping.json_prop,
                property_to_extract=mapping.prop_in_json,
                data_type=data_type,
            )

        if isinstance(mapping, SimpleFieldMapping):
            return PropertyAccessNode(
                path=[mapping.map_to],
                data_type=data_type,
            )

        raise NotImplementedError(f"Mapping type {type(mapping).__name__} is not supported yet")

    def _map_property(
        self, property_access_node: PropertyAccessNode, throw_mapping_error=True
    ) -> tuple[MultipleFieldsNode, PropertyMetadataInfo]:
        """
        Map a property access node to the field(s) configured for its path.

        Returns:
            tuple: The mapped node (a MultipleFieldsNode when the property maps
            to more than one field, otherwise the single access node) and the
            property metadata.

        Raises:
            PropertiesMappingException: If no mapping configuration exists for
                the property and throw_mapping_error is True. When it is False,
                the original node is returned with metadata mapping the dotted
                path to itself.
        """
        property_metadata = self.properties_metadata.get_property_metadata(
            property_access_node.path
        )

        if not property_metadata:
            joined_path = ".".join(property_access_node.path)

            if not throw_mapping_error:
                return property_access_node, PropertyMetadataInfo(
                    field_name=joined_path,
                    field_mappings=[SimpleFieldMapping(joined_path)],
                    enum_values=None,
                )

            raise PropertiesMappingException(
                f'Missing mapping configuration for property "{joined_path}"'
            )

        result = []

        for mapping in property_metadata.field_mappings:
            # NOTE: the parameter name is rebound here to the newly created node.
            property_access_node = self._create_property_access_node(
                mapping, property_metadata.data_type, None
            )
            result.append(property_access_node)
        return (
            MultipleFieldsNode(fields=result, data_type=property_metadata.data_type)
            if len(result) > 1
            else result[0]
        ), property_metadata


================================================
FILE: keep/api/core/cel_to_sql/properties_metadata.py
================================================
import fnmatch
import re

from keep.api.core.cel_to_sql.ast_nodes import DataType


class SimpleFieldMapping:
    # Maps a property directly to the field named by map_to.
    def __init__(self, map_to: str):
        self.map_to = map_to


class JsonFieldMapping:
    # Maps a property to the path prop_in_json inside the JSON field json_prop.
    def __init__(self, json_prop: str, prop_in_json: list[str]):
        self.json_prop = json_prop
        self.prop_in_json = prop_in_json


class PropertyMetadataInfo:
    # Resolved mapping information for one concrete property path.
    def __init__(
        self,
        field_name: str,
        field_mappings: list[SimpleFieldMapping | JsonFieldMapping],
        enum_values: list[str],
        data_type: DataType = None,
    ):
        self.field_name = field_name
        self.field_mappings = field_mappings
        self.enum_values = enum_values
        self.data_type = data_type


class FieldMappingConfiguration:
    # Declarative mapping rule: properties matching map_from_pattern are mapped
    # to the field expression(s) in map_to.
    def __init__(
        self,
        map_from_pattern: str,
        map_to: list[str] | str,
        data_type: DataType = None,
        enum_values: list[str] = None,
    ):
        self.map_from_pattern = map_from_pattern
        self.enum_values = enum_values
        self.data_type = data_type
        # map_to is always stored as a list, even when a single string is given.
        self.map_to: list[str] = map_to if isinstance(map_to, list) else [map_to]


def remap_fields_configurations(
    mapping_rules: dict[str, str], field_configurations: list[FieldMappingConfiguration]
) -> list[FieldMappingConfiguration]:
    """
    Remaps the 'map_to' fields in the given field configurations based on the provided mapping rules.

    The rules are applied with plain substring replacement (str.replace), one
    rule after another in dict order, on copies of the configurations; the
    input configurations are not modified.

    Args:
        mapping_rules (dict[str, str]): A dictionary where keys are the patterns to be replaced
                                        and values are the new patterns.
        field_configurations (list[FieldMappingConfiguration]): A list of FieldMappingConfiguration
                                                                objects to be remapped.
    Returns:
        list[FieldMappingConfiguration]: A new list of FieldMappingConfiguration objects with
                                         updated 'map_to' fields.
    """
    result: list[FieldMappingConfiguration] = [
        FieldMappingConfiguration(
            map_from_pattern=item.map_from_pattern,
            map_to=item.map_to,
            enum_values=item.enum_values,
            data_type=item.data_type,
        )
        for item in field_configurations
    ]

    for map_from, map_to in mapping_rules.items():
        for field_config in result:
            # A new list is assigned, so the list shared with the source
            # configuration is never mutated.
            field_config.map_to = [
                item.replace(map_from, map_to) for item in field_config.map_to
            ]

    return result


class PropertiesMetadata:
    """
    A class to handle metadata properties and mappings for given property paths.

    Attributes:
        known_fields_mapping (dict): A dictionary containing known field mappings.
        known_fields_wildcards (dict): A dictionary containing wildcard patterns from known field mappings.
Methods: __init__(known_fields_mapping: dict): Initializes the PropertiesMetadata with known field mappings. get_property_metadata(prop_path: str): Retrieves the metadata for a given property path. If the property path matches a known field or a wildcard pattern, it returns the corresponding mappings. Supports JSON type mappings and simple field mappings. """ def __init__(self, fields_mapping_configurations: list[FieldMappingConfiguration]): self.wildcard_configurations: dict[FieldMappingConfiguration] = {} self.known_configurations: dict[FieldMappingConfiguration] = {} for field_mapping in fields_mapping_configurations: new_field_mapping_config = FieldMappingConfiguration( map_from_pattern=self.__get_property_path_str( self.__extract_fields(field_mapping.map_from_pattern) ), map_to=field_mapping.map_to, data_type=field_mapping.data_type, enum_values=field_mapping.enum_values, ) if '*' in field_mapping.map_from_pattern: self.wildcard_configurations[ new_field_mapping_config.map_from_pattern ] = new_field_mapping_config continue self.known_configurations[new_field_mapping_config.map_from_pattern] = ( new_field_mapping_config ) def get_property_metadata_for_str(self, prop_path_str: str) -> PropertyMetadataInfo: return self.get_property_metadata(self.__extract_fields(prop_path_str)) def get_property_metadata(self, prop_path: list[str]) -> PropertyMetadataInfo: prop_path_str = self.__get_property_path_str(prop_path) field_mapping_config, mapping_key = self.__find_mapping_configuration( prop_path_str ) if not field_mapping_config: return None field_mappings = [] map_to: list[str] = ( field_mapping_config.map_to if isinstance(field_mapping_config.map_to, list) else [field_mapping_config.map_to] ) template_prop = None if "*" in mapping_key: # if mapping_key is a wildcard pattern (alert.*), extract the template prop (alert) regex_pattern = re.escape(mapping_key).replace(r"\*", r"(.*)") regex = re.compile(f"^{regex_pattern}$") match = regex.match(prop_path_str) 
template_prop = match.group(1) else: # otherwise, the template prop is the prop_path itself template_prop = prop_path_str for item in map_to: match = re.match(r"JSON\(([^)]+)\)", item) # If first element is a JSON mapping (JSON(event).tagsContainer.*) # we extract JSON column (event) and replace * with prop_in_json if match: json_prop = match.group(1) splitted = item.replace(f"JSON({json_prop})", "").split(".") prop_in_json_list = [spl for spl in splitted] if "*" in splitted: prop_in_json_list[splitted.index("*")] = template_prop else: prop_in_json_list.append(template_prop) field_mappings.append( JsonFieldMapping( json_prop=json_prop, prop_in_json=self.__extract_fields( ".".join(prop_in_json_list[1:]) ), # skip JSON column and take the rest ) ) continue splitted = item.split(".") field_mappings.append(SimpleFieldMapping(item)) return PropertyMetadataInfo( field_name=prop_path_str, field_mappings=field_mappings, enum_values=field_mapping_config.enum_values, data_type=field_mapping_config.data_type, ) def __extract_fields(self, property_path_str): """ Extracts fields from a property path string. This method takes a property path string and extracts individual fields from it. The property path string can contain fields separated by dots or enclosed in square brackets. Args: property_path_str (str): The property path string to extract fields from. Returns: list: A list of extracted fields as strings. """ pattern = re.compile(r"\[([^\[\]]+)\]|([^.]+)") matches = pattern.findall(property_path_str) return [m[0] or m[1] for m in matches] def __get_property_path_str(self, prop_path: list[str]) -> str: """ Converts a list of property path components into a single string, ensuring that components with special characters are enclosed in square brackets. Args: prop_path (list[str]): A list of strings representing the property path components. Returns: str: A single string representing the property path, with special characters handled appropriately. 
""" result = [] for item in prop_path: if re.search(r"[^a-zA-Z0-9*]", item): result.append(f"[{item}]") else: result.append(item) return ".".join(result) def __find_mapping_configuration(self, prop_path_str: str): """ Find the mapping configuration for a given property path. This method searches for a direct mapping configuration in the known configurations. If no direct mapping is found, it checks for wildcard patterns in the wildcard configurations. Args: prop_path (str): The property path to find the mapping configuration for. Returns: tuple: A tuple containing the FieldMappingConfiguration and the mapping key. If no configuration is found, both elements of the tuple will be None. """ field_mapping_config: FieldMappingConfiguration = None mapping_key = None if prop_path_str in self.known_configurations: field_mapping_config = self.known_configurations[prop_path_str] mapping_key = prop_path_str # If no direct mapping is found, check for wildcard patterns in known fields if not field_mapping_config: for pattern, field_mapping_config_from_dict in self.wildcard_configurations.items(): if fnmatch.fnmatch(prop_path_str, pattern): field_mapping_config = field_mapping_config_from_dict mapping_key = pattern break return field_mapping_config, mapping_key ================================================ FILE: keep/api/core/cel_to_sql/sql_providers/base.py ================================================ from typing import Any, List from sqlalchemy import Dialect, String from keep.api.core.cel_to_sql.ast_nodes import ( ComparisonNodeOperator, ConstantNode, DataType, LogicalNodeOperator, MemberAccessNode, Node, LogicalNode, ComparisonNode, UnaryNode, PropertyAccessNode, ParenthesisNode, UnaryNodeOperator, from_type_to_data_type, ) from keep.api.core.cel_to_sql.cel_ast_converter import CelToAstConverter from keep.api.core.cel_to_sql.properties_mapper import JsonPropertyAccessNode, MultipleFieldsNode, PropertiesMapper, PropertiesMappingException from 
class CelToSqlException(Exception):
    """Raised when a CEL expression cannot be converted to SQL."""

    pass


class CelToSqlResult:
    """
    Result of a CEL to SQL conversion.

    Attributes:
        sql (str): The generated SQL filter ("" for an empty CEL expression).
        involved_fields (List[PropertyMetadataInfo]): Metadata of the fields returned
            by the properties mapper for the expression.
    """

    def __init__(self, sql: str, involved_fields: List[PropertyMetadataInfo]):
        self.sql = sql
        self.involved_fields = involved_fields


class BaseCelToSqlProvider:
    """
    Base class for converting CEL (Common Expression Language) expressions to SQL strings.

    The conversion parses the CEL expression into an AST, maps CEL properties to
    database fields through ``PropertiesMapper`` and then walks the mapped AST with
    the ``_visit_*`` methods to produce the SQL text.

    Dialect specific behaviour must be provided by child classes; in this class the
    following methods only raise NotImplementedError:
        json_extract_as_text, _json_contains_path, cast,
        _visit_equal_for_array_datatype, _visit_in_for_array_datatype,
        _visit_contains_method_calling, _visit_starts_with_method_calling,
        _visit_ends_with_method_calling, _visit_index_property.
    """

    def __init__(self, dialect: Dialect, properties_metadata: PropertiesMetadata):
        super().__init__()
        # dialect aware processor that quotes/escapes string literals
        self.__literal_proc = String("").literal_processor(dialect=dialect)
        self.properties_metadata = properties_metadata
        self.properties_mapper = PropertiesMapper(properties_metadata)

    def convert_to_sql_str(self, cel: str) -> str:
        """Converts a CEL expression to an SQL string (see ``convert_to_sql_str_v2``)."""
        return self.convert_to_sql_str_v2(cel).sql

    def convert_to_sql_str_v2(self, cel: str) -> CelToSqlResult:
        """
        Converts a CEL (Common Expression Language) expression to SQL.

        Args:
            cel (str): The CEL expression to convert.

        Returns:
            CelToSqlResult: The SQL string together with the involved fields. The SQL is
            an empty string if the CEL expression is empty or maps to nothing.

        Raises:
            CelToSqlException: If there is an error parsing the CEL expression, mapping
            properties, or building the SQL filter.
        """
        if not cel:
            return CelToSqlResult(sql="", involved_fields=[])

        try:
            original_query = CelToAstConverter.convert_to_ast(cel)
        except CELParseError as e:
            raise CelToSqlException(f"Error parsing CEL expression: {str(e)}") from e

        try:
            with_mapped_props, involved_fields = (
                self.properties_mapper.map_props_in_ast(original_query)
            )
        except PropertiesMappingException as e:
            raise CelToSqlException(f"Error while mapping columns: {str(e)}") from e

        if not with_mapped_props:
            return CelToSqlResult(sql="", involved_fields=[])

        try:
            sql_filter = self._build_sql_filter(with_mapped_props, [])
            return CelToSqlResult(sql=sql_filter, involved_fields=involved_fields)
        except NotImplementedError as e:
            raise CelToSqlException(
                f"Error while converting CEL expression tree to SQL: {str(e)}"
            ) from e

    def get_order_by_expression(self, sort_options: list[tuple[str, str]]) -> str:
        """
        Builds the body of an ORDER BY clause.

        Args:
            sort_options: (cel_field, direction) pairs. Direction is case-insensitive;
                anything other than "asc" is treated as descending.
        """
        sort_expressions: list[str] = []

        for sort_by, sort_dir in sort_options:
            direction = "ASC" if sort_dir.lower() == "asc" else "DESC"
            sort_expressions.append(f"{self.get_field_expression(sort_by)} {direction}")

        return ", ".join(sort_expressions)

    def get_field_expression(self, cel_field: str) -> str:
        """
        Returns the SQL expression that reads the given CEL field.

        When the field is mapped to several database fields they are combined
        with ``coalesce``.

        Raises:
            CelToSqlException: If no mapping exists for the field.
            ValueError: If a field mapping has an unsupported type.
        """
        metadata = self.properties_metadata.get_property_metadata_for_str(cel_field)

        if metadata is None:
            raise CelToSqlException(f"No field mapping found for '{cel_field}'")

        field_expressions = []

        for field_mapping in metadata.field_mappings:
            if isinstance(field_mapping, JsonFieldMapping):
                field_expressions.append(
                    self.json_extract_as_text(
                        field_mapping.json_prop, field_mapping.prop_in_json
                    )
                )
                continue
            elif isinstance(field_mapping, SimpleFieldMapping):
                field_expressions.append(field_mapping.map_to)
                continue

            raise ValueError(f"Unsupported field mapping type: {type(field_mapping)}")

        if len(field_expressions) > 1:
            return self.coalesce(field_expressions)
        else:
            return field_expressions[0]

    def literal_proc(self, value: Any) -> str:
        """Renders a value as an SQL literal; strings go through the dialect processor."""
        if isinstance(value, str):
            return self.__literal_proc(value)

        return f"'{str(value)}'"

    def _get_order_by_field(self, cel_sort_by: str) -> str:
        return self.get_field_expression(cel_sort_by)

    def _build_sql_filter(self, abstract_node: Node, stack: list[Node]) -> str:
        """
        Converts an AST node to SQL by dispatching on the node type.

        The node is pushed on ``stack`` while it is being visited and popped once a
        result was produced.

        Raises:
            NotImplementedError: If no visitor produced a result for the node.
        """
        stack.append(abstract_node)
        result = None

        # Independent checks (not elif): when a node matches several types
        # the last matching visitor wins.
        if isinstance(abstract_node, ParenthesisNode):
            result = self._visit_parentheses(
                self._build_sql_filter(abstract_node.expression, stack)
            )

        if isinstance(abstract_node, LogicalNode):
            result = self._visit_logical_node(abstract_node, stack)

        if isinstance(abstract_node, ComparisonNode):
            result = self._visit_comparison_node(abstract_node, stack)

        if isinstance(abstract_node, MemberAccessNode):
            result = self._visit_member_access_node(abstract_node, stack)

        if isinstance(abstract_node, UnaryNode):
            result = self._visit_unary_node(abstract_node, stack)

        if isinstance(abstract_node, ConstantNode):
            result = self._visit_constant_node(abstract_node.value)

        if isinstance(abstract_node, MultipleFieldsNode):
            result = self._visit_multiple_fields_node(abstract_node, None, stack)

        if result:
            stack.pop()
            return result

        raise NotImplementedError(
            f"{type(abstract_node).__name__} node type is not supported yet"
        )

    def json_extract_as_text(self, column: str, path: list[str]) -> str:
        raise NotImplementedError("Extracting JSON is not implemented. Must be implemented in the child class.")

    def _json_contains_path(self, column: str, path: list[str]) -> str:
        raise NotImplementedError(
            "Extracting JSON is not implemented. Must be implemented in the child class."
        )

    def coalesce(self, args):
        """Wraps the expressions in COALESCE; a single expression is returned as is."""
        if len(args) == 1:
            return args[0]

        return f"COALESCE({', '.join(args)})"

    def cast(self, expression_to_cast: str, to_type: DataType, force=False) -> str:
        raise NotImplementedError("CAST is not implemented. Must be implemented in the child class.")

    def _visit_parentheses(self, node: str) -> str:
        return f"({node})"

    # region Logical Visitors
    def _visit_logical_node(self, logical_node: LogicalNode, stack: list[Node]) -> str:
        left = self._build_sql_filter(logical_node.left, stack)
        right = self._build_sql_filter(logical_node.right, stack)

        if logical_node.operator == LogicalNodeOperator.AND:
            return self._visit_logical_and(left, right)
        elif logical_node.operator == LogicalNodeOperator.OR:
            return self._visit_logical_or(left, right)

        raise NotImplementedError(
            f"{logical_node.operator} logical operator is not supported yet"
        )

    def _visit_logical_and(self, left: str, right: str) -> str:
        return f"({left} AND {right})"

    def _visit_logical_or(self, left: str, right: str) -> str:
        return f"({left} OR {right})"

    # endregion

    # region Comparison Visitors
    def _visit_comparison_node(self, comparison_node: ComparisonNode, stack: list[Node]) -> str:
        """
        Converts a comparison node to SQL.

        IN and array equality are delegated to dedicated visitors. For the remaining
        operators the operands are rendered and, except for contains/startsWith/endsWith,
        the first operand is cast to the data type of the constant second operand when
        it is JSON based or has no known data type.
        """
        first_operand = None
        second_operand = None
        # string methods work on the raw text, so no casting is involved for them
        should_cast = comparison_node.operator not in [
            ComparisonNodeOperator.CONTAINS,
            ComparisonNodeOperator.STARTS_WITH,
            ComparisonNodeOperator.ENDS_WITH,
        ]
        first_operand_data_type = None
        second_operand_data_type = None
        force_cast = False

        if comparison_node.operator == ComparisonNodeOperator.IN:
            in_values = (
                comparison_node.second_operand
                if isinstance(comparison_node.second_operand, list)
                else [comparison_node.second_operand]
            )

            if (
                isinstance(comparison_node.first_operand, PropertyAccessNode)
                and comparison_node.first_operand.data_type == DataType.ARRAY
            ):
                return self._visit_in_for_array_datatype(
                    comparison_node.first_operand, in_values, stack
                )

            return self._visit_in(comparison_node.first_operand, in_values, stack)

        if (
            comparison_node.operator == ComparisonNodeOperator.EQ
            and isinstance(comparison_node.first_operand, PropertyAccessNode)
            and comparison_node.first_operand.data_type == DataType.ARRAY
        ):
            return self._visit_equal_for_array_datatype(
                comparison_node.first_operand,
                comparison_node.second_operand,
            )

        if should_cast:
            if isinstance(comparison_node.first_operand, PropertyAccessNode):
                first_operand_data_type = comparison_node.first_operand.data_type

            if isinstance(comparison_node.first_operand, JsonPropertyAccessNode):
                first_operand_data_type = comparison_node.first_operand.data_type
                force_cast = True

            if isinstance(comparison_node.first_operand, MultipleFieldsNode):
                first_operand_data_type = comparison_node.first_operand.data_type
                force_cast = isinstance(
                    comparison_node.first_operand.fields[0], JsonPropertyAccessNode
                )

            if isinstance(comparison_node.second_operand, ConstantNode):
                second_operand_data_type = from_type_to_data_type(
                    type(comparison_node.second_operand.value)
                )
                second_operand = self._visit_constant_node(
                    comparison_node.second_operand.value,
                    first_operand_data_type,
                )

        if first_operand is None:
            first_operand = self._build_sql_filter(comparison_node.first_operand, stack)

        if second_operand is None:
            second_operand = self._build_sql_filter(
                comparison_node.second_operand, stack
            )

        if force_cast or (not first_operand_data_type and second_operand_data_type):
            first_operand = self.cast(
                first_operand,
                second_operand_data_type,
            )

        if comparison_node.operator == ComparisonNodeOperator.EQ:
            result = self._visit_equal(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.NE:
            result = self._visit_not_equal(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.GT:
            result = self._visit_greater_than(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.GE:
            result = self._visit_greater_than_or_equal(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.LT:
            result = self._visit_less_than(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.LE:
            result = self._visit_less_than_or_equal(first_operand, second_operand)
        elif comparison_node.operator == ComparisonNodeOperator.CONTAINS:
            result = self._visit_contains_method_calling(
                first_operand, [comparison_node.second_operand]
            )
        elif comparison_node.operator == ComparisonNodeOperator.STARTS_WITH:
            result = self._visit_starts_with_method_calling(
                first_operand, [comparison_node.second_operand]
            )
        elif comparison_node.operator == ComparisonNodeOperator.ENDS_WITH:
            result = self._visit_ends_with_method_calling(
                first_operand, [comparison_node.second_operand]
            )
        else:
            raise NotImplementedError(
                f"{comparison_node.operator} comparison operator is not supported yet"
            )

        return result

    def _visit_equal(self, first_operand: str, second_operand: str) -> str:
        if second_operand == "NULL":
            return f"{first_operand} IS NULL"

        return f"{first_operand} = {second_operand}"

    def _visit_equal_for_array_datatype(
        self, first_operand: Node, second_operand: Node
    ) -> str:
        raise NotImplementedError(
            "Array datatype comparison is not implemented. Must be implemented in the child class."
        )

    def _visit_not_equal(self, first_operand: str, second_operand: str) -> str:
        if second_operand == "NULL":
            return f"{first_operand} IS NOT NULL"

        return f"{first_operand} != {second_operand}"

    def _visit_greater_than(self, first_operand: str, second_operand: str) -> str:
        return f"{first_operand} > {second_operand}"

    def _visit_greater_than_or_equal(self, first_operand: str, second_operand: str) -> str:
        return f"{first_operand} >= {second_operand}"

    def _visit_less_than(self, first_operand: str, second_operand: str) -> str:
        return f"{first_operand} < {second_operand}"

    def _visit_less_than_or_equal(self, first_operand: str, second_operand: str) -> str:
        return f"{first_operand} <= {second_operand}"

    def _visit_in(self, first_operand: Node, array: list[ConstantNode], stack: list[Node]) -> str:
        """
        Converts an IN operation to SQL.

        None values cannot be part of an SQL IN list, so they are split off and
        rendered as an additional ``IS NULL`` check that is OR-ed with the IN list.
        An empty list can never match and is rendered as the FALSE constant.
        """
        if not array:
            # nothing to compare against (and array[0] below would fail)
            return self._visit_constant_node(False)

        constant_value_type = type(array[0].value)
        cast_to = None

        # constants of mixed types are compared as strings
        if not all(isinstance(item.value, constant_value_type) for item in array):
            cast_to = DataType.STRING

        if isinstance(first_operand, JsonPropertyAccessNode):
            first_operand_str = self._visit_property_access_node(first_operand, stack)
            if first_operand.data_type:
                first_operand_str = self.cast(
                    first_operand_str, first_operand.data_type
                )
        elif isinstance(first_operand, PropertyAccessNode):
            first_operand_str = self._visit_property_access_node(first_operand, stack)
            if cast_to:
                first_operand_str = self.cast(first_operand_str, cast_to)
        elif isinstance(first_operand, MultipleFieldsNode):
            first_operand_str = self._visit_multiple_fields_node(
                first_operand, None, stack
            )
            has_json_field = any(
                isinstance(item, JsonPropertyAccessNode)
                for item in first_operand.fields
            )
            if has_json_field and first_operand.data_type:
                first_operand_str = self.cast(
                    first_operand_str, first_operand.data_type
                )
        else:
            first_operand_str = self._build_sql_filter(first_operand, stack)

        constant_nodes_without_none = []
        is_none_found = False

        for item in array:
            if isinstance(item, ConstantNode):
                if item.value is None:
                    is_none_found = True
                    continue
                constant_nodes_without_none.append(item)

        or_queries = []

        if len(constant_nodes_without_none) > 0:
            expected_data_type = self._get_data_type_to_convert(first_operand)
            in_list = ", ".join(
                self._visit_constant_node(c.value, expected_data_type)
                for c in constant_nodes_without_none
            )
            or_queries.append(f"{first_operand_str} in ({in_list})")

        if is_none_found:
            or_queries.append(self._visit_equal(first_operand_str, "NULL"))

        if len(or_queries) == 0:
            return self._visit_constant_node(False)

        final_query = or_queries[0]

        for query in or_queries[1:]:
            final_query = self._visit_logical_or(final_query, query)

        return final_query

    def _visit_in_for_array_datatype(
        self, first_operand: Node, array: list[ConstantNode], stack: list[Node]
    ) -> str:
        raise NotImplementedError(
            "Array datatype IN operator is not implemented. Must be implemented in the child class."
        )

    def _visit_contains_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        raise NotImplementedError(
            "'contains' method must be implemented in the child class"
        )

    def _visit_starts_with_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        raise NotImplementedError(
            "'startsWith' method call must be implemented in the child class"
        )

    def _visit_ends_with_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        raise NotImplementedError(
            "'endsWith' method call must be implemented in the child class"
        )

    # endregion

    def _visit_constant_node(
        self, value: Any, expected_data_type: DataType = None
    ) -> str:
        """
        Renders a constant as SQL. ``expected_data_type`` is not used here; it is part
        of the signature so that child classes can take it into account.
        """
        if value is None:
            return "NULL"
        if isinstance(value, str):
            return self.literal_proc(value)
        # bool must be checked before int because bool is a subclass of int
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, float) or isinstance(value, int):
            return str(value)

        raise NotImplementedError(f"{type(value).__name__} constant type is not supported yet. Consider implementing this support in child class.")

    def _get_data_type_to_convert(self, node: Node) -> DataType:
        """
        Extracts data type from node.
        The data type will be used to convert the value of constant node into the expected type (SQL type).
        """
        if isinstance(node, PropertyAccessNode):
            return node.data_type

        if isinstance(node, MultipleFieldsNode):
            return node.data_type

        if isinstance(node, ComparisonNode):
            return self._get_data_type_to_convert(node.first_operand)

        raise NotImplementedError(
            f"Cannot find data type to convert for {type(node).__name__} node"
        )

    # region Member Access Visitors
    def _visit_multiple_fields_node(
        self, multiple_fields_node: MultipleFieldsNode, cast_to: DataType, stack
    ) -> str:
        """
        Renders every field of the node and combines them with ``coalesce``.
        JSON based fields are cast to ``cast_to`` when it is provided.
        """
        coalesce_args = []

        for item in multiple_fields_node.fields:
            arg = self._visit_property_access_node(item, stack)
            if isinstance(item, JsonPropertyAccessNode) and cast_to:
                arg = self.cast(arg, cast_to)
            coalesce_args.append(arg)

        if len(coalesce_args) == 1:
            return coalesce_args[0]

        return self.coalesce(coalesce_args)

    def _visit_member_access_node(self, member_access_node: MemberAccessNode, stack) -> str:
        if isinstance(member_access_node, PropertyAccessNode):
            return self._visit_property_access_node(member_access_node, stack)

        raise NotImplementedError(
            f"{type(member_access_node).__name__} member access node is not supported yet"
        )

    def _visit_property_access_node(self, property_access_node: PropertyAccessNode, stack: list[Node]) -> str:
        if isinstance(property_access_node, JsonPropertyAccessNode):
            return self.json_extract_as_text(
                property_access_node.json_property_name,
                property_access_node.property_to_extract,
            )

        return ".".join([f"{item}" for item in property_access_node.path])

    def _visit_index_property(self, property_path: str) -> str:
        raise NotImplementedError("Index property is not supported yet")

    # endregion

    # region Unary Visitors
    def _visit_unary_node(self, unary_node: UnaryNode, stack: list[Node]) -> str:
        if unary_node.operator == UnaryNodeOperator.NOT:
            return self._visit_unary_not(unary_node.operand, stack)

        if unary_node.operator == UnaryNodeOperator.HAS:
            return self._visit_unary_has(unary_node.operand, stack)

        raise NotImplementedError(
            f"{unary_node.operator} unary operator is not supported yet"
        )

    def _visit_unary_not(self, operand: Node, stack) -> str:
        return f"NOT ({self._build_sql_filter(operand, stack)})"

    def _visit_unary_has(self, operand: Node, stack) -> str:
        if isinstance(operand, JsonPropertyAccessNode):
            return self._json_contains_path(
                operand.json_property_name, operand.property_to_extract
            )

        if isinstance(operand, PropertyAccessNode):
            # In case when it's simple property access and property metadata exists for path, we match all rows (return TRUE)
            # otherwise, we filter out all rows (return FALSE)
            return (
                "TRUE"
                if self.properties_metadata.get_property_metadata(operand.path)
                else "FALSE"
            )

        if isinstance(operand, MultipleFieldsNode):
            # has() on multiple fields is true when any of the fields is present
            return self._build_sql_filter(
                self.__convert_to_or(
                    [
                        UnaryNode(operator=UnaryNodeOperator.HAS, operand=field)
                        for field in operand.fields
                    ]
                ),
                stack,
            )

        return "FALSE"

    def __convert_to_or(self, expressions: list[Node]) -> Node:
        """
        Folds a list of expressions into a left-nested chain of OR nodes.

        Args:
            expressions (list[Node]): The expressions to combine.

        Returns:
            Node: The single expression when only one was given, otherwise a
            LogicalNode; None for an empty list.
        """
        node = None
        for expression in expressions:
            if node is None:
                node = expression
                continue

            node = LogicalNode(
                left=node,
                operator=LogicalNodeOperator.OR,
                right=expression,
            )

        return node

    # endregion
class CelToMySqlProvider(BaseCelToSqlProvider):
    """CEL to SQL provider that generates MySQL flavoured SQL."""

    def json_extract_as_text(self, column: str, path: list[str]) -> str:
        return f"JSON_UNQUOTE({self._json_extract(column, path)})"

    def _json_contains_path(self, column: str, path: list[str]) -> str:
        property_path_str = ".".join([f'"{item}"' for item in path])
        return f"JSON_CONTAINS_PATH({column}, 'one', '$.{property_path_str}')"

    def cast(self, expression_to_cast: str, to_type, force=False):
        """
        Casts an SQL expression to the given data type.

        BOOLEAN is always rendered as a CASE expression whose branches are evaluated
        in the order listed below. Other types are cast only when ``force`` is set;
        INTEGER and FLOAT get an explicit CAST, everything else is returned unchanged.
        """
        if to_type == DataType.BOOLEAN:
            cast_conditions = {
                # f"{expression_to_cast} is NULL": "FALSE",
                f"LOWER({expression_to_cast}) = 'true'": "TRUE",
                f"LOWER({expression_to_cast}) = 'false'": "FALSE",
                f"CAST({expression_to_cast} AS SIGNED) >= 1": "TRUE",
                f"CAST({expression_to_cast} AS SIGNED) <= 1": "FALSE",
                f"{expression_to_cast} != ''": "TRUE",
            }
            result = " ".join(
                [f"WHEN {key} THEN {value}" for key, value in cast_conditions.items()]
            )
            result = f"CASE {result} ELSE FALSE END"
            return result

        if not force:
            # MySQL does not need explicit cast for other than boolean because it does it implicitly
            # so if not forced, we return the expression as is
            return expression_to_cast

        if to_type == DataType.INTEGER:
            return f"CAST({expression_to_cast} AS SIGNED)"
        elif to_type == DataType.FLOAT:
            return f"CAST({expression_to_cast} AS DOUBLE)"
        else:
            return expression_to_cast

    def _json_extract(self, column: str, path: list[str]) -> str:
        property_path_str = ".".join([f'"{item}"' for item in path])
        return f"JSON_EXTRACT({column}, '$.{property_path_str}')"

    def get_order_by_expression(self, sort_options: list[tuple[str, str]]) -> str:
        """
        Builds the body of an ORDER BY clause using ``_get_order_by_field``.
        Any direction other than "asc" (case-insensitive) is treated as descending.
        """
        sort_expressions: list[str] = []

        for sort_by, sort_dir in sort_options:
            direction = "ASC" if sort_dir.lower() == "asc" else "DESC"
            sort_expressions.append(f"{self._get_order_by_field(sort_by)} {direction}")

        return ", ".join(sort_expressions)

    def _get_order_by_field(self, cel_sort_by: str):
        """Overridden, because for MySQL we need to use JSON_EXTRACT without JSON_UNQUOTE for sorting to work as expected"""
        metadata = self.properties_metadata.get_property_metadata_for_str(cel_sort_by)
        field_expressions = []

        for field_mapping in metadata.field_mappings:
            if isinstance(field_mapping, JsonFieldMapping):
                field_expressions.append(
                    self._json_extract(
                        field_mapping.json_prop, field_mapping.prop_in_json
                    )
                )
                continue
            elif isinstance(field_mapping, SimpleFieldMapping):
                field_expressions.append(field_mapping.map_to)
                continue

            raise ValueError(f"Unsupported field mapping type: {type(field_mapping)}")

        if len(field_expressions) > 1:
            return self.coalesce(field_expressions)
        else:
            return field_expressions[0]

    def _visit_constant_node(
        self, value, expected_data_type: DataType = None
    ) -> str:
        """
        Renders a constant as MySQL SQL: UUID values are normalized to their hex
        form, datetimes are cast to DATETIME and booleans become TRUE/FALSE.
        """
        if expected_data_type is DataType.UUID:
            str_value = str(value)
            try:
                # Because MySQL works with UUID without dashes, we need to convert it to a hex string
                # Example: 123e4567-e89b-12d3-a456-426614174000 -> 123e4567e89b12d3a456426614174000
                # Example2: 123e4567e89b12d3a456426614174000 -> 123e4567e89b12d3a456426614174000 (hex in CEL is also supported)
                value = UUID(str_value).hex
            except ValueError:
                # not a UUID: keep the value as it was provided
                pass

        if isinstance(value, datetime):
            date_str = self.literal_proc(value.strftime("%Y-%m-%d %H:%M:%S"))
            date_exp = f"CAST({date_str} as DATETIME)"
            return date_exp
        elif isinstance(value, bool):
            return "TRUE" if value else "FALSE"

        return super()._visit_constant_node(value, expected_data_type)

    def _visit_like_method_calling(
        self,
        property_path: str,
        method_name: str,
        method_args: List[ConstantNode],
        prefix: str,
        suffix: str,
    ) -> str:
        """
        Shared builder of the case-insensitive LIKE condition used by
        contains/startsWith/endsWith. ``prefix``/``suffix`` are the LIKE wildcards
        placed around the (lower-cased) search value.

        Raises:
            ValueError: If the number of arguments is not exactly one.
        """
        if len(method_args) != 1:
            raise ValueError(
                f"{property_path}.{method_name} accepts 1 argument but got {len(method_args)}"
            )

        raw_value = method_args[0].value
        value = raw_value.lower() if isinstance(raw_value, str) else raw_value
        # literal_proc returns a quoted literal; the surrounding quotes are dropped
        # because the value is embedded into the LIKE pattern literal
        unquoted_literal = self.literal_proc(value)[1:-1]
        return (
            f"{property_path} IS NOT NULL AND "
            f"LOWER({property_path}) LIKE '{prefix}{unquoted_literal}{suffix}'"
        )

    def _visit_contains_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        return self._visit_like_method_calling(
            property_path, "contains", method_args, "%", "%"
        )

    def _visit_starts_with_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        return self._visit_like_method_calling(
            property_path, "startsWith", method_args, "", "%"
        )

    def _visit_ends_with_method_calling(
        self, property_path: str, method_args: List[ConstantNode]
    ) -> str:
        return self._visit_like_method_calling(
            property_path, "endsWith", method_args, "%", ""
        )

    def _visit_equal_for_array_datatype(
        self, first_operand: Node, second_operand: Node
    ) -> str:
        """
        Renders "array property == constant" as a JSON_CONTAINS check.
        Comparing with null also matches a NULL column and an empty array.

        Raises:
            NotImplementedError: If the first operand is not a property access node
            or the second operand is not a constant.
        """
        if not isinstance(first_operand, PropertyAccessNode):
            raise NotImplementedError(
                f"Array datatype comparison is not supported for {type(first_operand).__name__} node"
            )

        if not isinstance(second_operand, ConstantNode):
            raise NotImplementedError(
                f"Array datatype comparison is not supported for {type(second_operand).__name__} node"
            )

        prop = self._visit_property_access_node(first_operand, [])
        constant_node_value = self._visit_constant_node(second_operand.value)

        if constant_node_value == "NULL":
            return f"(JSON_CONTAINS({prop}, '[null]') OR {prop} IS NULL OR JSON_LENGTH({prop}) = 0)"
        elif constant_node_value.startswith("'") and constant_node_value.endswith("'"):
            # drop the SQL quotes, the value is re-quoted as a JSON string below
            constant_node_value = constant_node_value[1:-1]

        return f"JSON_CONTAINS({prop}, '[\"{constant_node_value}\"]')"

    def _visit_in_for_array_datatype(
        self, first_operand: Node, array: list[ConstantNode], stack: list[Node]
    ) -> str:
        """
        Renders "array property in [a, b, ...]" as an OR chain of array equality
        comparisons. An empty list can never match and is rendered as FALSE.
        """
        if not array:
            return self._visit_constant_node(False)

        node = None
        for item in array:
            current_node = ComparisonNode(
                first_operand=first_operand,
                operator=ComparisonNodeOperator.EQ,
                second_operand=item,
            )

            if not node:
                node = current_node
                continue

            node = LogicalNode(
                left=node,
                operator=LogicalNodeOperator.OR,
                right=current_node,
            )

        return self._build_sql_filter(node, stack)
"false", # regex match ensures safe casting to float f"{expression_to_cast} ~ '^[-+]?[0-9]*\\.?[0-9]+$'": f"CAST({expression_to_cast} AS FLOAT) >= 1", f"LOWER({expression_to_cast}) != ''": "true", } result = " ".join( [ f"WHEN {condition} THEN {value}" for condition, value in cast_conditions.items() ] ) result = f"CASE {result} ELSE false END" return result else: raise ValueError(f"Unsupported type: {to_type}") return f"({expression_to_cast})::{to_type_str}" def get_field_expression(self, cel_field): """ Overriden, because for PostgreSql we need to cast columns to known data types (because every JSON operation returns just text). This is used in ordering to correctly order rows in accordance to their types and not lexicographically. """ metadata = self.properties_metadata.get_property_metadata_for_str(cel_field) field_expressions = [] for field_mapping in metadata.field_mappings: if isinstance(field_mapping, JsonFieldMapping): json_exp = self.json_extract_as_text( field_mapping.json_prop, field_mapping.prop_in_json ) if ( metadata.data_type != DataType.STRING and metadata.data_type is not None ): json_exp = self.cast(json_exp, metadata.data_type) field_expressions.append(json_exp) continue elif isinstance(field_mapping, SimpleFieldMapping): field_expressions.append(field_mapping.map_to) continue raise ValueError(f"Unsupported field mapping type: {type(field_mapping)}") if len(field_expressions) > 1: return self.coalesce(field_expressions) else: return field_expressions[0] def _visit_constant_node( self, value: str, expected_data_type: DataType = None ) -> str: if expected_data_type == DataType.UUID: str_value = str(value) try: # Because PostgreSQL works with UUID with dashes, we need to convert it to a UUID with dashes string # Example: 123e4567e89b12d3a456426614174000 -> 123e4567-e89b-12d3-a456-426614174000 # Example2: 123e4567-e89b-12d3-a456-426614174000 -> 123e4567-e89b-12d3-a456-426614174000 (dashed UUID in CEL is also supported) value = str(UUID(str_value)) 
except ValueError: pass if isinstance(value, datetime): date_str = self.literal_proc(value.strftime("%Y-%m-%d %H:%M:%S")) date_exp = f"CAST({date_str} as TIMESTAMP)" return date_exp return super()._visit_constant_node(value) def _visit_contains_method_calling( self, property_path: str, method_args: List[ConstantNode] ) -> str: if len(method_args) != 1: raise ValueError(f'{property_path}.contains accepts 1 argument but got {len(method_args)}') processed_literal = self.literal_proc(method_args[0].value) unquoted_literal = processed_literal[1:-1] return f"{property_path} IS NOT NULL AND {property_path} ILIKE '%{unquoted_literal}%'" def _visit_starts_with_method_calling( self, property_path: str, method_args: List[ConstantNode] ) -> str: if len(method_args) != 1: raise ValueError(f'{property_path}.startsWith accepts 1 argument but got {len(method_args)}') processed_literal = self.literal_proc(method_args[0].value) unquoted_literal = processed_literal[1:-1] return f"{property_path} IS NOT NULL AND {property_path} ILIKE '{unquoted_literal}%'" def _visit_ends_with_method_calling( self, property_path: str, method_args: List[ConstantNode] ) -> str: if len(method_args) != 1: raise ValueError(f'{property_path}.endsWith accepts 1 argument but got {len(method_args)}') processed_literal = self.literal_proc(method_args[0].value) unquoted_literal = processed_literal[1:-1] return f"{property_path} IS NOT NULL AND {property_path} ILIKE '%{unquoted_literal}'" def _visit_equal_for_array_datatype( self, first_operand: Node, second_operand: Node ) -> str: if not isinstance(first_operand, PropertyAccessNode): raise NotImplementedError( f"Array datatype comparison is not supported for {type(first_operand).__name__} node" ) if not isinstance(second_operand, ConstantNode): raise NotImplementedError( f"Array datatype comparison is not supported for {type(second_operand).__name__} node" ) prop = self._visit_property_access_node(first_operand, []) constant_node_value = 
def json_extract_as_text(self, column: str, path: list[str]) -> str:
    """
    Build a SQLite ``json_extract`` call for a nested JSON property.

    Each key of the path is wrapped in double quotes and the keys are joined
    with dots to form the JSON path.

    Args:
        column (str): The JSON column (or expression) to read from.
        path (list[str]): Keys leading to the value, outermost first.

    Returns:
        str: For column ``json_column`` and path ``['a', 'b']``:
            ``json_extract(json_column, '$."a"."b"')``
    """
    quoted_keys = map('"{}"'.format, path)
    json_path = "$." + ".".join(quoted_keys)
    return "json_extract({}, '{}')".format(column, json_path)
def cast(self, expression_to_cast: str, to_type: DataType, force=False):
    """
    Wrap a SQL expression so that it is evaluated as ``to_type`` in SQLite.

    Args:
        expression_to_cast (str): The SQL expression to convert.
        to_type (DataType): The target data type.
        force: Not used by this implementation.

    Returns:
        str: ``CAST(<expr> as TEXT)`` for STRING, ``CAST(<expr> as REAL)`` for
        INTEGER and FLOAT, the unchanged expression for NULL and DATETIME, and
        a ``CASE ... END`` expression for BOOLEAN.

    Raises:
        ValueError: If ``to_type`` is none of the types listed above.
    """
    if to_type == DataType.STRING:
        to_type_str = "TEXT"
    elif to_type == DataType.NULL:
        return expression_to_cast
    elif to_type == DataType.INTEGER or to_type == DataType.FLOAT:
        to_type_str = "REAL"
    elif to_type == DataType.DATETIME:
        return expression_to_cast
    elif to_type == DataType.BOOLEAN:
        # WHEN branches are evaluated in the order listed here, so the
        # "<= 1" branch is only reached when the ">= 1" branch did not match.
        cast_conditions = {
            # f"{expression_to_cast} is NULL": "FALSE",
            f"LOWER({expression_to_cast}) = 'true'": "TRUE",
            f"LOWER({expression_to_cast}) = 'false'": "FALSE",
            f"CAST({expression_to_cast} AS SIGNED) >= 1": "TRUE",
            f"CAST({expression_to_cast} AS SIGNED) <= 1": "FALSE",
            f"{expression_to_cast} != ''": "TRUE",
        }
        result = " ".join(
            [f"WHEN {key} THEN {value}" for key, value in cast_conditions.items()]
        )
        result = f"CASE {result} ELSE FALSE END"
        return result
    else:
        # Report the offending value; the builtin `type` was interpolated
        # here before, which hid what the caller actually passed.
        raise ValueError(f"Unsupported type: {to_type}")

    return f"CAST({expression_to_cast} as {to_type_str})"
def _visit_contains_method_calling(
    self, property_path: str, method_args: List[ConstantNode]
) -> str:
    """
    Translate a CEL ``<property>.contains(<arg>)`` call into a SQLite condition.

    Args:
        property_path (str): SQL expression of the property being tested.
        method_args (List[ConstantNode]): Call arguments; exactly one is expected.

    Returns:
        str: ``<property> IS NOT NULL AND <property> LIKE '%<arg>%'``.

    Raises:
        ValueError: If the number of arguments is not exactly one.
    """
    if len(method_args) != 1:
        raise ValueError(f'{property_path}.contains accepts 1 argument but got {len(method_args)}')

    (needle,) = method_args
    # Drop the first and last character of the processed literal
    # (presumably the quotes added by literal_proc) so the value can be
    # embedded inside the LIKE pattern's own quotes.
    like_body = self.literal_proc(needle.value)[1:-1]

    return f"{property_path} IS NOT NULL AND {property_path} LIKE '%{like_body}%'"
def _visit_in_for_array_datatype(
    self, first_operand: Node, array: list[ConstantNode], stack: list[Node]
) -> str:
    """
    Build a SQLite filter for ``<array property> in [<constants>]``.

    The JSON array stored in the column is expanded with ``json_each`` and the
    regular ``in`` translation is applied to each element (``json_array.value``).
    When ``None`` is one of the constants, an empty array (``'[]'``) or a NULL
    column is accepted as a match too.

    Args:
        first_operand (Node): The property holding the JSON array.
        array (list[ConstantNode]): Constants to look for.
        stack (list[Node]): Visitor stack forwarded to ``_visit_in``.

    Returns:
        str: The SQL filter expression.
    """
    # The "in" part is rendered first, then the column expression.
    element_matches = self._visit_in(
        PropertyAccessNode(path=["json_array", "value"]), array, stack
    )
    column = self._visit_property_access_node(first_operand, [])
    array_filter = (
        f"(SELECT 1 FROM json_each({column}) as json_array WHERE {element_matches})"
    )

    if not any(item.value is None for item in array):
        return array_filter

    return f"({column} = '[]' OR {column} IS NULL OR {array_filter})"
""" import hashlib import json import logging import random import uuid from collections import defaultdict from contextlib import contextmanager from datetime import datetime, timedelta, timezone from functools import wraps from typing import Any, Callable, Dict, Iterator, List, Tuple, Type, Union, Optional from uuid import UUID, uuid4 from dateutil.parser import parse from dateutil.tz import tz from dotenv import find_dotenv, load_dotenv from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor from psycopg2.errors import NoActiveSqlTransaction from retry import retry from sqlalchemy import ( String, and_, case, cast, desc, func, literal, null, select, union, update, ) from sqlalchemy.dialects.mysql import insert as mysql_insert from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.dialects.sqlite import insert as sqlite_insert from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.orm import foreign, joinedload, subqueryload from sqlalchemy.orm.exc import StaleDataError from sqlalchemy.sql import exists, expression from sqlalchemy.sql.functions import count from sqlmodel import Session, SQLModel, col, or_, select, text from sqlalchemy.orm.attributes import flag_modified from keep.api.consts import STATIC_PRESETS from keep.api.core.config import config from keep.api.core.db_utils import ( create_db_engine, custom_serialize, get_json_extract_field, get_or_create, ) from keep.api.core.dependencies import SINGLE_TENANT_UUID # This import is required to create the tables from keep.api.models.action_type import ActionType from keep.api.models.ai_external import ( ExternalAIConfigAndMetadata, ExternalAIConfigAndMetadataDto, ) from keep.api.models.alert import AlertStatus from keep.api.models.db.action import Action from keep.api.models.db.ai_external import * # pylint: disable=unused-wildcard-import from keep.api.models.db.alert import * # pylint: disable=unused-wildcard-import from 
keep.api.models.db.dashboard import * # pylint: disable=unused-wildcard-import from keep.api.models.db.enrichment_event import * # pylint: disable=unused-wildcard-import from keep.api.models.db.extraction import * # pylint: disable=unused-wildcard-import from keep.api.models.db.incident import * # pylint: disable=unused-wildcard-import from keep.api.models.db.maintenance_window import * # pylint: disable=unused-wildcard-import from keep.api.models.db.mapping import * # pylint: disable=unused-wildcard-import from keep.api.models.db.preset import * # pylint: disable=unused-wildcard-import from keep.api.models.db.provider import * # pylint: disable=unused-wildcard-import from keep.api.models.db.provider_image import * # pylint: disable=unused-wildcard-import from keep.api.models.db.rule import * # pylint: disable=unused-wildcard-import from keep.api.models.db.system import * # pylint: disable=unused-wildcard-import from keep.api.models.db.tenant import * # pylint: disable=unused-wildcard-import from keep.api.models.db.topology import * # pylint: disable=unused-wildcard-import from keep.api.models.db.workflow import * # pylint: disable=unused-wildcard-import from keep.api.models.incident import IncidentDto, IncidentDtoIn, IncidentSorting from keep.api.models.time_stamp import TimeStampFilter logger = logging.getLogger(__name__) # this is a workaround for gunicorn to load the env vars # because somehow in gunicorn it doesn't load the .env file load_dotenv(find_dotenv()) engine = create_db_engine() SQLAlchemyInstrumentor().instrument(enable_commenter=True, engine=engine) ALLOWED_INCIDENT_FILTERS = [ "status", "severity", "sources", "affected_services", "assignee", ] KEEP_AUDIT_EVENTS_ENABLED = config("KEEP_AUDIT_EVENTS_ENABLED", cast=bool, default=True) INTERVAL_WORKFLOWS_RELAUNCH_TIMEOUT = timedelta(minutes=60) WORKFLOWS_TIMEOUT = timedelta(minutes=120) def dispose_session(): logger.info("Disposing engine pool") if engine.dialect.name != "sqlite": 
def retry_on_db_error(f):
    """
    Decorator that re-runs ``f`` when it fails with a database error.

    ``OperationalError``, ``IntegrityError`` and ``StaleDataError`` raised by
    ``f`` are logged and re-raised into the ``retry`` decorator, which is
    configured with ``tries=3``, ``delay=0.1``, ``backoff=2`` and
    ``jitter=(0, 0.1)``. Deadlocks are logged as warnings, every other error
    with a full traceback; both kinds are re-raised and therefore retried.

    If the exception carries a ``session`` attribute (set by
    ``existed_or_new_session``) and that session is no longer active, it is
    rolled back before the next attempt.

    Args:
        f: The function to wrap.

    Returns:
        The wrapped function.
    """

    @retry(
        exceptions=(OperationalError, IntegrityError, StaleDataError),
        tries=3,
        delay=0.1,
        backoff=2,
        jitter=(0, 0.1),
        logger=logger,
    )
    @wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except (OperationalError, IntegrityError, StaleDataError) as e:
            # `session` may be missing, or present but None (when the failure
            # happened before a session was bound). Guard against both so the
            # original database error is not masked by an AttributeError.
            failed_session = getattr(e, "session", None)
            if failed_session is not None and not failed_session.is_active:
                failed_session.rollback()

            if "Deadlock found" in str(e):
                logger.warning(
                    "Deadlock detected, retrying transaction", extra={"error": str(e)}
                )
            else:
                logger.exception(
                    f"Error while executing transaction during {f.__name__}",
                )
            # Re-raise so that `retry` decides whether another attempt is
            # made or the error propagates to the caller.
            raise

    return wrapper
or ( str(uuid4()) if not test_run else "test_" + str(uuid4()) ) if len(triggered_by) > 255: triggered_by = triggered_by[:255] workflow_execution = WorkflowExecution( id=workflow_execution_id, workflow_id=workflow_id, workflow_revision=workflow_revision, tenant_id=tenant_id, started=datetime.now(tz=timezone.utc), triggered_by=triggered_by, execution_number=execution_number, status="in_progress", error=None, execution_time=None, results={}, is_test_run=test_run, ) session.add(workflow_execution) # Ensure the object has an id session.flush() execution_id = workflow_execution.id if KEEP_AUDIT_EVENTS_ENABLED: if fingerprint and event_type == "alert": workflow_to_alert_execution = WorkflowToAlertExecution( workflow_execution_id=execution_id, alert_fingerprint=fingerprint, event_id=event_id, ) session.add(workflow_to_alert_execution) elif event_type == "incident": workflow_to_incident_execution = WorkflowToIncidentExecution( workflow_execution_id=execution_id, alert_fingerprint=fingerprint, incident_id=event_id, ) session.add(workflow_to_incident_execution) session.commit() return execution_id except IntegrityError: session.rollback() logger.debug( f"Failed to create a new execution for workflow {workflow_id}. Constraint is met." 
def get_timeouted_workflow_exections():
    """
    Fetch workflow executions that have been "in_progress" for too long.

    An execution is selected when its status is ``"in_progress"`` and it
    started at least ``WORKFLOWS_TIMEOUT`` before the current UTC time.

    Returns:
        The matching ``WorkflowExecution`` rows, or an empty list when the
        query fails (the failure is logged, not raised).
    """
    with Session(engine) as session:
        logger.debug("Checking for timeouted workflows")
        timeouted_workflows = []
        try:
            result = session.exec(
                select(WorkflowExecution)
                .filter(WorkflowExecution.status == "in_progress")
                .filter(
                    WorkflowExecution.started <= datetime.utcnow() - WORKFLOWS_TIMEOUT
                )
            )
            timeouted_workflows = result.all()
        except Exception:
            # logger.exception already attaches the active exception and its
            # traceback. Passing the exception as a positional argument to a
            # message without a placeholder makes the record fail to format.
            logger.exception("Failed to get timeouted workflows")
        logger.debug(f"Found {len(timeouted_workflows)} timeouted workflows")
    return timeouted_workflows
= result.all() if result else [] except Exception: logger.exception("Failed to get workflows with interval") logger.debug(f"Found {len(workflows_with_interval)} workflows with interval") workflows_to_run = [] # for each workflow: for workflow in workflows_with_interval: current_time = datetime.utcnow() last_execution = get_last_completed_execution(session, workflow.id) # if there no last execution, that's the first time we run the workflow if not last_execution: try: # try to get the lock workflow_execution_id = create_workflow_execution( workflow.id, workflow.revision, workflow.tenant_id, "scheduler" ) # we succeed to get the lock on this execution number :) # let's run it workflows_to_run.append( { "tenant_id": workflow.tenant_id, "workflow_id": workflow.id, "workflow_execution_id": workflow_execution_id, } ) # some other thread/instance has already started to work on it except IntegrityError: continue # else, if the last execution was more than interval seconds ago, we need to run it elif ( last_execution.started + timedelta(seconds=workflow.interval) <= current_time ): try: # try to get the lock with execution_number + 1 workflow_execution_id = create_workflow_execution( workflow.id, workflow.revision, workflow.tenant_id, "scheduler", last_execution.execution_number + 1, ) # we succeed to get the lock on this execution number :) # let's run it workflows_to_run.append( { "tenant_id": workflow.tenant_id, "workflow_id": workflow.id, "workflow_execution_id": workflow_execution_id, } ) # continue to the next one continue # some other thread/instance has already started to work on it except IntegrityError: # we need to verify the locking is still valid and not timeouted session.rollback() pass # get the ongoing execution ongoing_execution = session.exec( select(WorkflowExecution) .where(WorkflowExecution.workflow_id == workflow.id) .where( WorkflowExecution.execution_number == last_execution.execution_number + 1 ) .limit(1) ).first() # this is a WTF exception since 
if this (workflow_id, execution_number) does not exist, # we would be able to acquire the lock if not ongoing_execution: logger.error( f"WTF: ongoing execution not found {workflow.id} {last_execution.execution_number + 1}" ) continue # if this completed, error, than that's ok - the service who locked the execution is done elif ongoing_execution.status != "in_progress": continue # if the ongoing execution runs more than timeout minutes, relaunch it elif ( ongoing_execution.started + INTERVAL_WORKFLOWS_RELAUNCH_TIMEOUT <= current_time ): ongoing_execution.status = "timeout" session.commit() # re-create the execution and try to get the lock try: workflow_execution_id = create_workflow_execution( workflow.id, workflow.revision, workflow.tenant_id, "scheduler", ongoing_execution.execution_number + 1, ) # some other thread/instance has already started to work on it and that's ok except IntegrityError: logger.debug( f"Failed to create a new execution for workflow {workflow.id} [timeout]. Constraint is met." 
def update_workflow_by_id(
    id: str,
    name: str,
    tenant_id: str,
    description: str | None,
    interval: int,
    workflow_raw: str,
    is_disabled: bool,
    updated_by: str,
    provisioned: bool = False,
    provisioned_file: str | None = None,
):
    """
    Update an existing workflow and record a new revision for it.

    Provisioned workflows are looked up by name (so they are not duplicated on
    each backend restart); all other workflows are looked up by id.

    Returns:
        The result of ``update_workflow_with_values`` for the found workflow.

    Raises:
        ValueError: If no matching workflow exists.
    """
    with Session(engine, expire_on_commit=False) as session:
        existing_workflow = (
            get_workflow_by_name(tenant_id, name)
            if provisioned
            else get_workflow_by_id(tenant_id, id)
        )
        if not existing_workflow:
            raise ValueError("Workflow not found")

        new_values = dict(
            name=name,
            description=description,
            interval=interval,
            workflow_raw=workflow_raw,
            is_disabled=is_disabled,
            provisioned=provisioned,
            provisioned_file=provisioned_file,
            updated_by=updated_by,
        )
        return update_workflow_with_values(
            existing_workflow, session=session, **new_values
        )
def is_equal_workflow_dicts(a: dict, b: dict):
    """
    Tell whether two workflow dicts describe the same workflow content.

    Only the keys listed below are compared; any other key is ignored. A key
    that is missing from a dict is treated as ``None``.

    Args:
        a (dict): First workflow representation.
        b (dict): Second workflow representation.

    Returns:
        bool: True when every compared key has equal values in both dicts.
    """
    compared_keys = (
        "workflow_raw",
        "tenant_id",
        "is_test",
        "is_deleted",
        "is_disabled",
        "name",
        "description",
        "interval",
        "provisioned",
        "provisioned_file",
    )
    for key in compared_keys:
        if not a.get(key) == b.get(key):
            return False
    return True
def get_or_create_dummy_workflow(tenant_id: str, session: Session | None = None):
    """
    Return the tenant's dummy workflow used for test runs, creating it if needed.

    Args:
        tenant_id (str): Tenant that owns the dummy workflow.
        session (Session | None): Session to reuse; a new one is opened when omitted.

    Returns:
        The workflow returned by ``get_or_create``, refreshed from the
        database when it is truthy or was just created.
    """
    dummy_fields = dict(
        tenant_id=tenant_id,
        id=get_dummy_workflow_id(tenant_id),
        name="Dummy Workflow for test runs",
        description="Auto-generated dummy workflow for test runs",
        created_by="system",
        workflow_raw="{}",
        is_disabled=False,
        is_test=True,
    )
    with existed_or_new_session(session) as session:
        workflow, created = get_or_create(session, Workflow, **dummy_fields)

        if created:
            # New instances have to be committed before they can be refreshed
            session.commit()
        if created or workflow:
            # Reload the current state from the database
            session.refresh(workflow)

        return workflow
""" # Subquery to find the max started timestamp for each alert_fingerprint max_started_subquery = ( session.query( WorkflowToAlertExecution.alert_fingerprint, func.max(WorkflowExecution.started).label("max_started"), ) .join( WorkflowExecution, WorkflowToAlertExecution.workflow_execution_id == WorkflowExecution.id, ) .filter(WorkflowExecution.tenant_id == tenant_id) .filter(WorkflowExecution.started >= datetime.now() - timedelta(days=7)) .group_by(WorkflowToAlertExecution.alert_fingerprint) ).subquery("max_started_subquery") # Query to find WorkflowToAlertExecution entries that match the max started timestamp latest_workflow_to_alert_executions: list[WorkflowToAlertExecution] = ( session.query(WorkflowToAlertExecution) .join( WorkflowExecution, WorkflowToAlertExecution.workflow_execution_id == WorkflowExecution.id, ) .join( max_started_subquery, and_( WorkflowToAlertExecution.alert_fingerprint == max_started_subquery.c.alert_fingerprint, WorkflowExecution.started == max_started_subquery.c.max_started, ), ) .filter(WorkflowExecution.tenant_id == tenant_id) .limit(1000) .all() ) return latest_workflow_to_alert_executions def get_last_workflow_execution_by_workflow_id( tenant_id: str, workflow_id: str, status: str | None = None, exclude_ids: list[str] | None = None, ) -> Optional[WorkflowExecution]: with Session(engine) as session: query = ( select(WorkflowExecution) .where(WorkflowExecution.workflow_id == workflow_id) .where(WorkflowExecution.tenant_id == tenant_id) .where(WorkflowExecution.started >= datetime.now() - timedelta(days=1)) .order_by(col(WorkflowExecution.started).desc()) ) if status: query = query.where(WorkflowExecution.status == status) if exclude_ids: query = query.where(col(WorkflowExecution.id).notin_(exclude_ids)) workflow_execution = session.exec(query).first() return workflow_execution def get_workflows_with_last_execution(tenant_id: str) -> List[dict]: with Session(engine) as session: latest_execution_cte = ( select( 
def get_all_workflows(tenant_id: str, exclude_disabled: bool = False) -> List[Workflow]:
    """
    Fetch the tenant's workflows that are neither deleted nor test workflows.

    Args:
        tenant_id (str): The tenant to fetch workflows for.
        exclude_disabled (bool): When True, disabled workflows are left out too.

    Returns:
        List[Workflow]: The matching workflows.
    """
    criteria = [
        Workflow.tenant_id == tenant_id,
        Workflow.is_deleted == False,  # noqa: E712 - SQL expression, not a Python bool test
        Workflow.is_test == False,  # noqa: E712
    ]
    if exclude_disabled:
        criteria.append(Workflow.is_disabled == False)  # noqa: E712

    with Session(engine) as session:
        return session.exec(select(Workflow).where(*criteria)).all()
def update_provider_last_pull_time(tenant_id: str, provider_id: str):
    """
    Stamp a provider with the current UTC time as its last pull time.

    Args:
        tenant_id (str): The tenant that owns the provider.
        provider_id (str): The id of the provider to update.

    Returns:
        None. When the provider does not exist a warning is logged and nothing
        is written.

    Raises:
        Exception: Whatever the assignment/commit raises is logged and re-raised.
    """
    extra = {"tenant_id": tenant_id, "provider_id": provider_id}
    logger.info("Updating provider last pull time", extra=extra)
    with Session(engine) as session:
        provider = session.exec(
            select(Provider).where(
                Provider.tenant_id == tenant_id, Provider.id == provider_id
            )
        ).first()

        if not provider:
            logger.warning(
                "Could not update provider last pull time since provider does not exist",
                extra=extra,
            )
            # Stop here: falling through would dereference None below and turn
            # a "provider not found" warning into an AttributeError.
            return

        try:
            provider.last_pull_time = datetime.now(tz=timezone.utc)
            session.commit()
        except Exception:
            logger.exception("Failed to update provider last pull time", extra=extra)
            raise
    logger.info("Successfully updated provider last pull time", extra=extra)
def get_workflow_executions(
    tenant_id,
    workflow_id,
    limit=50,
    offset=0,
    tab=2,
    status: Optional[Union[str, List[str]]] = None,
    trigger: Optional[Union[str, List[str]]] = None,
    execution_id: Optional[str] = None,
    is_test_run: bool = False,
):
    """
    Page through the executions of one workflow and compute summary statistics.

    Args:
        tenant_id: The tenant that owns the workflow.
        workflow_id: The workflow whose executions are listed.
        limit: Maximum number of executions in the returned page.
        offset: Number of executions to skip before the page starts.
        tab: Time window selector: 1 = last 30 days, 2 = last 7 days,
            3 = since midnight (UTC) today; any other value applies no
            time filter.
        status: A status or list of statuses to keep; empty/None keeps all.
        trigger: A prefix or list of prefixes matched against ``triggered_by``.
        execution_id: When given, only this execution id is kept.
        is_test_run: Whether to list test runs (True) or regular runs (False).

    Returns:
        tuple: ``(total_count, workflow_executions, pass_count, fail_count,
        avg_duration)``. Counts and the average are computed over all rows
        matching the filters, not just the returned page; ``fail_count`` sums
        the "error" and "timeout" statuses.
    """
    # Both filters accept a single value or a list.
    if isinstance(status, str):
        status = [status]
    if isinstance(trigger, str):
        trigger = [trigger]

    with Session(engine) as session:
        query = session.query(
            WorkflowExecution,
        ).filter(
            WorkflowExecution.tenant_id == tenant_id,
            WorkflowExecution.workflow_id == workflow_id,
            # Honor the caller's choice. This used to be hard-coded to False,
            # which silently ignored the is_test_run argument.
            WorkflowExecution.is_test_run == is_test_run,
        )

        now = datetime.now(tz=timezone.utc)
        if tab == 1:
            query = query.filter(WorkflowExecution.started >= now - timedelta(days=30))
        elif tab == 2:
            query = query.filter(WorkflowExecution.started >= now - timedelta(days=7))
        elif tab == 3:
            start_of_day = now.replace(hour=0, minute=0, second=0, microsecond=0)
            query = query.filter(
                WorkflowExecution.started >= start_of_day,
                WorkflowExecution.started <= now,
            )

        if execution_id:
            query = query.filter(WorkflowExecution.id == execution_id)
        if status:
            query = query.filter(WorkflowExecution.status.in_(status))
        if trigger:
            prefix_matches = [
                WorkflowExecution.triggered_by.like(f"{trig}%") for trig in trigger
            ]
            query = query.filter(or_(*prefix_matches))

        total_count = query.count()

        status_rows = (
            query.with_entities(WorkflowExecution.status, func.count().label("count"))
            .group_by(WorkflowExecution.status)
            .all()
        )
        count_by_status = {name: count for name, count in status_rows}
        pass_count = count_by_status.get("success", 0)
        fail_count = count_by_status.get("error", 0) + count_by_status.get(
            "timeout", 0
        )

        avg_duration = query.with_entities(
            func.avg(WorkflowExecution.execution_time)
        ).scalar()
        avg_duration = avg_duration if avg_duration else 0.0

        # Newest first, then apply paging.
        workflow_executions = (
            query.order_by(desc(WorkflowExecution.started))
            .limit(limit)
            .offset(offset)
            .all()
        )

    return total_count, workflow_executions, pass_count, fail_count, avg_duration
def get_workflow_execution(
    tenant_id: str,
    workflow_execution_id: str,
    is_test_run: bool | None = None,
):
    """
    Load exactly one workflow execution together with its alert and incident
    execution links.

    Args:
        tenant_id (str): The tenant that owns the execution.
        workflow_execution_id (str): The id of the execution to load.
        is_test_run (bool | None): When not None, the execution must also have
            this ``is_test_run`` value.

    Returns:
        The single matching WorkflowExecution (``.one()`` semantics: zero or
        several matches raise).
    """
    criteria = []
    if is_test_run is not None:
        criteria.append(WorkflowExecution.is_test_run == is_test_run)
    criteria.append(WorkflowExecution.id == workflow_execution_id)
    criteria.append(WorkflowExecution.tenant_id == tenant_id)

    # Eager-load both link relationships as part of the same statement.
    statement = (
        select(WorkflowExecution)
        .where(*criteria)
        .options(
            joinedload(WorkflowExecution.workflow_to_alert_execution),
            joinedload(WorkflowExecution.workflow_to_incident_execution),
        )
    )
    with Session(engine) as session:
        return session.exec(statement).one()
def _enrich_entity(
    session,
    tenant_id,
    fingerprint,
    enrichments,
    action_type: ActionType,
    action_callee: str,
    action_description: str,
    force=False,
    audit_enabled=True,
):
    """
    Create or update the enrichment row of a single fingerprint and commit.

    Args:
        session (Session): The database session; it is committed (or rolled
            back on a duplicate insert) by this function.
        tenant_id (str): The tenant ID to filter the alert enrichments by.
        fingerprint (str): The alert fingerprint to filter the alert enrichments by.
        enrichments (dict): The enrichments to add to the alert.
        action_type (ActionType): Its ``.value`` is stored as the audit action.
        action_callee (str): Stored as the audit entry's ``user_id``.
        action_description (str): Stored as the audit entry's description.
        force (bool): When True the stored enrichments are replaced by
            ``enrichments`` instead of being merged with them. This is used to
            dispose enrichments if necessary.
        audit_enabled (bool): When True an AlertAudit row is added in the same
            commit.

    Returns:
        The AlertEnrichment row: the refreshed existing one, the newly created
        one, or - when the insert hit an IntegrityError - whatever
        ``get_enrichment_with_session`` returns after the rollback. In that
        last case the ``enrichments`` passed to this call are not applied.
    """
    enrichment = get_enrichment_with_session(session, tenant_id, fingerprint)
    if enrichment:
        # if force - override existing enrichments. being used to dispose enrichments if necessary
        if force:
            new_enrichment_data = enrichments
        else:
            # Merge: keys in `enrichments` win over the stored ones.
            new_enrichment_data = {**enrichment.enrichments, **enrichments}
        # SQLAlchemy doesn't support updating JSON fields, so we need to do it manually
        # https://github.com/sqlalchemy/sqlalchemy/discussions/8396#discussion-4308891
        stmt = (
            update(AlertEnrichment)
            .where(AlertEnrichment.id == enrichment.id)
            .values(enrichments=new_enrichment_data)
        )
        session.execute(stmt)
        if audit_enabled:
            # add audit event
            audit = AlertAudit(
                tenant_id=tenant_id,
                fingerprint=fingerprint,
                user_id=action_callee,
                action=action_type.value,
                description=action_description,
            )
            session.add(audit)
        # The UPDATE and the optional audit row are committed together.
        session.commit()
        # Refresh the instance to get updated data from the database
        session.refresh(enrichment)
        return enrichment
    else:
        try:
            # No row yet for this fingerprint: insert a fresh one.
            alert_enrichment = AlertEnrichment(
                tenant_id=tenant_id,
                alert_fingerprint=fingerprint,
                enrichments=enrichments,
            )
            session.add(alert_enrichment)
            # add audit event
            if audit_enabled:
                audit = AlertAudit(
                    tenant_id=tenant_id,
                    fingerprint=fingerprint,
                    user_id=action_callee,
                    action=action_type.value,
                    description=action_description,
                )
                session.add(audit)
            session.commit()
            return alert_enrichment
        except IntegrityError:
            # If we hit a duplicate entry error, rollback and get the existing enrichment
            # NOTE(review): presumably another writer inserted the row between
            # the lookup above and this commit - confirm against the table's
            # unique constraint. The rollback also discards the audit row.
            logger.warning(
                "Duplicate entry error",
                extra={
                    "tenant_id": tenant_id,
                    "fingerprint": fingerprint,
                    "enrichments": enrichments,
                },
            )
            session.rollback()
            return get_enrichment_with_session(session, tenant_id, fingerprint)
fingerprints (List[str]): List of alert fingerprints to enrich. enrichments (dict): The enrichments to add to all alerts. action_type (ActionType): The type of action being performed. action_callee (str): The ID of the user performing the action. action_description (str): Description of the action. session (Session, optional): Database session to use. force (bool, optional): Whether to override existing enrichments. Defaults to False. audit_enabled (bool, optional): Whether to create audit entries. Defaults to True. Returns: List[AlertEnrichment]: List of enriched alert objects. """ with existed_or_new_session(session) as session: # Get all existing enrichments in one query existing_enrichments = { e.alert_fingerprint: e for e in session.exec( select(AlertEnrichment) .where(AlertEnrichment.tenant_id == tenant_id) .where(AlertEnrichment.alert_fingerprint.in_(fingerprints)) ).all() } # Prepare bulk operations to_create = [] audit_entries = [] for fingerprint in fingerprints: existing = existing_enrichments.get(fingerprint) if not existing: # For new entries to_create.append( AlertEnrichment( tenant_id=tenant_id, alert_fingerprint=fingerprint, enrichments=enrichments, ) ) if audit_enabled: audit_entries.append( AlertAudit( tenant_id=tenant_id, fingerprint=fingerprint, user_id=action_callee, action=action_type.value, description=action_description, ) ) # Merge per fingerprint, matching _enrich_entity pattern if existing_enrichments: for fingerprint, existing in existing_enrichments.items(): merged = {**existing.enrichments, **enrichments} stmt = ( update(AlertEnrichment) .where(AlertEnrichment.id == existing.id) .values(enrichments=merged) ) session.execute(stmt) # Bulk insert new enrichments if to_create: session.add_all(to_create) # Bulk insert audit entries if audit_entries: session.add_all(audit_entries) session.commit() # Get all updated/created enrichments result = session.exec( select(AlertEnrichment) .where(AlertEnrichment.tenant_id == tenant_id) 
def count_alerts(
    provider_type: str,
    provider_id: str,
    ever: bool,
    start_time: Optional[datetime],
    end_time: Optional[datetime],
    tenant_id: str,
):
    """
    Count the alerts a specific provider produced for a tenant.

    Args:
        provider_type (str): The provider type to match.
        provider_id (str): The provider id to match.
        ever (bool): When True, count all alerts regardless of time; otherwise
            only alerts with ``start_time <= timestamp <= end_time``.
        start_time (Optional[datetime]): Inclusive lower bound (used when
            ``ever`` is False).
        end_time (Optional[datetime]): Inclusive upper bound (used when
            ``ever`` is False).
        tenant_id (str): The tenant to count alerts for.

    Returns:
        int: The number of matching alerts.
    """
    with Session(engine) as session:
        matching = session.query(Alert).filter(
            Alert.tenant_id == tenant_id,
            Alert.provider_id == provider_id,
            Alert.provider_type == provider_type,
        )
        if not ever:
            # Restrict to the requested window (both bounds inclusive).
            matching = matching.filter(
                Alert.timestamp >= start_time,
                Alert.timestamp <= end_time,
            )
        return matching.count()
def get_enrichments(
    tenant_id: str, fingerprints: List[str]
) -> List[AlertEnrichment]:
    """
    Get the alert enrichments for a list of fingerprints using a single DB query.

    Only enrichments that exist are returned: fingerprints without an
    enrichment row are simply absent from the result (there is no ``None``
    placeholder), and the query does not impose an order on the rows.

    :param tenant_id: The tenant ID to filter the alert enrichments by.
    :param fingerprints: A list of fingerprints to get the alert enrichments for.
    :return: The AlertEnrichment objects found for the given fingerprints.
    """
    if not fingerprints:
        # Nothing to look up; skip the round-trip and the empty IN () clause.
        return []

    with Session(engine) as session:
        result = session.exec(
            select(AlertEnrichment)
            .where(AlertEnrichment.tenant_id == tenant_id)
            .where(AlertEnrichment.alert_fingerprint.in_(fingerprints))
        ).all()
    return result
def query_alerts(
    tenant_id,
    provider_id=None,
    limit=1000,
    timeframe=None,
    upper_timestamp=None,
    lower_timestamp=None,
    skip_alerts_with_null_timestamp=True,
    sort_ascending=False,
) -> list[Alert]:
    """
    Query a tenant's alerts, with their enrichment relationship loaded.

    Args:
        tenant_id: The tenant to fetch alerts for.
        provider_id (optional): Keep only alerts of this provider.
        limit (optional): Maximum number of alerts to return; a falsy value
            disables the limit. Defaults to 1000.
        timeframe (optional): Number of days to look back from now (UTC).
        upper_timestamp (optional): Exclusive upper bound on the alert timestamp.
        lower_timestamp (optional): Inclusive lower bound on the alert timestamp.
        skip_alerts_with_null_timestamp (optional): Drop alerts whose timestamp
            is NULL. Defaults to True.
        sort_ascending (optional): Oldest first when True, newest first otherwise.

    Returns:
        List[Alert]: The matching Alert objects."""
    # Collect every WHERE condition first, in a fixed order, then apply once.
    conditions = [Alert.tenant_id == tenant_id]
    if timeframe:
        window_start = datetime.now(tz=timezone.utc) - timedelta(days=timeframe)
        conditions.append(Alert.timestamp >= window_start)
    if upper_timestamp is not None:
        conditions.append(Alert.timestamp < upper_timestamp)
    if lower_timestamp is not None:
        conditions.append(Alert.timestamp >= lower_timestamp)
    if provider_id:
        conditions.append(Alert.provider_id == provider_id)
    if skip_alerts_with_null_timestamp:
        conditions.append(Alert.timestamp.isnot(None))

    ordering = Alert.timestamp.asc() if sort_ascending else Alert.timestamp.desc()

    with Session(engine) as session:
        query = (
            session.query(Alert)
            # subqueryload forces the alert_enrichment relationship to be loaded
            .options(subqueryload(Alert.alert_enrichment))
            .filter(*conditions)
            .order_by(ordering)
        )
        if limit:
            query = query.limit(limit)
        return query.all()
.select_from(LastAlert) .join(Alert, LastAlert.alert_id == Alert.id) .where(LastAlert.tenant_id == tenant_id) .where(Alert.tenant_id == tenant_id) ) if timeframe: stmt = stmt.where( LastAlert.timestamp >= datetime.now(tz=timezone.utc) - timedelta(days=timeframe) ) # Apply additional filters filter_conditions = [] if upper_timestamp is not None: filter_conditions.append(LastAlert.timestamp < upper_timestamp) if lower_timestamp is not None: filter_conditions.append(LastAlert.timestamp >= lower_timestamp) if fingerprints: filter_conditions.append(LastAlert.fingerprint.in_(tuple(fingerprints))) logger.info(f"filter_conditions: {filter_conditions}") if filter_conditions: stmt = stmt.where(*filter_conditions) # Main query for alerts stmt = stmt.options(subqueryload(Alert.alert_enrichment)) if with_incidents: if dialect_name == "sqlite": # SQLite version - using JSON incidents_subquery = ( select( LastAlertToIncident.fingerprint, func.json_group_array( cast(LastAlertToIncident.incident_id, String) ).label("incidents"), ) .where( LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, ) .group_by(LastAlertToIncident.fingerprint) .subquery() ) elif dialect_name == "mysql": # MySQL version - using GROUP_CONCAT incidents_subquery = ( select( LastAlertToIncident.fingerprint, func.group_concat( cast(LastAlertToIncident.incident_id, String) ).label("incidents"), ) .where( LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, ) .group_by(LastAlertToIncident.fingerprint) .subquery() ) elif dialect_name == "postgresql": # PostgreSQL version - using string_agg incidents_subquery = ( select( LastAlertToIncident.fingerprint, func.string_agg( cast(LastAlertToIncident.incident_id, String), ",", ).label("incidents"), ) .where( LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, ) .group_by(LastAlertToIncident.fingerprint) .subquery() ) else: raise 
def get_alerts_by_fingerprint(
    tenant_id: str,
    fingerprint: str,
    limit=1,
    status=None,
    with_alert_instance_enrichment=False,
) -> List[Alert]:
    """
    Get the alerts recorded for a fingerprint, newest first.

    Args:
        tenant_id (str): The tenant_id to filter the alerts by.
        fingerprint (str): The fingerprint to filter the alerts by.
        limit: Maximum number of alerts to return; a falsy value disables it.
        status: When truthy, keep only alerts whose event "status" equals it.
        with_alert_instance_enrichment: Also load the per-instance enrichment
            relationship.

    Returns:
        List[Alert]: A list of Alert objects.
    """
    # alert_enrichment is always loaded; the instance enrichment only on request.
    loaders = [subqueryload(Alert.alert_enrichment)]
    if with_alert_instance_enrichment:
        loaders.append(subqueryload(Alert.alert_instance_enrichment))

    with Session(engine) as session:
        query = (
            session.query(Alert)
            .options(*loaders)
            .filter(Alert.tenant_id == tenant_id)
            .filter(Alert.fingerprint == fingerprint)
            .order_by(Alert.timestamp.desc())
        )
        if status:
            status_field = get_json_extract_field(session, Alert.event, "status")
            query = query.filter(status_field == status)
        if limit:
            query = query.limit(limit)
        return query.all()
def get_user_by_api_key(api_key: str):
    """
    Resolve the creator of an API key.

    Args:
        api_key (str): The raw (unhashed) API key.

    Returns:
        The ``created_by`` value of the matching key, or None when
        ``get_api_key`` finds no key.
    """
    tenant_api_key = get_api_key(api_key)
    if tenant_api_key is None:
        # Unknown key: report "no user" instead of failing with an
        # AttributeError on None.
        return None
    return tenant_api_key.created_by
session.exec(select(User).where(User.tenant_id == tenant_id)).all() return users def delete_user(username): from keep.api.models.db.user import User with Session(engine) as session: user = session.exec( select(User) .where(User.tenant_id == SINGLE_TENANT_UUID) .where(User.username == username) ).first() if user: session.delete(user) session.commit() def user_exists(tenant_id, username): from keep.api.models.db.user import User with Session(engine) as session: user = session.exec( select(User) .where(User.tenant_id == tenant_id) .where(User.username == username) ).first() return user is not None def create_user(tenant_id, username, password, role): from keep.api.models.db.user import User password_hash = hashlib.sha256(password.encode()).hexdigest() with Session(engine) as session: user = User( tenant_id=tenant_id, username=username, password_hash=password_hash, role=role, ) session.add(user) session.commit() session.refresh(user) return user def update_user_last_sign_in(tenant_id, username): from keep.api.models.db.user import User with Session(engine) as session: user = session.exec( select(User) .where(User.tenant_id == tenant_id) .where(User.username == username) ).first() if user: user.last_sign_in = datetime.utcnow() session.add(user) session.commit() return user def update_user_role(tenant_id, username, role): from keep.api.models.db.user import User with Session(engine) as session: user = session.exec( select(User) .where(User.tenant_id == tenant_id) .where(User.username == username) ).first() if user and user.role != role: user.role = role session.add(user) session.commit() return user def save_workflow_results(tenant_id, workflow_execution_id, workflow_results): with Session(engine) as session: workflow_execution = session.exec( select(WorkflowExecution) .where(WorkflowExecution.tenant_id == tenant_id) .where(WorkflowExecution.id == workflow_execution_id) ).one() try: # backward comptability - try to serialize the workflow results 
json.dumps(workflow_results) # if that's ok, use the original way workflow_execution.results = workflow_results except Exception: # if that's not ok, use the Keep way (e.g. alerdto is not json serializable) logger.warning( "Failed to serialize workflow results, using fastapi encoder", ) # use some other way to serialize the workflow results workflow_execution.results = custom_serialize(workflow_results) # commit the changes session.commit() def get_workflow_by_name(tenant_id, workflow_name): with Session(engine) as session: workflow = session.exec( select(Workflow) .where(Workflow.tenant_id == tenant_id) .where(Workflow.name == workflow_name) .where(Workflow.is_deleted == False) .where(Workflow.is_test == False) ).first() return workflow def get_previous_execution_id(tenant_id, workflow_id, workflow_execution_id): with Session(engine) as session: previous_execution = session.exec( select(WorkflowExecution) .where(WorkflowExecution.tenant_id == tenant_id) .where(WorkflowExecution.workflow_id == workflow_id) .where(WorkflowExecution.id != workflow_execution_id) .where(WorkflowExecution.is_test_run == False) .where( WorkflowExecution.started >= datetime.now() - timedelta(days=1) ) # no need to check more than 1 day ago .order_by(WorkflowExecution.started.desc()) .limit(1) ).first() if previous_execution: return previous_execution else: return None def create_rule( tenant_id, name, timeframe, timeunit, definition, definition_cel, created_by, grouping_criteria=None, group_description=None, require_approve=False, resolve_on=ResolveOn.NEVER.value, create_on=CreateIncidentOn.ANY.value, incident_name_template=None, incident_prefix=None, multi_level=False, multi_level_property_name=None, threshold=1, assignee=None, ): grouping_criteria = grouping_criteria or [] with Session(engine) as session: rule = Rule( tenant_id=tenant_id, name=name, timeframe=timeframe, timeunit=timeunit, definition=definition, definition_cel=definition_cel, created_by=created_by, 
creation_time=datetime.utcnow(), grouping_criteria=grouping_criteria, group_description=group_description, require_approve=require_approve, resolve_on=resolve_on, create_on=create_on, incident_name_template=incident_name_template, incident_prefix=incident_prefix, multi_level=multi_level, multi_level_property_name=multi_level_property_name, threshold=threshold, assignee=assignee, ) session.add(rule) session.commit() session.refresh(rule) return rule def update_rule( tenant_id, rule_id, name, timeframe, timeunit, definition, definition_cel, updated_by, grouping_criteria, require_approve, resolve_on, create_on, incident_name_template, incident_prefix, multi_level, multi_level_property_name, threshold, assignee=None, ): rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: return False with Session(engine) as session: rule = session.exec( select(Rule).where(Rule.tenant_id == tenant_id).where(Rule.id == rule_uuid) ).first() if rule: rule.name = name rule.timeframe = timeframe rule.timeunit = timeunit rule.definition = definition rule.definition_cel = definition_cel rule.grouping_criteria = grouping_criteria rule.require_approve = require_approve rule.updated_by = updated_by rule.update_time = datetime.utcnow() rule.resolve_on = resolve_on rule.create_on = create_on rule.incident_name_template = incident_name_template rule.incident_prefix = incident_prefix rule.multi_level = multi_level rule.multi_level_property_name = multi_level_property_name rule.threshold = threshold rule.assignee = assignee session.commit() session.refresh(rule) return rule else: return None def get_rules(tenant_id, ids=None) -> list[Rule]: with Session(engine) as session: # Start building the query query = ( select(Rule) .where(Rule.tenant_id == tenant_id) .where(Rule.is_deleted.is_(False)) ) # Apply additional filters if ids are provided if ids is not None: query = query.where(Rule.id.in_(ids)) # Execute the query rules = session.exec(query).all() return rules def create_alert(tenant_id, 
provider_type, provider_id, event, fingerprint): with Session(engine) as session: alert = Alert( tenant_id=tenant_id, provider_type=provider_type, provider_id=provider_id, event=event, fingerprint=fingerprint, ) session.add(alert) session.commit() session.refresh(alert) return alert def delete_rule(tenant_id, rule_id): with Session(engine) as session: rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: return False rule = session.exec( select(Rule).where(Rule.tenant_id == tenant_id).where(Rule.id == rule_uuid) ).first() if rule and not rule.is_deleted: rule.is_deleted = True session.commit() return True return False def get_incident_for_grouping_rule( tenant_id, rule, rule_fingerprint, session: Optional[Session] = None ) -> (Optional[Incident], bool): # checks if incident with the incident criteria exists, if not it creates it # and then assign the alert to the incident with existed_or_new_session(session) as session: incident = session.exec( select(Incident) .where(Incident.tenant_id == tenant_id) .where(Incident.rule_id == rule.id) .where(Incident.rule_fingerprint == rule_fingerprint) .order_by(Incident.creation_time.desc()) ).first() # if the last alert in the incident is older than the timeframe, create a new incident is_incident_expired = False if incident and incident.status in [ IncidentStatus.RESOLVED.value, IncidentStatus.MERGED.value, IncidentStatus.DELETED.value, ]: is_incident_expired = True elif incident and incident.alerts_count > 0: enrich_incidents_with_alerts(tenant_id, [incident], session) is_incident_expired = max( alert.timestamp for alert in incident.alerts ) < datetime.utcnow() - timedelta(seconds=rule.timeframe) # if there is no incident with the rule_fingerprint, create it or existed is already expired if not incident: return None, None return incident, is_incident_expired @retry_on_db_error def create_incident_for_grouping_rule( tenant_id, rule: Rule, rule_fingerprint, incident_name: str = None, past_incident: Optional[Incident] = 
None, assignee: str | None = None, session: Optional[Session] = None, ): with existed_or_new_session(session) as session: # Create and add a new incident if it doesn't exist incident = Incident( tenant_id=tenant_id, user_generated_name=incident_name or f"{rule.name}", rule_id=rule.id, rule_fingerprint=rule_fingerprint, is_predicted=True, is_candidate=rule.require_approve, is_visible=False, # rule.create_on == CreateIncidentOn.ANY.value, incident_type=IncidentType.RULE.value, same_incident_in_the_past_id=past_incident.id if past_incident else None, resolve_on=rule.resolve_on, assignee=assignee, ) session.add(incident) session.flush() if rule.incident_prefix: incident.user_generated_name = f"{rule.incident_prefix}-{incident.running_number} - {incident.user_generated_name}" session.commit() session.refresh(incident) return incident @retry_on_db_error def create_incident_for_topology( tenant_id: str, alert_group: list[Alert], session: Session ) -> Incident: """Create a new incident from topology-connected alerts""" # Get highest severity from alerts severity = max(alert.severity for alert in alert_group) # Get all services services = set() service_names = set() for alert in alert_group: services.update(alert.service_ids) service_names.update(alert.service_names) incident = Incident( tenant_id=tenant_id, user_generated_name=f"Topology incident: Multiple alerts across {', '.join(service_names)}", severity=severity.value, status=IncidentStatus.FIRING.value, is_visible=True, incident_type=IncidentType.TOPOLOGY.value, # Set incident type for topology data={"services": list(services), "alert_count": len(alert_group)}, ) return incident def get_rule(tenant_id, rule_id): with Session(engine) as session: rule = session.exec( select(Rule).where(Rule.tenant_id == tenant_id).where(Rule.id == rule_id) ).first() return rule def get_rule_incidents_count_db(tenant_id): with Session(engine) as session: query = ( session.query(Incident.rule_id, func.count(Incident.id)) 
.select_from(Incident) .filter(Incident.tenant_id == tenant_id, col(Incident.rule_id).isnot(None)) .group_by(Incident.rule_id) ) return dict(query.all()) def get_rule_distribution(tenant_id, minute=False): """Returns hits per hour for each rule, optionally breaking down by groups if the rule has 'group by', limited to the last 7 days.""" with Session(engine) as session: # Get the timestamp for 7 days ago seven_days_ago = datetime.utcnow() - timedelta(days=1) # Check the dialect if session.bind.dialect.name == "mysql": time_format = "%Y-%m-%d %H:%i" if minute else "%Y-%m-%d %H" timestamp_format = func.date_format( LastAlertToIncident.timestamp, time_format ) elif session.bind.dialect.name == "postgresql": time_format = "YYYY-MM-DD HH:MI" if minute else "YYYY-MM-DD HH" timestamp_format = func.to_char(LastAlertToIncident.timestamp, time_format) elif session.bind.dialect.name == "sqlite": time_format = "%Y-%m-%d %H:%M" if minute else "%Y-%m-%d %H" timestamp_format = func.strftime(time_format, LastAlertToIncident.timestamp) else: raise ValueError("Unsupported database dialect") # Construct the query query = ( session.query( Rule.id.label("rule_id"), Rule.name.label("rule_name"), Incident.id.label("incident_id"), Incident.rule_fingerprint.label("rule_fingerprint"), timestamp_format.label("time"), func.count(LastAlertToIncident.fingerprint).label("hits"), ) .join(Incident, Rule.id == Incident.rule_id) .join(LastAlertToIncident, Incident.id == LastAlertToIncident.incident_id) .filter( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.timestamp >= seven_days_ago, ) .filter(Rule.tenant_id == tenant_id) # Filter by tenant_id .group_by( Rule.id, "rule_name", Incident.id, "rule_fingerprint", "time" ) # Adjusted here .order_by("time") ) results = query.all() # Convert the results into a dictionary rule_distribution = {} for result in results: rule_id = result.rule_id rule_fingerprint = result.rule_fingerprint timestamp = result.time hits = result.hits if 
rule_id not in rule_distribution: rule_distribution[rule_id] = {} if rule_fingerprint not in rule_distribution[rule_id]: rule_distribution[rule_id][rule_fingerprint] = {} rule_distribution[rule_id][rule_fingerprint][timestamp] = hits return rule_distribution def get_all_deduplication_rules(tenant_id): with Session(engine) as session: rules = session.exec( select(AlertDeduplicationRule).where( AlertDeduplicationRule.tenant_id == tenant_id ) ).all() return rules def get_deduplication_rule_by_id(tenant_id, rule_id: str): rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: return None with Session(engine) as session: rules = session.exec( select(AlertDeduplicationRule) .where(AlertDeduplicationRule.tenant_id == tenant_id) .where(AlertDeduplicationRule.id == rule_uuid) ).first() return rules def get_custom_deduplication_rule(tenant_id, provider_id, provider_type): with Session(engine) as session: rule = session.exec( select(AlertDeduplicationRule) .where(AlertDeduplicationRule.tenant_id == tenant_id) .where(AlertDeduplicationRule.provider_id == provider_id) .where(AlertDeduplicationRule.provider_type == provider_type) ).first() return rule def create_deduplication_rule( tenant_id: str, name: str, description: str, provider_id: str | None, provider_type: str, created_by: str, enabled: bool = True, fingerprint_fields: list[str] = [], full_deduplication: bool = False, ignore_fields: list[str] = [], priority: int = 0, is_provisioned: bool = False, ): with Session(engine) as session: new_rule = AlertDeduplicationRule( tenant_id=tenant_id, name=name, description=description, provider_id=provider_id, provider_type=provider_type, last_updated_by=created_by, # on creation, last_updated_by is the same as created_by created_by=created_by, enabled=enabled, fingerprint_fields=fingerprint_fields, full_deduplication=full_deduplication, ignore_fields=ignore_fields, priority=priority, is_provisioned=is_provisioned, ) session.add(new_rule) session.commit() session.refresh(new_rule) 
return new_rule def update_deduplication_rule( rule_id: str, tenant_id: str, name: str, description: str, provider_id: str | None, provider_type: str, last_updated_by: str, enabled: bool = True, fingerprint_fields: list[str] = [], full_deduplication: bool = False, ignore_fields: list[str] = [], priority: int = 0, ): rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: return False with Session(engine) as session: rule = session.exec( select(AlertDeduplicationRule) .where(AlertDeduplicationRule.id == rule_uuid) .where(AlertDeduplicationRule.tenant_id == tenant_id) ).first() if not rule: raise ValueError(f"No deduplication rule found with id {rule_id}") rule.name = name rule.description = description rule.provider_id = provider_id rule.provider_type = provider_type rule.last_updated_by = last_updated_by rule.enabled = enabled rule.fingerprint_fields = fingerprint_fields rule.full_deduplication = full_deduplication rule.ignore_fields = ignore_fields rule.priority = priority session.add(rule) session.commit() session.refresh(rule) return rule def delete_deduplication_rule(rule_id: str, tenant_id: str) -> bool: rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: return False with Session(engine) as session: rule = session.exec( select(AlertDeduplicationRule) .where(AlertDeduplicationRule.id == rule_uuid) .where(AlertDeduplicationRule.tenant_id == tenant_id) ).first() if not rule: return False session.delete(rule) session.commit() return True def create_deduplication_event( tenant_id, deduplication_rule_id, deduplication_type, provider_id, provider_type ): logger.debug( "Adding deduplication event", extra={ "deduplication_rule_id": deduplication_rule_id, "deduplication_type": deduplication_type, "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) if isinstance(deduplication_rule_id, str): deduplication_rule_id = __convert_to_uuid(deduplication_rule_id) if not deduplication_rule_id: logger.debug( "Deduplication rule id is not 
a valid uuid", extra={ "deduplication_rule_id": deduplication_rule_id, "tenant_id": tenant_id, }, ) return False with Session(engine) as session: deduplication_event = AlertDeduplicationEvent( tenant_id=tenant_id, deduplication_rule_id=deduplication_rule_id, deduplication_type=deduplication_type, provider_id=provider_id, provider_type=provider_type, timestamp=datetime.now(tz=timezone.utc), date_hour=datetime.now(tz=timezone.utc).replace( minute=0, second=0, microsecond=0 ), ) session.add(deduplication_event) session.commit() logger.debug( "Deduplication event added", extra={ "deduplication_event_id": deduplication_event.id, "tenant_id": tenant_id, }, ) def get_all_deduplication_stats(tenant_id): with Session(engine) as session: # Query to get all-time deduplication stats all_time_query = ( select( AlertDeduplicationEvent.deduplication_rule_id, AlertDeduplicationEvent.provider_id, AlertDeduplicationEvent.provider_type, AlertDeduplicationEvent.deduplication_type, func.count(AlertDeduplicationEvent.id).label("dedup_count"), ) .where(AlertDeduplicationEvent.tenant_id == tenant_id) .group_by( AlertDeduplicationEvent.deduplication_rule_id, AlertDeduplicationEvent.provider_id, AlertDeduplicationEvent.provider_type, AlertDeduplicationEvent.deduplication_type, ) ) all_time_results = session.exec(all_time_query).all() # Query to get alerts distribution in the last 24 hours twenty_four_hours_ago = datetime.utcnow() - timedelta(hours=24) alerts_last_24_hours_query = ( select( AlertDeduplicationEvent.deduplication_rule_id, AlertDeduplicationEvent.provider_id, AlertDeduplicationEvent.provider_type, AlertDeduplicationEvent.date_hour, func.count(AlertDeduplicationEvent.id).label("hourly_count"), ) .where(AlertDeduplicationEvent.tenant_id == tenant_id) .where(AlertDeduplicationEvent.date_hour >= twenty_four_hours_ago) .group_by( AlertDeduplicationEvent.deduplication_rule_id, AlertDeduplicationEvent.provider_id, AlertDeduplicationEvent.provider_type, 
AlertDeduplicationEvent.date_hour, ) ) alerts_last_24_hours_results = session.exec(alerts_last_24_hours_query).all() # Create a dictionary with deduplication stats for each rule stats = {} current_hour = datetime.utcnow().replace(minute=0, second=0, microsecond=0) for result in all_time_results: provider_id = result.provider_id provider_type = result.provider_type dedup_count = result.dedup_count dedup_type = result.deduplication_type # alerts without provider_id and provider_type are considered as "keep" if not provider_type: provider_type = "keep" key = str(result.deduplication_rule_id) if key not in stats: # initialize the stats for the deduplication rule stats[key] = { "full_dedup_count": 0, "partial_dedup_count": 0, "none_dedup_count": 0, "alerts_last_24_hours": [ {"hour": (current_hour - timedelta(hours=i)).hour, "number": 0} for i in range(0, 24) ], "provider_id": provider_id, "provider_type": provider_type, } if dedup_type == "full": stats[key]["full_dedup_count"] += dedup_count elif dedup_type == "partial": stats[key]["partial_dedup_count"] += dedup_count elif dedup_type == "none": stats[key]["none_dedup_count"] += dedup_count # Add alerts distribution from the last 24 hours for result in alerts_last_24_hours_results: provider_id = result.provider_id provider_type = result.provider_type date_hour = result.date_hour hourly_count = result.hourly_count key = str(result.deduplication_rule_id) if not provider_type: provider_type = "keep" if key in stats: hours_ago = int((current_hour - date_hour).total_seconds() / 3600) if 0 <= hours_ago < 24: stats[key]["alerts_last_24_hours"][23 - hours_ago][ "number" ] = hourly_count return stats def get_last_alert_hashes_by_fingerprints( tenant_id, fingerprints: list[str] ) -> dict[str, str | None]: # get the last alert hashes for a list of fingerprints # to check deduplication with Session(engine) as session: query = ( select(LastAlert.fingerprint, LastAlert.alert_hash) .where(LastAlert.tenant_id == tenant_id) 
.where(LastAlert.fingerprint.in_(fingerprints)) ) results = session.execute(query).all() # Create a dictionary from the results alert_hash_dict = { fingerprint: alert_hash for fingerprint, alert_hash in results if alert_hash is not None } return alert_hash_dict def update_key_last_used( tenant_id: str, reference_id: str, max_retries=3, ) -> str: """ Updates API key last used. Args: session (Session): _description_ tenant_id (str): _description_ reference_id (str): _description_ Returns: str: _description_ """ with Session(engine) as session: # Get API Key from database statement = ( select(TenantApiKey) .where(TenantApiKey.reference_id == reference_id) .where(TenantApiKey.tenant_id == tenant_id) ) tenant_api_key_entry = session.exec(statement).first() # Update last used if not tenant_api_key_entry: # shouldn't happen but somehow happened to specific tenant so logging it logger.error( "API key not found", extra={"tenant_id": tenant_id, "unique_api_key_id": reference_id}, ) return tenant_api_key_entry.last_used = datetime.utcnow() for attempt in range(max_retries): try: session.add(tenant_api_key_entry) session.commit() break except StaleDataError as ex: if "expected to update" in ex.args[0]: logger.info( f"Phantom read detected while updating API key `{reference_id}`, retry #{attempt}" ) session.rollback() continue else: raise def get_linked_providers(tenant_id: str) -> List[Tuple[str, str, datetime]]: # Alert table may be too huge, so cutting the query without mercy LIMIT_BY_ALERTS = 10000 with Session(engine) as session: alerts_subquery = ( select(Alert) .filter(Alert.tenant_id == tenant_id, Alert.provider_type != "group") .limit(LIMIT_BY_ALERTS) .subquery() ) providers = session.exec( select( alerts_subquery.c.provider_type, alerts_subquery.c.provider_id, func.max(alerts_subquery.c.timestamp).label("last_alert_timestamp"), ) .select_from(alerts_subquery) .filter(~exists().where(Provider.id == alerts_subquery.c.provider_id)) 
.group_by(alerts_subquery.c.provider_type, alerts_subquery.c.provider_id) ).all() return providers def is_linked_provider(tenant_id: str, provider_id: str) -> bool: with Session(engine) as session: query = session.query(Alert.provider_id) # Add FORCE INDEX hint only for MySQL if engine.dialect.name == "mysql": query = query.with_hint(Alert, "FORCE INDEX (idx_alert_tenant_provider)") linked_provider = ( query.outerjoin(Provider, Alert.provider_id == Provider.id) .filter( Alert.tenant_id == tenant_id, Alert.provider_id == provider_id, Provider.id == None, ) .first() ) return linked_provider is not None def get_provider_distribution( tenant_id: str, aggregate_all: bool = False, timestamp_filter: TimeStampFilter = None, ) -> ( list[dict[str, int | Any]] | dict[str, dict[str, datetime | list[dict[str, int]] | Any]] ): """ Calculate the distribution of incidents created over time for a specific tenant. Args: tenant_id (str): ID of the tenant whose incidents are being queried. timestamp_filter (TimeStampFilter, optional): Filter to specify the time range. - lower_timestamp (datetime): Start of the time range. - upper_timestamp (datetime): End of the time range. Returns: List[dict]: A list of dictionaries representing the hourly distribution of incidents. Each dictionary contains: - 'timestamp' (str): Timestamp of the hour in "YYYY-MM-DD HH:00" format. - 'number' (int): Number of incidents created in that hour. Notes: - If no timestamp_filter is provided, defaults to the last 24 hours. - Supports MySQL, PostgreSQL, and SQLite for timestamp formatting. 
""" with Session(engine) as session: twenty_four_hours_ago = datetime.utcnow() - timedelta(hours=24) time_format = "%Y-%m-%d %H" filters = [Alert.tenant_id == tenant_id] if timestamp_filter: if timestamp_filter.lower_timestamp: filters.append(Alert.timestamp >= timestamp_filter.lower_timestamp) if timestamp_filter.upper_timestamp: filters.append(Alert.timestamp <= timestamp_filter.upper_timestamp) else: filters.append(Alert.timestamp >= twenty_four_hours_ago) if session.bind.dialect.name == "mysql": timestamp_format = func.date_format(Alert.timestamp, time_format) elif session.bind.dialect.name == "postgresql": # PostgreSQL requires a different syntax for the timestamp format # cf: https://www.postgresql.org/docs/current/functions-formatting.html#FUNCTIONS-FORMATTING timestamp_format = func.to_char(Alert.timestamp, "YYYY-MM-DD HH") elif session.bind.dialect.name == "sqlite": timestamp_format = func.strftime(time_format, Alert.timestamp) if aggregate_all: # Query for combined alert distribution across all providers query = ( session.query( timestamp_format.label("time"), func.count().label("hits") ) .filter(*filters) .group_by("time") .order_by("time") ) results = query.all() results = {str(time): hits for time, hits in results} # Create a complete list of timestamps within the specified range distribution = [] current_time = timestamp_filter.lower_timestamp.replace( minute=0, second=0, microsecond=0 ) while current_time <= timestamp_filter.upper_timestamp: timestamp_str = current_time.strftime(time_format) distribution.append( { "timestamp": timestamp_str + ":00", "number": results.get(timestamp_str, 0), } ) current_time += timedelta(hours=1) return distribution else: # Query for alert distribution grouped by provider query = ( session.query( Alert.provider_id, Alert.provider_type, timestamp_format.label("time"), func.count().label("hits"), func.max(Alert.timestamp).label("last_alert_timestamp"), ) .filter(*filters) .group_by(Alert.provider_id, Alert.provider_type, 
"time") .order_by(Alert.provider_id, Alert.provider_type, "time") ) results = query.all() provider_distribution = {} for provider_id, provider_type, time, hits, last_alert_timestamp in results: provider_key = f"{provider_id}_{provider_type}" last_alert_timestamp = ( datetime.fromisoformat(last_alert_timestamp) if isinstance(last_alert_timestamp, str) else last_alert_timestamp ) if provider_key not in provider_distribution: provider_distribution[provider_key] = { "provider_id": provider_id, "provider_type": provider_type, "alert_last_24_hours": [ {"hour": i, "number": 0} for i in range(24) ], "last_alert_received": last_alert_timestamp, } else: provider_distribution[provider_key]["last_alert_received"] = max( provider_distribution[provider_key]["last_alert_received"], last_alert_timestamp, ) time = datetime.strptime(time, time_format) index = int((time - twenty_four_hours_ago).total_seconds() // 3600) if 0 <= index < 24: provider_distribution[provider_key]["alert_last_24_hours"][index][ "number" ] += hits return provider_distribution def get_combined_workflow_execution_distribution( tenant_id: str, timestamp_filter: TimeStampFilter = None ): """ Calculate the distribution of WorkflowExecutions started over time, combined across all workflows for a specific tenant. Args: tenant_id (str): ID of the tenant whose workflow executions are being analyzed. timestamp_filter (TimeStampFilter, optional): Filter to specify the time range. - lower_timestamp (datetime): Start of the time range. - upper_timestamp (datetime): End of the time range. Returns: List[dict]: A list of dictionaries representing the hourly distribution of workflow executions. Each dictionary contains: - 'timestamp' (str): Timestamp of the hour in "YYYY-MM-DD HH:00" format. - 'number' (int): Number of workflow executions started in that hour. Notes: - If no timestamp_filter is provided, defaults to the last 24 hours. - Supports MySQL, PostgreSQL, and SQLite for timestamp formatting. 
""" with Session(engine) as session: twenty_four_hours_ago = datetime.utcnow() - timedelta(hours=24) time_format = "%Y-%m-%d %H" filters = [WorkflowExecution.tenant_id == tenant_id] if timestamp_filter: if timestamp_filter.lower_timestamp: filters.append( WorkflowExecution.started >= timestamp_filter.lower_timestamp ) if timestamp_filter.upper_timestamp: filters.append( WorkflowExecution.started <= timestamp_filter.upper_timestamp ) else: filters.append(WorkflowExecution.started >= twenty_four_hours_ago) # Database-specific timestamp formatting if session.bind.dialect.name == "mysql": timestamp_format = func.date_format(WorkflowExecution.started, time_format) elif session.bind.dialect.name == "postgresql": timestamp_format = func.to_char(WorkflowExecution.started, "YYYY-MM-DD HH") elif session.bind.dialect.name == "sqlite": timestamp_format = func.strftime(time_format, WorkflowExecution.started) # Query for combined execution count across all workflows query = ( session.query( timestamp_format.label("time"), func.count().label("executions"), ) .filter(*filters) .group_by("time") .order_by("time") ) results = {str(time): executions for time, executions in query.all()} distribution = [] current_time = timestamp_filter.lower_timestamp.replace( minute=0, second=0, microsecond=0 ) while current_time <= timestamp_filter.upper_timestamp: timestamp_str = current_time.strftime(time_format) distribution.append( { "timestamp": timestamp_str + ":00", "number": results.get(timestamp_str, 0), } ) current_time += timedelta(hours=1) return distribution def get_incidents_created_distribution( tenant_id: str, timestamp_filter: TimeStampFilter = None ): """ Calculate the distribution of incidents created over time for a specific tenant. Args: tenant_id (str): ID of the tenant whose incidents are being queried. timestamp_filter (TimeStampFilter, optional): Filter to specify the time range. - lower_timestamp (datetime): Start of the time range. 
def calc_incidents_mttr(tenant_id: str, timestamp_filter: TimeStampFilter = None):
    """
    Calculate the Mean Time to Resolve (MTTR) for incidents over time for a specific tenant.

    Args:
        tenant_id (str): ID of the tenant whose incidents are being analyzed.
        timestamp_filter (TimeStampFilter, optional): Filter to specify the time range.
            - lower_timestamp (datetime): Start of the time range.
            - upper_timestamp (datetime): End of the time range.

    Returns:
        List[dict]: A list of dictionaries representing the hourly MTTR of incidents.
            Each dictionary contains:
            - 'timestamp' (str): Timestamp of the hour in "YYYY-MM-DD HH:00" format.
            - 'mttr' (float): Mean Time to Resolve incidents in that hour (in hours).

    Raises:
        ValueError: If the session is bound to a dialect other than MySQL,
            PostgreSQL or SQLite.

    Notes:
        - If no timestamp_filter is provided, defaults to the last 24 hours.
        - A missing upper bound defaults to "now"; a missing lower bound defaults
          to 24 hours before the upper bound (for the returned hourly buckets).
        - Only includes resolved incidents.
        - Supports MySQL, PostgreSQL, and SQLite for timestamp formatting.
    """
    with Session(engine) as session:
        now = datetime.utcnow()
        twenty_four_hours_ago = now - timedelta(hours=24)
        time_format = "%Y-%m-%d %H"

        lower_timestamp = timestamp_filter.lower_timestamp if timestamp_filter else None
        upper_timestamp = timestamp_filter.upper_timestamp if timestamp_filter else None

        filters = [
            Incident.tenant_id == tenant_id,
            Incident.status == IncidentStatus.RESOLVED.value,
        ]
        if timestamp_filter:
            if lower_timestamp:
                filters.append(Incident.creation_time >= lower_timestamp)
            if upper_timestamp:
                filters.append(Incident.creation_time <= upper_timestamp)
        else:
            filters.append(Incident.creation_time >= twenty_four_hours_ago)

        # The hourly buckets below need concrete bounds even when the filter is
        # absent or only partially filled in.
        if upper_timestamp:
            range_end = upper_timestamp
        elif lower_timestamp and lower_timestamp.tzinfo:
            # Keep both bounds comparable when the caller passed an aware datetime.
            range_end = datetime.now(tz=lower_timestamp.tzinfo)
        else:
            range_end = now
        range_start = lower_timestamp or (range_end - timedelta(hours=24))

        # Database-specific timestamp formatting
        dialect_name = session.bind.dialect.name
        if dialect_name == "mysql":
            timestamp_format = func.date_format(Incident.creation_time, time_format)
        elif dialect_name == "postgresql":
            # "HH24" is the 24-hour clock; plain "HH" is 12-hour and would never
            # line up with the "%H" keys generated below.
            timestamp_format = func.to_char(Incident.creation_time, "YYYY-MM-DD HH24")
        elif dialect_name == "sqlite":
            timestamp_format = func.strftime(time_format, Incident.creation_time)
        else:
            raise ValueError(f"Unsupported database dialect: {dialect_name}")

        query = (
            session.query(
                timestamp_format.label("time"),
                Incident.start_time,
                Incident.end_time,
                func.count().label("incidents"),
            )
            .filter(*filters)
            .group_by("time", Incident.start_time, Incident.end_time)
            .order_by("time")
        )

        # Accumulate, per hour bucket, the incident count and the summed
        # resolution time weighted by that count.
        results = {}
        for time, start_time, end_time, incidents in query.all():
            if start_time and end_time:
                resolution_time = (
                    end_time - start_time
                ).total_seconds() / 3600  # in hours
                time_str = str(time)
                if time_str not in results:
                    results[time_str] = {"number": 0, "mttr": 0}

                results[time_str]["number"] += incidents
                results[time_str]["mttr"] += resolution_time * incidents

        distribution = []
        current_time = range_start.replace(minute=0, second=0, microsecond=0)
        while current_time <= range_end:
            timestamp_str = current_time.strftime(time_format)
            bucket = results.get(timestamp_str)
            if bucket and bucket["number"] > 0:
                avg_mttr = bucket["mttr"] / bucket["number"]
            else:
                avg_mttr = 0

            distribution.append(
                {
                    "timestamp": timestamp_str + ":00",
                    "mttr": avg_mttr,
                }
            )
            current_time += timedelta(hours=1)

        return distribution
def get_dashboards(tenant_id: str, email=None) -> List[Dict[str, Any]]:
    """
    Return the tenant's dashboards that are public or were created by `email`.

    A `dashboard_config` that comes back from the database as a string is
    parsed with `json.loads` before the dashboards are returned.
    """
    visible_to_user = or_(
        Dashboard.is_private == False,
        Dashboard.created_by == email,
    )
    statement = (
        select(Dashboard)
        .where(Dashboard.tenant_id == tenant_id)
        .where(visible_to_user)
    )

    with Session(engine) as session:
        dashboards = session.exec(statement).all()

        # for postgres, the jsonb column is returned as a string
        # so we need to parse it
        for dashboard in dashboards:
            raw_config = dashboard.dashboard_config
            if isinstance(raw_config, str):
                dashboard.dashboard_config = json.loads(raw_config)

        return dashboards
def delete_action(tenant_id: str, action_id: str) -> bool:
    """
    Delete the action with `action_id` that belongs to `tenant_id`.

    Returns:
        bool: True when a matching action was found and deleted, False otherwise.
    """
    lookup = (
        select(Action)
        .where(Action.id == action_id)
        .where(Action.tenant_id == tenant_id)
    )

    with Session(engine) as session:
        action = session.exec(lookup).first()
        if not action:
            return False

        session.delete(action)
        session.commit()
        return bool(action)
def update_preset_options(tenant_id: str, preset_id: str, options: dict) -> Preset:
    """
    Replace the stored `options` of a preset.

    Args:
        tenant_id (str): Tenant that owns the preset.
        preset_id (str): Preset id; a string is converted with `__convert_to_uuid`.
        options (dict): New value written to the preset's `options` column.

    Returns:
        Preset: The refreshed preset, or None when no preset with that id
        exists for the tenant (nothing is written in that case).
    """
    if isinstance(preset_id, str):
        preset_id = __convert_to_uuid(preset_id)

    with Session(engine) as session:
        preset = session.exec(
            select(Preset)
            .where(Preset.tenant_id == tenant_id)
            .where(Preset.id == preset_id)
        ).first()

        if preset is None:
            # Nothing to update; refreshing a missing instance would raise.
            return None

        stmt = (
            update(Preset)
            .where(Preset.id == preset_id)
            .where(Preset.tenant_id == tenant_id)
            .values(options=options)
        )
        session.execute(stmt)
        session.commit()
        # Reload so the returned object reflects the UPDATE issued above.
        session.refresh(preset)
        return preset
def get_incidents_meta_for_tenant(tenant_id: str) -> dict:
    """
    Collect the distinct assignees, sources and affected services found on the
    tenant's visible incidents.

    Returns:
        dict: `{"assignees": [...], "sources": [...], "services": [...]}`, or an
        empty dict when the query yields no row or the dialect is not one of
        sqlite, mysql or postgresql.
    """
    with Session(engine) as session:
        dialect_name = session.bind.dialect.name

        def fetch_meta_row(aggregate, sources_tbl, services_tbl):
            # The statement has the same shape for every dialect; only the
            # aggregate function and the JSON-expanding table functions differ.
            statement = (
                select(
                    aggregate(col(Incident.assignee).distinct()).label("assignees"),
                    aggregate(sources_tbl.c.value.distinct()).label("sources"),
                    aggregate(services_tbl.c.value.distinct()).label(
                        "affected_services"
                    ),
                )
                .select_from(Incident)
                .outerjoin(sources_tbl, sources_tbl.c.value.isnot(None))
                .outerjoin(services_tbl, services_tbl.c.value.isnot(None))
                .filter(Incident.tenant_id == tenant_id, Incident.is_visible == True)
            )
            return session.exec(statement).one_or_none()

        if dialect_name == "sqlite":
            row = fetch_meta_row(
                func.json_group_array,
                func.json_each(Incident.sources).table_valued("value"),
                func.json_each(Incident.affected_services).table_valued("value"),
            )
            if not row:
                return {}
            # The aggregates arrive as JSON text; drop empty/None entries.
            return {
                "assignees": list(filter(bool, json.loads(row.assignees))),
                "sources": list(filter(bool, json.loads(row.sources))),
                "services": list(filter(bool, json.loads(row.affected_services))),
            }

        if dialect_name == "mysql":
            row = fetch_meta_row(
                func.group_concat,
                func.json_table(
                    Incident.sources, Column("value", String(127))
                ).table_valued("value"),
                func.json_table(
                    Incident.affected_services, Column("value", String(127))
                ).table_valued("value"),
            )
            if not row:
                return {}
            # group_concat yields one comma-separated string per column.
            assignees = row.assignees.split(",") if row.assignees else []
            sources = row.sources.split(",") if row.sources else []
            services = (
                row.affected_services.split(",") if row.affected_services else []
            )
            return {
                "assignees": assignees,
                "sources": sources,
                "services": services,
            }

        if dialect_name == "postgresql":
            row = fetch_meta_row(
                func.json_agg,
                func.json_array_elements_text(Incident.sources).table_valued("value"),
                func.json_array_elements_text(
                    Incident.affected_services
                ).table_valued("value"),
            )
            if not row:
                return {}
            assignees, sources, affected_services = row
            return {
                "assignees": list(filter(bool, assignees)) if assignees else [],
                "sources": list(filter(bool, sources)) if sources else [],
                "services": (
                    list(filter(bool, affected_services)) if affected_services else []
                ),
            }

        return {}
def filter_query(session: Session, query, field, value):
    """
    Restrict `query` to rows whose JSON array column `field` matches `value`.

    A list `value` matches when the column shares at least one element with it;
    a scalar `value` matches when the column contains it. The SQL used depends
    on the dialect the session is bound to; for any dialect other than mysql,
    postgresql or sqlite the query is returned unchanged.
    """
    dialect_name = session.bind.dialect.name
    is_multi_value = isinstance(value, list)

    if dialect_name == "sqlite":
        # Expand the JSON array into rows and test membership with EXISTS.
        elements = func.json_each(field).table_valued("value")
        if is_multi_value:
            condition = elements.c.value.in_(value)
        else:
            condition = elements.c.value == value
        matching_element = select(1).select_from(elements).where(condition)
        return query.filter(matching_element.exists())

    if dialect_name == "mysql":
        if is_multi_value:
            return query.filter(func.json_overlaps(field, func.json_array(value)))
        return query.filter(func.json_contains(field, value))

    if dialect_name == "postgresql":
        if is_multi_value:
            return query.filter(col(field).op("?|")(func.array(value)))
        # NOTE(review): json_contains is used for the scalar case on PostgreSQL
        # as well — confirm the function is available there.
        return query.filter(func.json_contains(field, value))

    return query
def enrich_alerts_with_incidents(
    tenant_id: str, alerts: List[Alert], session: Optional[Session] = None
):
    """
    Attach to every alert the incidents its fingerprint is currently linked to.

    Links whose `deleted_at` differs from `NULL_FOR_DELETED_AT` are ignored.
    Each alert gets an `_incidents` list (empty when nothing is linked) and the
    same `alerts` list is returned.
    """
    with existed_or_new_session(session) as session:
        wanted_fingerprints = [alert.fingerprint for alert in alerts]

        active_link = and_(
            LastAlertToIncident.tenant_id == LastAlert.tenant_id,
            LastAlertToIncident.fingerprint == LastAlert.fingerprint,
            LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT,
        )
        statement = (
            select(LastAlertToIncident.fingerprint, Incident)
            .select_from(LastAlert)
            .join(LastAlertToIncident, active_link)
            .join(Incident, LastAlertToIncident.incident_id == Incident.id)
            .where(
                LastAlert.tenant_id == tenant_id,
                LastAlertToIncident.fingerprint.in_(wanted_fingerprints),
            )
        )

        # Group the (fingerprint, incident) pairs by fingerprint.
        incidents_per_alert = defaultdict(list)
        for fingerprint, incident in session.exec(statement).all():
            incidents_per_alert[fingerprint].append(incident)

        for alert in alerts:
            alert._incidents = incidents_per_alert[alert.fingerprint]

        return alerts
def get_incident_by_id(
    tenant_id: str,
    incident_id: str | UUID,
    with_alerts: bool = False,
    session: Optional[Session] = None,
) -> Optional[Incident]:
    """
    Load a single incident of a tenant together with its enrichments.

    Args:
        tenant_id (str): Tenant that owns the incident.
        incident_id (str | UUID): Incident id; a string is converted with
            `__convert_to_uuid(..., should_raise=True)`.
        with_alerts (bool): When True, the incident's alerts are pre-loaded via
            `enrich_incidents_with_alerts`.
        session (Optional[Session]): Existing session to reuse, if any.

    Returns:
        Optional[Incident]: The incident, or None when no row matches.
    """
    if isinstance(incident_id, str):
        incident_id = __convert_to_uuid(incident_id, should_raise=True)

    # Enrichments are keyed by the incident id rendered as a string.
    enrichment_matches_incident = and_(
        Incident.tenant_id == AlertEnrichment.tenant_id,
        cast(col(Incident.id), String) == foreign(AlertEnrichment.alert_fingerprint),
    )

    with existed_or_new_session(session) as session:
        row = (
            session.query(Incident, AlertEnrichment)
            .outerjoin(AlertEnrichment, enrichment_matches_incident)
            .filter(
                Incident.tenant_id == tenant_id,
                Incident.id == incident_id,
            )
            .first()
        )
        if not row:
            return None

        incident, enrichment_record = row
        if with_alerts:
            enrich_incidents_with_alerts(tenant_id, [incident], session)
        if enrichment_record:
            incident.set_enrichments(enrichment_record.enrichments)

        return incident
@retry_on_db_error
def create_incident_from_dict(
    tenant_id: str, incident_data: dict, session: Optional[Session] = None
) -> Optional[Incident]:
    """
    Persist a new incident built from a plain dict of field values.

    Args:
        tenant_id (str): Tenant the incident is created for.
        incident_data (dict): Incident field values. When "is_candidate" is
            absent it defaults to the value of "is_predicted" (False if that is
            absent too). The dict itself is not modified.
        session (Optional[Session]): Existing session to reuse, if any.

    Returns:
        Optional[Incident]: The committed and refreshed incident.
    """
    # Work on a copy so the caller's dict is left untouched.
    incident_fields = dict(incident_data)
    incident_fields.setdefault(
        "is_candidate", incident_fields.get("is_predicted", False)
    )
    # The explicit argument always wins; this also avoids a duplicate-keyword
    # TypeError when the dict already carries a "tenant_id" entry.
    incident_fields["tenant_id"] = tenant_id

    with existed_or_new_session(session) as session:
        new_incident = Incident(**incident_fields)
        session.add(new_incident)
        session.commit()
        session.refresh(new_incident)
        return new_incident
def delete_incident_by_id(
    tenant_id: str, incident_id: UUID, session: Optional[Session] = None
) -> bool:
    """
    Soft-delete an incident by setting its status to `IncidentStatus.DELETED`.

    Args:
        tenant_id (str): Tenant that owns the incident.
        incident_id (UUID): Incident id; a string is converted with
            `__convert_to_uuid`.
        session (Optional[Session]): Existing session to reuse, if any.

    Returns:
        bool: True when the incident was found and marked as deleted, False
        when no incident with that id exists for the tenant.
    """
    if isinstance(incident_id, str):
        incident_id = __convert_to_uuid(incident_id)

    with existed_or_new_session(session) as session:
        incident = session.exec(
            select(Incident).filter(
                Incident.tenant_id == tenant_id,
                Incident.id == incident_id,
            )
        ).first()

        if incident is None:
            # Without this guard `incident.id` below raises AttributeError.
            return False

        session.execute(
            update(Incident)
            .where(
                Incident.tenant_id == tenant_id,
                Incident.id == incident.id,
            )
            .values({"status": IncidentStatus.DELETED.value})
        )
        session.commit()
        return True
def get_alerts_data_for_incident(
    tenant_id: str,
    fingerprints: Optional[List[str]] = None,
    session: Optional[Session] = None,
):
    """
    Prepare aggregated data for an incident from the last alerts of the given
    fingerprints.

    Args:
        tenant_id (str): The tenant ID to filter alerts
        fingerprints (Optional[List[str]]): fingerprints whose last alerts are
            aggregated; None is treated as an empty list
        session (Optional[Session]): The database session or None

    Returns:
        dict: {
            sources: set[str],
            services: set[str],
            max_severity: IncidentSeverity (IncidentSeverity.LOW when no alert
                carries a severity),
            count: int (number of alert rows that were aggregated),
        }
    """
    # Materialize once: callers may pass a set, and None must not reach in_().
    wanted_fingerprints = list(fingerprints or [])

    with existed_or_new_session(session) as session:

        fields = (
            get_json_extract_field(session, Alert.event, "service"),
            Alert.provider_type,
            Alert.fingerprint,
            get_json_extract_field(session, Alert.event, "severity"),
        )

        alerts_data = session.exec(
            select(*fields)
            .select_from(LastAlert)
            .join(
                Alert,
                and_(
                    LastAlert.tenant_id == Alert.tenant_id,
                    LastAlert.alert_id == Alert.id,
                ),
            )
            .where(
                LastAlert.tenant_id == tenant_id,
                col(LastAlert.fingerprint).in_(wanted_fingerprints),
            )
        ).all()

        sources = []
        services = []
        severities = []

        for service, source, fingerprint, severity in alerts_data:
            if source:
                sources.append(source)
            if service:
                services.append(service)
            if severity:
                # Severity is stored either as its numeric order or as its name.
                if isinstance(severity, int):
                    severities.append(IncidentSeverity.from_number(severity))
                else:
                    severities.append(IncidentSeverity(severity))

        return {
            "sources": set(sources),
            "services": set(services),
            "max_severity": max(severities) if severities else IncidentSeverity.LOW,
            # Read by add_alerts_to_incident when override_count is set.
            "count": len(alerts_data),
        }
[ LastAlertToIncident( fingerprint=str(fingerprint), # it may sometime be UUID... incident_id=incident.id, tenant_id=tenant_id, is_created_by_ai=is_created_by_ai, ) for fingerprint in new_fingerprints ] for idx, entry in enumerate(alert_to_incident_entries): session.add(entry) if (idx + 1) % 100 == 0: logger.info( f"Added {idx + 1}/{len(alert_to_incident_entries)} alerts to incident {incident.id} in database", extra={ "tags": {"tenant_id": tenant_id, "incident_id": incident.id} }, ) session.flush() session.commit() alerts_data_for_incident = get_alerts_data_for_incident( tenant_id, new_fingerprints, session ) new_sources = list( set(incident.sources if incident.sources else []) | set(alerts_data_for_incident["sources"]) ) new_affected_services = list( set(incident.affected_services if incident.affected_services else []) | set(alerts_data_for_incident["services"]) ) if not incident.forced_severity: # If incident has alerts already, use the max severity between existing and new alerts, # otherwise use the new alerts max severity new_severity = ( max( incident.severity, alerts_data_for_incident["max_severity"].order, ) if incident.alerts_count else alerts_data_for_incident["max_severity"].order ) else: new_severity = incident.severity if not override_count: alerts_count = ( select(count(LastAlertToIncident.fingerprint)).where( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.incident_id == incident.id, ) ).scalar_subquery() else: alerts_count = alerts_data_for_incident["count"] last_received_field = get_json_extract_field( session, Alert.event, "lastReceived" ) started_at, last_seen_at = session.exec( select(func.min(last_received_field), func.max(last_received_field)) .join( LastAlertToIncident, and_( LastAlertToIncident.tenant_id == Alert.tenant_id, LastAlertToIncident.fingerprint == Alert.fingerprint, ), ) .where( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, 
def get_last_alerts_for_incidents(
    incident_ids: List[str | UUID],
) -> Dict[str, List[Alert]]:
    """
    Fetch the last alerts currently linked to each of the given incidents.

    Returns:
        Dict[str, List[Alert]]: alerts grouped by `str(incident_id)`, each group
        ordered by alert timestamp, newest first. Incidents without linked
        alerts have no key in the result.
    """
    linked_to_last_alert = and_(
        LastAlert.tenant_id == LastAlertToIncident.tenant_id,
        LastAlert.fingerprint == LastAlertToIncident.fingerprint,
    )

    with Session(engine) as session:
        rows = (
            session.query(Alert, LastAlertToIncident.incident_id)
            .select_from(LastAlert)
            .join(LastAlertToIncident, linked_to_last_alert)
            .join(Alert, LastAlert.alert_id == Alert.id)
            .filter(
                # Skip links that were removed (soft-deleted).
                LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT,
                LastAlertToIncident.incident_id.in_(incident_ids),
            )
            .order_by(Alert.timestamp.desc())
            .all()
        )

        incidents_alerts = defaultdict(list)
        for alert, incident_id in rows:
            incidents_alerts[str(incident_id)].append(alert)

        return incidents_alerts
remove_alerts_to_incident_by_incident_id( tenant_id: str, incident_id: str | UUID, fingerprints: List[str] ) -> Optional[int]: if isinstance(incident_id, str): incident_id = __convert_to_uuid(incident_id) with Session(engine) as session: incident = session.exec( select(Incident).where( Incident.tenant_id == tenant_id, Incident.id == incident_id, ) ).first() if not incident: return None # Removing alerts-to-incident relation for provided alerts_ids deleted = ( session.query(LastAlertToIncident) .where( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.incident_id == incident.id, col(LastAlertToIncident.fingerprint).in_(fingerprints), ) .update( { "deleted_at": datetime.now(datetime.now().astimezone().tzinfo), } ) ) session.commit() # Getting aggregated data for incidents for alerts which just was removed alerts_data_for_incident = get_alerts_data_for_incident( tenant_id, fingerprints, session=session ) service_field = get_json_extract_field(session, Alert.event, "service") # checking if services of removed alerts are still presented in alerts # which still assigned with the incident existed_services_query = ( select(func.distinct(service_field)) .select_from(LastAlert) .join( LastAlertToIncident, and_( LastAlert.tenant_id == LastAlertToIncident.tenant_id, LastAlert.fingerprint == LastAlertToIncident.fingerprint, ), ) .join(Alert, LastAlert.alert_id == Alert.id) .filter( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.incident_id == incident_id, service_field.in_(alerts_data_for_incident["services"]), ) ) services_existed = session.exec(existed_services_query) # checking if sources (providers) of removed alerts are still presented in alerts # which still assigned with the incident existed_sources_query = ( select(col(Alert.provider_type).distinct()) .select_from(LastAlert) .join( LastAlertToIncident, and_( LastAlert.tenant_id == LastAlertToIncident.tenant_id, 
LastAlert.fingerprint == LastAlertToIncident.fingerprint, ), ) .join(Alert, LastAlert.alert_id == Alert.id) .filter( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.incident_id == incident_id, col(Alert.provider_type).in_(alerts_data_for_incident["sources"]), ) ) sources_existed = session.exec(existed_sources_query) severity_field = get_json_extract_field(session, Alert.event, "severity") # checking if severities of removed alerts are still presented in alerts # which still assigned with the incident updated_severities_query = ( select(severity_field) .select_from(LastAlert) .join( LastAlertToIncident, and_( LastAlert.tenant_id == LastAlertToIncident.tenant_id, LastAlert.fingerprint == LastAlertToIncident.fingerprint, ), ) .join(Alert, LastAlert.alert_id == Alert.id) .filter( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.incident_id == incident_id, ) ) updated_severities_result = session.exec(updated_severities_query) updated_severities = [ get_int_severity(severity) for severity in updated_severities_result ] # Making lists of services and sources to remove from the incident services_to_remove = [ service for service in alerts_data_for_incident["services"] if service not in services_existed ] sources_to_remove = [ source for source in alerts_data_for_incident["sources"] if source not in sources_existed ] last_received_field = get_json_extract_field( session, Alert.event, "lastReceived" ) started_at, last_seen_at = session.exec( select(func.min(last_received_field), func.max(last_received_field)) .select_from(LastAlert) .join( LastAlertToIncident, and_( LastAlert.tenant_id == LastAlertToIncident.tenant_id, LastAlert.fingerprint == LastAlertToIncident.fingerprint, ), ) .join(Alert, LastAlert.alert_id == Alert.id) .where( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.incident_id == incident.id, ) ).one() # filtering removed entities from 
affected services and sources in the incident new_affected_services = [ service for service in incident.affected_services if service not in services_to_remove ] new_sources = [ source for source in incident.sources if source not in sources_to_remove ] if not incident.forced_severity: new_severity = ( max(updated_severities) if updated_severities else IncidentSeverity.LOW.order ) else: new_severity = incident.severity if isinstance(started_at, str): started_at = parse(started_at) if isinstance(last_seen_at, str): last_seen_at = parse(last_seen_at) alerts_count = ( select(count(LastAlertToIncident.fingerprint)).where( LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, LastAlertToIncident.tenant_id == tenant_id, LastAlertToIncident.incident_id == incident.id, ) ).subquery() session.exec( update(Incident) .where( Incident.id == incident_id, Incident.tenant_id == tenant_id, ) .values( alerts_count=alerts_count, last_seen_time=last_seen_at, start_time=started_at, affected_services=new_affected_services, severity=new_severity, sources=new_sources, ) ) session.commit() session.add(incident) session.refresh(incident) return deleted class DestinationIncidentNotFound(Exception): pass def merge_incidents_to_id( tenant_id: str, source_incident_ids: List[UUID], # Maybe to add optional destionation_incident_dto to merge to destination_incident_id: UUID, merged_by: str | None = None, ) -> Tuple[List[UUID], List[UUID], List[UUID]]: with Session(engine) as session: destination_incident = session.exec( select(Incident).where( Incident.tenant_id == tenant_id, Incident.id == destination_incident_id ) ).first() if not destination_incident: raise DestinationIncidentNotFound( f"Destination incident with id {destination_incident_id} not found" ) source_incidents = session.exec( select(Incident).filter( Incident.tenant_id == tenant_id, Incident.id.in_(source_incident_ids), ) ).all() enrich_incidents_with_alerts(tenant_id, source_incidents, session=session) merged_incident_ids = [] 
def get_first_alert_datetime(
    tenant_id: str,
) -> datetime | None:
    """
    Return the timestamp of the tenant's earliest alert.

    Args:
        tenant_id (str): Tenant whose alerts are inspected.

    Returns:
        datetime | None: Timestamp of the oldest alert, or None when the
        tenant has no alerts.
    """
    with Session(engine) as session:
        first_alert = (
            session.query(Alert)
            .filter(
                Alert.tenant_id == tenant_id,
            )
            # Without an explicit ordering the database may return any row,
            # so "first" would not necessarily be the earliest alert.
            .order_by(Alert.timestamp.asc())
            .first()
        )
        if first_alert:
            return first_alert.timestamp
        return None
def update_incident_summary(
    tenant_id: str, incident_id: UUID, summary: str
) -> Optional[Incident]:
    """
    Store a generated summary on an incident.

    Args:
        tenant_id (str): Tenant that owns the incident.
        incident_id (UUID): Incident to update; a string id is converted to a
            UUID first.
        summary (str): Text saved to ``generated_summary``.

    Returns:
        Optional[Incident]: The refreshed incident, or None when no incident
        with this id exists for the tenant (an error is logged in that case).
    """
    if isinstance(incident_id, str):
        incident_id = __convert_to_uuid(incident_id)

    with Session(engine) as session:
        incident = session.exec(
            select(Incident)
            .where(Incident.tenant_id == tenant_id)
            .where(Incident.id == incident_id)
        ).first()

        if not incident:
            logger.error(
                f"Incident not found for tenant {tenant_id} and incident {incident_id}",
                extra={"tenant_id": tenant_id},
            )
            return None

        incident.generated_summary = summary
        session.commit()
        session.refresh(incident)

        # Return the updated row, like update_incident_name does, instead of
        # silently returning None on success.
        return incident
def get_topology_data_by_dynamic_matcher(
    tenant_id: str, matchers_value: dict[str, str]
) -> TopologyService | None:
    """
    Find the first topology service of a tenant whose columns equal the
    given matcher values.

    Each key of ``matchers_value`` is looked up as an attribute of
    ``TopologyService`` and compared for equality with its value; all
    conditions are combined with AND.
    """
    with Session(engine) as session:
        statement = select(TopologyService).where(
            TopologyService.tenant_id == tenant_id
        )
        for column_name, expected in matchers_value.items():
            statement = statement.where(
                getattr(TopologyService, column_name) == expected
            )

        # Eagerly load applications so they stay usable after the session closes
        statement = statement.options(joinedload(TopologyService.applications))
        return session.exec(statement).first()
def bulk_upsert_alert_fields(
    tenant_id: str,
    fields: List[str],
    provider_id: str,
    provider_type: str,
    session: Optional[Session] = None,
    max_retries=3,
):
    """
    Insert alert field names for a tenant, updating provider id/type for
    fields that already exist (upsert keyed on tenant_id + field_name).

    Args:
        tenant_id (str): Tenant the fields belong to.
        fields (List[str]): Field names to upsert. An empty list is a no-op.
        provider_id (str): Provider id stored with every field.
        provider_type (str): Provider type stored with every field.
        session (Optional[Session]): Existing session to reuse; a new one is
            opened when omitted.
        max_retries (int): Number of attempts made when the database reports
            a deadlock.

    Raises:
        NotImplementedError: If the engine dialect has no upsert implementation.
        OperationalError: On any non-deadlock database error, or when the
            last attempt still hits a deadlock.
    """
    if not fields:
        # Nothing to write; an empty VALUES list is not a valid statement.
        return

    # Rows per MERGE statement on SQL Server. Each row binds 4 parameters and
    # SQL Server caps parameters per request (2100 per Microsoft docs), so
    # 500 rows stays under the limit.
    mssql_batch_size = 500

    dialect_name = engine.dialect.name

    # Prepare the data for bulk insert
    data = [
        {
            "tenant_id": tenant_id,
            "field_name": field,
            "provider_id": provider_id,
            "provider_type": provider_type,
        }
        for field in fields
    ]

    with existed_or_new_session(session) as session:
        for attempt in range(max_retries):
            try:
                if dialect_name == "postgresql":
                    stmt = pg_insert(AlertField).values(data)
                    stmt = stmt.on_conflict_do_update(
                        index_elements=[
                            "tenant_id",
                            "field_name",
                        ],  # Unique constraint columns
                        set_={
                            "provider_id": stmt.excluded.provider_id,
                            "provider_type": stmt.excluded.provider_type,
                        },
                    )
                    session.execute(stmt)
                elif dialect_name == "mysql":
                    stmt = mysql_insert(AlertField).values(data)
                    stmt = stmt.on_duplicate_key_update(
                        provider_id=stmt.inserted.provider_id,
                        provider_type=stmt.inserted.provider_type,
                    )
                    session.execute(stmt)
                elif dialect_name == "sqlite":
                    stmt = sqlite_insert(AlertField).values(data)
                    stmt = stmt.on_conflict_do_update(
                        index_elements=[
                            "tenant_id",
                            "field_name",
                        ],  # Unique constraint columns
                        set_={
                            "provider_id": stmt.excluded.provider_id,
                            "provider_type": stmt.excluded.provider_type,
                        },
                    )
                    session.execute(stmt)
                elif dialect_name == "mssql":
                    # SQL Server requires a raw query with a MERGE statement.
                    # Values are passed as bound parameters, never interpolated
                    # into the SQL text, so field names cannot inject SQL.
                    for start in range(0, len(fields), mssql_batch_size):
                        batch = fields[start : start + mssql_batch_size]
                        params = {}
                        rows = []
                        for idx, field in enumerate(batch):
                            params[f"tenant_id_{idx}"] = tenant_id
                            params[f"field_name_{idx}"] = field
                            params[f"provider_id_{idx}"] = provider_id
                            params[f"provider_type_{idx}"] = provider_type
                            rows.append(
                                f"(:tenant_id_{idx}, :field_name_{idx}, "
                                f":provider_id_{idx}, :provider_type_{idx})"
                            )
                        values = ", ".join(rows)

                        # T-SQL requires MERGE to be terminated by a semicolon.
                        merge_query = text(
                            f"""
                            MERGE INTO AlertField AS target
                            USING (VALUES {values}) AS source (tenant_id, field_name, provider_id, provider_type)
                            ON target.tenant_id = source.tenant_id AND target.field_name = source.field_name
                            WHEN MATCHED THEN
                                UPDATE SET provider_id = source.provider_id, provider_type = source.provider_type
                            WHEN NOT MATCHED THEN
                                INSERT (tenant_id, field_name, provider_id, provider_type)
                                VALUES (source.tenant_id, source.field_name, source.provider_id, source.provider_type);
                            """
                        )
                        session.execute(merge_query, params)
                else:
                    raise NotImplementedError(
                        f"Upsert not supported for {dialect_name}"
                    )

                session.commit()
                break

            except OperationalError as e:
                # Handle any potential race conditions
                session.rollback()
                if "Deadlock found" not in str(e):
                    raise e

                logger.info(
                    f"Deadlock found during bulk_upsert_alert_fields `{e}`, retry #{attempt}"
                )
                # `attempt` runs from 0 to max_retries - 1, so the last attempt
                # must re-raise; otherwise the failure is silently dropped.
                if attempt >= max_retries - 1:
                    raise e
                continue
def is_all_alerts_resolved(
    fingerprints: Optional[List[str]] = None,
    incident: Optional[Incident] = None,
    session: Optional[Session] = None,
):
    """
    Shortcut for ``is_all_alerts_in_status`` with the status fixed to
    ``AlertStatus.RESOLVED``; all other arguments are passed through as given.
    """
    resolved = AlertStatus.RESOLVED
    return is_all_alerts_in_status(
        fingerprints=fingerprints,
        incident=incident,
        status=resolved,
        session=session,
    )
def is_edge_incident_alert_resolved(
    incident: Incident, direction: Callable, session: Optional[Session] = None
) -> bool:
    """
    Tell whether the "edge" alert of an incident is resolved.

    The edge alert is the first row of the incident's alerts when ordered by
    ``direction(Alert.timestamp)``; callers in this module pass ``func.max``
    or ``func.min``. The enriched status takes precedence over the status in
    the alert event when both are present.

    Args:
        incident (Incident): Incident whose alerts are inspected.
        direction (Callable): SQL aggregate applied to ``Alert.timestamp`` in
            the ORDER BY clause.
        session (Optional[Session]): Existing session to reuse; a new one is
            opened when omitted.

    Returns:
        bool: True when the edge alert is resolved. False when it is not, or
        when the incident has no linked alerts.
    """
    if incident.alerts_count == 0:
        return False

    with existed_or_new_session(session) as session:
        enriched_status_field = get_json_extract_field(
            session, AlertEnrichment.enrichments, "status"
        )
        status_field = get_json_extract_field(session, Alert.event, "status")

        edge_row = session.exec(
            select(Alert.fingerprint, enriched_status_field, status_field)
            .select_from(Alert)
            .outerjoin(
                AlertEnrichment,
                and_(
                    Alert.tenant_id == AlertEnrichment.tenant_id,
                    Alert.fingerprint == AlertEnrichment.alert_fingerprint,
                ),
            )
            .join(
                LastAlertToIncident,
                and_(
                    LastAlertToIncident.tenant_id == Alert.tenant_id,
                    LastAlertToIncident.fingerprint == Alert.fingerprint,
                ),
            )
            .where(
                # Ignore alerts whose link to the incident was soft-deleted,
                # as the other incident queries in this module do.
                LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT,
                LastAlertToIncident.incident_id == incident.id,
            )
            .group_by(Alert.fingerprint)
            .having(func.max(Alert.timestamp))
            .order_by(direction(Alert.timestamp))
        ).first()

        # alerts_count can be out of date; with no matching row there is
        # nothing resolved to report (and nothing to unpack).
        if edge_row is None:
            return False

        _fingerprint, enriched_status, status = edge_row

        return enriched_status == AlertStatus.RESOLVED.value or (
            enriched_status is None and status == AlertStatus.RESOLVED.value
        )
def get_table_class(table_name: str) -> Type[SQLModel]:
    """
    Get the SQLModel table class dynamically based on table name.
    Assumes table classes follow PascalCase naming convention.

    The class name is derived from the snake_case table name. Several
    spellings are tried in order: the name with all trailing "s" characters
    stripped (the historical behaviour), the name with a single plural "s"
    removed, and the name as given. This lets names that legitimately end in
    "s" (e.g. "address") resolve as well.

    Args:
        table_name (str): Name of the table in snake_case (e.g. "alerts", "rules")

    Returns:
        Type[SQLModel]: The corresponding SQLModel table class

    Raises:
        ValueError: If no SQLModel class matches any candidate class name.
    """

    def to_pascal_case(name: str) -> str:
        return "".join(word.capitalize() for word in name.split("_"))

    # Candidate class names, most backward-compatible spelling first
    candidates = []
    for name in (
        table_name.rstrip("s"),
        table_name.removesuffix("s"),
        table_name,
    ):
        class_name = to_pascal_case(name)
        if class_name not in candidates:
            candidates.append(class_name)

    # Direct SQLModel subclasses, exactly as before
    model_classes = {
        cls.__name__: cls
        for cls in SQLModel.__subclasses__()
        if hasattr(cls, "__tablename__")
    }

    # __subclasses__() only lists direct children, so also walk deeper levels
    # to find models that inherit through an intermediate base class.
    # Classes found at a shallower level keep priority.
    seen = set()
    pending = list(SQLModel.__subclasses__())
    while pending:
        cls = pending.pop(0)
        if cls in seen:
            continue
        seen.add(cls)
        pending.extend(cls.__subclasses__())
        if hasattr(cls, "__tablename__"):
            model_classes.setdefault(cls.__name__, cls)

    for class_name in candidates:
        if class_name in model_classes:
            return model_classes[class_name]

    raise ValueError(f"No table class found for table name: {table_name}")
def set_last_alert(
    tenant_id: str, alert: Alert, session: Optional[Session] = None, max_retries=3
) -> None:
    """
    Point the LastAlert row of the alert's fingerprint at this alert.

    A new LastAlert row is created when none exists. An existing row is only
    updated when its timestamp is older than the alert's, so a late retry of
    an older alert cannot overwrite a newer one.

    Args:
        tenant_id (str): Tenant that owns the alert.
        alert (Alert): Alert that was just stored.
        session (Optional[Session]): Existing session to reuse; a new one is
            opened when omitted.
        max_retries (int): Number of attempts made on retryable database errors.

    Raises:
        OperationalError: On a non-retryable database error, or when a
            retryable one ("no such savepoint", "Deadlock found") still
            happens on the last attempt.
    """
    fingerprint = alert.fingerprint
    logger.info(f"Setting last alert for `{fingerprint}`")
    with existed_or_new_session(session) as session:
        for attempt in range(max_retries):
            logger.info(
                f"Attempt {attempt} to set last alert for `{fingerprint}`",
                extra={
                    "alert_id": alert.id,
                    "tenant_id": tenant_id,
                    "fingerprint": fingerprint,
                },
            )
            try:
                last_alert = get_last_alert_by_fingerprint(
                    tenant_id, fingerprint, session, for_update=True
                )

                # To prevent rare, but possible race condition
                # For example if older alert failed to process
                # and retried after new one
                if last_alert and last_alert.timestamp.replace(
                    tzinfo=tz.UTC
                ) < alert.timestamp.replace(tzinfo=tz.UTC):

                    logger.info(
                        f"Update last alert for `{fingerprint}`: {last_alert.alert_id} -> {alert.id}",
                        extra={
                            "alert_id": alert.id,
                            "tenant_id": tenant_id,
                            "fingerprint": fingerprint,
                        },
                    )
                    last_alert.timestamp = alert.timestamp
                    last_alert.alert_id = alert.id
                    last_alert.alert_hash = alert.alert_hash
                    session.add(last_alert)

                elif not last_alert:
                    logger.info(f"No last alert for `{fingerprint}`, creating new")
                    last_alert = LastAlert(
                        tenant_id=tenant_id,
                        fingerprint=alert.fingerprint,
                        timestamp=alert.timestamp,
                        first_timestamp=alert.timestamp,
                        alert_id=alert.id,
                        alert_hash=alert.alert_hash,
                    )
                    session.add(last_alert)

                session.commit()
            except OperationalError as ex:
                message = str(ex)
                if "no such savepoint" in message:
                    reason = "No such savepoint"
                elif "Deadlock found" in message:
                    reason = "Deadlock found"
                else:
                    # Unknown database error: do not report it as a success
                    session.rollback()
                    raise

                logger.info(
                    f"{reason} while updating lastalert for `{fingerprint}`, retry #{attempt}"
                )
                session.rollback()
                # `attempt` runs from 0 to max_retries - 1, so give up on the
                # last attempt instead of leaving the loop silently.
                if attempt >= max_retries - 1:
                    raise ex
                continue
            except NoActiveSqlTransaction:
                logger.exception(
                    f"No active sql transaction while updating lastalert for `{fingerprint}`, retry #{attempt}",
                    extra={
                        "alert_id": alert.id,
                        "tenant_id": tenant_id,
                        "fingerprint": fingerprint,
                    },
                )
                continue

            logger.debug(
                f"Successfully updated lastalert for `{fingerprint}`",
                extra={
                    "alert_id": alert.id,
                    "tenant_id": tenant_id,
                    "fingerprint": fingerprint,
                },
            )
            # break the retry loop
            break
def create_tenant(tenant_name: str) -> str:
    """
    Create a tenant with the given name unless one already exists.

    Args:
        tenant_name (str): Name of the tenant to create.

    Returns:
        str: Id of the newly created tenant, or of the existing tenant with
        that name. None is returned when an unexpected (non-integrity) error
        occurs; that error is logged and not propagated.

    Raises:
        IntegrityError: If the insert violates a database constraint, e.g.
            the tenant was created concurrently.
    """
    with Session(engine) as session:
        try:
            # check if the tenant exist:
            logger.info("Checking if tenant exists")
            tenant = session.exec(
                select(Tenant).where(Tenant.name == tenant_name)
            ).first()

            if tenant:
                # Nothing to create: hand back the existing id. Previously
                # this path hit an unbound `tenant_id` and returned None.
                logger.warning("Tenant already exists")
                return tenant.id

            # Do everything related with single tenant creation in here
            tenant_id = str(uuid4())
            logger.info(
                "Creating tenant",
                extra={"tenant_id": tenant_id, "tenant_name": tenant_name},
            )
            session.add(Tenant(id=tenant_id, name=tenant_name))

            # commit the changes
            session.commit()
            logger.info(
                "Tenant created",
                extra={"tenant_id": tenant_id, "tenant_name": tenant_name},
            )
            return tenant_id
        except IntegrityError:
            # Tenant already exists
            logger.exception("Failed to create tenant")
            raise
        except Exception:
            # Best effort: other failures are logged and None is returned
            logger.exception("Failed to create tenant")
            return None
def recover_prev_alert_status(alert: Alert, session: Optional[Session] = None):
    """
    Restore the previous status of the alert.

    ``status`` and ``previous_status`` inside ``alert.event`` are swapped and
    the updated payload is written back with an UPDATE on the alert row.

    When the event carries no ``previous_status`` there is nothing to restore:
    a warning is logged and the alert is left untouched. (The original code
    used ``dict.get`` inside a ``try/except KeyError``, so the warning could
    never fire and ``status`` was overwritten with ``None`` instead.)

    Args:
        alert: Alert whose ``event`` payload holds the two status fields.
        session: Optional session, handed to ``existed_or_new_session``.
    """
    previous_status = alert.event.get("previous_status")
    if previous_status is None:
        logger.warning(f"Alert {alert.id} does not have previous status.")
        return

    with existed_or_new_session(session) as session:
        current_status = alert.event.get("status")
        alert.event["status"] = previous_status
        alert.event["previous_status"] = current_status

        query = (
            update(Alert)
            .where(Alert.id == alert.id)
            .values(event=alert.event)
        )
        session.exec(query)
        session.commit()
The reason to split this code from db.py is that the functions here are invoked from the master process when the application starts, while the functions in db.py are invoked from the worker processes. This is important because if the master process init the engine, it will be forked to the worker processes, and the engine will be shared among all the processes, causing issues with the connections. ** This happens because the engine is not fork-safe, and the connections are not thread-safe. ** The mitigation is to create different engines for each process, and the master process should only be responsible for creating the database and tables, while the worker processes should only be responsible for creating the sessions. """ import hashlib import logging import os import alembic.command import alembic.config from sqlalchemy.exc import IntegrityError from sqlmodel import Session, select from keep.api.core.config import config from keep.api.core.db_utils import create_db_engine from keep.api.models.db.alert import * # pylint: disable=unused-wildcard-import from keep.api.models.db.dashboard import * # pylint: disable=unused-wildcard-import from keep.api.models.db.extraction import * # pylint: disable=unused-wildcard-import from keep.api.models.db.mapping import * # pylint: disable=unused-wildcard-import from keep.api.models.db.preset import * # pylint: disable=unused-wildcard-import from keep.api.models.db.provider import * # pylint: disable=unused-wildcard-import from keep.api.models.db.rule import * # pylint: disable=unused-wildcard-import from keep.api.models.db.statistics import * # pylint: disable=unused-wildcard-import from keep.api.models.db.tenant import * # pylint: disable=unused-wildcard-import from keep.api.models.db.workflow import * # pylint: disable=unused-wildcard-import # This import is required to create the tables from keep.identitymanager.rbac import Admin as AdminRole logger = logging.getLogger(__name__) engine = create_db_engine() 
def try_create_single_tenant(tenant_id: str, create_default_user=True) -> None:
    """
    Creates the single tenant and the default user if they don't exist.

    Additionally provisions the API keys listed in the ``KEEP_DEFAULT_API_KEYS``
    environment variable, a comma separated list of ``name:role:secret``
    entries. Entries that do not match that format are logged and skipped;
    keys whose name is already registered are left untouched.

    Args:
        tenant_id: Id of the tenant to create when it is missing.
        create_default_user: When true and no user exists yet, the default
            admin user is created.

    Raises:
        IntegrityError: Re-raised after logging. Any other exception is logged
            and swallowed, in which case nothing is committed.
    """
    # if Keep is not multitenant, let's import the User table too:
    from keep.api.models.db.user import User  # pylint: disable=import-outside-toplevel

    with Session(engine) as session:
        try:
            # check if the tenant exist:
            tenant = session.exec(select(Tenant).where(Tenant.id == tenant_id)).first()
            if not tenant:
                # Do everything related with single tenant creation in here
                logger.info("Creating single tenant")
                session.add(Tenant(id=tenant_id, name="Single Tenant"))
            else:
                logger.info("Single tenant already exists")

            # now let's create the default user

            # check if at least one user exists:
            user: User | None = session.exec(select(User)).first()
            # if no users exist, let's create the default user
            if not user and create_default_user:
                logger.info("Creating default user")
                default_password = hashlib.sha256(DEFAULT_PASSWORD.encode()).hexdigest()
                default_user = User(
                    username=DEFAULT_USERNAME,
                    password_hash=default_password,
                    role=AdminRole.get_name(),
                )
                session.add(default_user)
                logger.info("Default user created")
            # else, if the user want to force the refresh of the default user password
            elif KEEP_FORCE_RESET_DEFAULT_PASSWORD and user:
                # update the password of the default user
                logger.info("Forcing reset of default user password")
                default_password = hashlib.sha256(DEFAULT_PASSWORD.encode()).hexdigest()
                user.password_hash = default_password
                if user.username != DEFAULT_USERNAME:
                    logger.info(
                        "Default user username updated",
                        extra={
                            "username": user.username,
                            "new_username": DEFAULT_USERNAME,
                        },
                    )
                    user.username = DEFAULT_USERNAME
                logger.info("Default user password updated")

            # provision default api keys
            if os.environ.get("KEEP_DEFAULT_API_KEYS", ""):
                logger.info("Provisioning default api keys")
                from keep.contextmanager.contextmanager import ContextManager
                from keep.secretmanager.secretmanagerfactory import SecretManagerFactory

                default_api_keys = os.environ.get("KEEP_DEFAULT_API_KEYS").split(",")
                for default_api_key in default_api_keys:
                    try:
                        api_key_name, api_key_role, api_key_secret = (
                            default_api_key.strip().split(":")
                        )
                    except ValueError:
                        logger.error(
                            "Invalid format for default api key. Expected format: name:role:secret"
                        )
                        # Skip the malformed entry. Without this the loop went
                        # on with unbound (first entry) or stale (later entry)
                        # name/role/secret values.
                        continue

                    # Create the default api key for the default user
                    api_key = session.exec(
                        select(TenantApiKey).where(
                            TenantApiKey.reference_id == api_key_name
                        )
                    ).first()
                    if api_key:
                        logger.info(f"Api key {api_key_name} already exists")
                        continue

                    logger.info(f"Provisioning api key {api_key_name}")
                    hashed_api_key = hashlib.sha256(
                        api_key_secret.encode("utf-8")
                    ).hexdigest()
                    new_installation_api_key = TenantApiKey(
                        tenant_id=tenant_id,
                        reference_id=api_key_name,
                        key_hash=hashed_api_key,
                        is_system=True,
                        created_by="system",
                        role=api_key_role,
                    )
                    session.add(new_installation_api_key)

                    # write to the secret manager
                    context_manager = ContextManager(tenant_id=tenant_id)
                    secret_manager = SecretManagerFactory.get_secret_manager(
                        context_manager
                    )
                    try:
                        secret_manager.write_secret(
                            secret_name=f"{tenant_id}-{api_key_name}",
                            secret_value=api_key_secret,
                        )
                    # probably 409 if the secret already exists, but we don't want to fail on that
                    except Exception:
                        logger.exception(
                            f"Failed to write secret for api key {api_key_name}"
                        )
                    logger.info(f"Api key {api_key_name} provisioned")
                logger.info("Api keys provisioned")

            # commit the changes
            session.commit()
            logger.info("Single tenant created")
        except IntegrityError:
            # Tenant already exists
            logger.exception("Failed to provision single tenant")
            raise
        except Exception:
            # Deliberately best-effort: startup must not crash on this.
            logger.exception("Failed to create single tenant")
def __get_conn() -> pymysql.connections.Connection:
    """
    Open a database connection through the Cloud SQL connector (used when
    running in Cloud Run).

    The instance name, IP type, user and database name are read from the
    ``DB_CONNECTION_NAME``, ``DB_IP_TYPE``, ``DB_SERVICE_ACCOUNT`` and
    ``DB_NAME`` environment variables, each with a built-in default, and IAM
    authentication is enabled.

    Returns:
        pymysql.connections.Connection: The DB connection.
    """
    instance_name = os.environ.get(
        "DB_CONNECTION_NAME", "keephq-sandbox:us-central1:keep"
    )
    connect_options = {
        "ip_type": os.environ.get("DB_IP_TYPE", "public"),
        "user": os.environ.get("DB_SERVICE_ACCOUNT", "keep-api"),
        "db": os.environ.get("DB_NAME", "keepdb"),
        "enable_iam_auth": True,
    }
    with Connector() as connector:
        return connector.connect(instance_name, "pymysql", **connect_options)


def __get_conn_impersonate() -> pymysql.connections.Connection:
    """
    Open a connection to the remote database using an impersonated service
    account (used when running locally).

    Application default credentials are exchanged for credentials that
    impersonate the account named by ``DB_SERVICE_ACCOUNT``; the refreshed
    access token is then passed as the MySQL password.

    Returns:
        pymysql.connections.Connection: The DB connection.
    """
    from google.auth import (  # pylint: disable=import-outside-toplevel
        default,
        impersonated_credentials,
    )
    from google.auth.transport.requests import (  # pylint: disable=import-outside-toplevel
        Request,
    )

    # Application default credentials are the source for the impersonation.
    source_credentials, _ = default()

    impersonated = impersonated_credentials.Credentials(
        source_credentials=source_credentials,
        target_principal=os.environ.get("DB_SERVICE_ACCOUNT"),
        target_scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )

    # Refreshing populates the impersonated access token.
    impersonated.refresh(Request())
    token = impersonated.token

    # Create a new MySQL connection with the obtained access token
    instance_name = os.environ.get(
        "DB_CONNECTION_NAME", "keephq-sandbox:us-central1:keep"
    )
    database_name = os.environ.get("DB_NAME", "keepdb")
    with Connector() as connector:
        return connector.connect(
            instance_name,
            "pymysql",
            user="keep-api",
            password=token,
            host="127.0.0.1",
            port=3306,
            database=database_name,
        )
def dumps(_json) -> str:
    """
    JSON serializer handed to the engines created below.

    Values the standard encoder cannot handle (datetime objects, typically
    seen with PostgreSQL JSONB fields) are converted with ``str`` instead of
    raising.
    https://stackoverflow.com/questions/36438052/using-a-custom-json-encoder-for-sqlalchemys-postgresql-jsonb-implementation

    Args:
        _json (object): The json object to serialize.

    Returns:
        str: The serialized JSON object.
    """
    serialized = json.dumps(_json, default=str)
    return serialized


def create_db_engine():
    """
    Build the database engine selected by the module-level configuration.

    Order of precedence:
      1. Cloud Run (unless a connection string is forced): MySQL through
         ``__get_conn`` with the configured pool settings.
      2. Connection string equal to ``"impersonate"``: MySQL through
         ``__get_conn_impersonate``.
      3. Any other connection string: pooled engine, retried without pool
         arguments when ``create_engine`` raises ``TypeError``.
      4. No connection string: local ``sqlite:///./keep.db`` file.
    """
    common = {"echo": DB_ECHO, "json_serializer": dumps}

    if RUNNING_IN_CLOUD_RUN and not KEEP_FORCE_CONNECTION_STRING:
        return create_engine(
            "mysql+pymysql://",
            creator=__get_conn,
            pool_size=DB_POOL_SIZE,
            max_overflow=DB_MAX_OVERFLOW,
            **common,
        )

    if DB_CONNECTION_STRING == "impersonate":
        return create_engine(
            "mysql+pymysql://",
            creator=__get_conn_impersonate,
            **common,
        )

    if not DB_CONNECTION_STRING:
        return create_engine(
            "sqlite:///./keep.db",
            connect_args={"check_same_thread": False},
            **common,
        )

    try:
        logger.info(f"Creating a connection pool with size {DB_POOL_SIZE}")
        return create_engine(
            DB_CONNECTION_STRING,
            pool_size=DB_POOL_SIZE,
            max_overflow=DB_MAX_OVERFLOW,
            pool_pre_ping=bool(KEEP_DB_PRE_PING_ENABLED),
            **common,
        )
    # SQLite does not support pool_size
    except TypeError:
        return create_engine(DB_CONNECTION_STRING, **common)
def get_aggreated_field(session: Session, column_name: str, alias: str):
    """
    Build a labelled aggregate expression suited to the session's dialect.

    ``json_arrayagg`` is used for MySQL, ``group_concat`` for SQLite and
    ``array_agg`` for PostgreSQL and every other dialect.

    Args:
        session: Session whose bind decides the dialect.
        column_name: Value passed to the aggregate function.
        alias: Label applied to the resulting expression.

    Raises:
        ValueError: If the session is not bound to a database.
    """
    bind = session.bind
    if bind is None:
        raise ValueError("Session is not bound to a database")

    dialect = bind.dialect.name
    if dialect == "mysql":
        aggregate = func.json_arrayagg
    elif dialect == "sqlite":
        aggregate = func.group_concat
    else:
        # postgresql and any unknown dialect share the same aggregate
        aggregate = func.array_agg

    return aggregate(column_name).label(alias)
def custom_serialize(obj: Any) -> Any:
    """
    Recursively convert a value into something serializable.

    Dicts, lists and tuples are rebuilt with their members converted. Pydantic
    models (like AlertDto) are turned into dicts via ``.dict()``, enum members
    into their ``value``. Everything else goes through ``jsonable_encoder``,
    falling back to ``str(obj)`` when the encoder raises.
    """
    if isinstance(obj, dict):
        return {key: custom_serialize(value) for key, value in obj.items()}

    if isinstance(obj, list):
        return [custom_serialize(element) for element in obj]

    if isinstance(obj, tuple):
        return tuple(custom_serialize(element) for element in obj)

    if isinstance(obj, BaseModel):
        # For Pydantic models like AlertDto
        return obj.dict()

    if isinstance(obj, Enum):
        # For enum values
        return obj.value

    # Anything else: let jsonable_encoder try, stringify as a last resort.
    try:
        encoded = jsonable_encoder(obj)
    except Exception:
        return str(obj)
    return encoded
like :name_3)) or (name = :name_4)", "params": { "name_1": "%NetworkLatencyHigh%", "name_2": "HighCPUUsage", "name_3": "%NetworkLatencyIsHigh%", "name_4": "Failed to load product catalog", }, }, "groupDescription": "This rule groups alerts from multiple sources.", "ruleName": "Application issue caused by DB load", "celQuery": '(name.contains("NetworkLatencyHigh")) || (name == "HighCPUUsage") || (name.contains("NetworkLatencyIsHigh")) || (name == "Failed to load product catalog")', "timeframeInSeconds": 86400, "timeUnit": "hours", "groupingCriteria": [], "requireApprove": False, "resolveOn": "never", }, ] services_to_create = [ TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="api", display_name="API Service", environment="prod", description="The main API service", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.1", category="Python", manufacturer="", dependencies={ "db": "SQL", "queue": "AMQP", }, application_ids=[], updated_at="2024-11-18T09:23:46", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="ui", display_name="Platform", environment="prod", description="The user interface (aka Platform)", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.2", category="nextjs", manufacturer="", dependencies={ "api": "HTTP/S", }, application_ids=[], updated_at="2024-11-18T09:29:25", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="db", display_name="DB", environment="prod", description="Production Database", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.3", category="postgres", manufacturer="", dependencies={}, application_ids=[], updated_at="2024-11-18T09:30:44", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="queue", 
display_name="Kafka", environment="prod", description="Production Queue", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.4", category="Kafka", dependencies={ "processor": "AMQP", }, application_ids=[], updated_at="2024-11-18T09:31:31", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="processor", display_name="Processor", environment="prod", description="Processing Service", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.5", category="go", dependencies={ "storage": "HTTP/S", }, application_ids=[], updated_at="2024-11-18T10:02:20", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="backoffice", display_name="Backoffice", environment="prod", description="Backoffice UI to control configuration", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="172.1.1.0", category="nextjs", dependencies={ "api": "HTTP/S", }, application_ids=[], updated_at="2024-11-18T10:11:31", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", repository="keephq/keep", tags=[], service="storage", display_name="Storage", environment="prod", description="Storage Service", team="keep", email="support@keephq.dev", slack="https://slack.keephq.dev", ip_address="10.0.0.8", category="python", dependencies={}, application_ids=[], updated_at="2024-11-18T10:13:56", ), ] application_to_create = { "name": "Main App", "description": "It is the most critical business process ever imaginable.", "services": [ {"name": "API Service", "service": "api"}, {"name": "DB", "service": "db"}, {"name": "Kafka", "service": "queue"}, {"name": "Processor", "service": "processor"}, {"name": "Storage", "service": "storage"}, ], } def get_or_create_topology(keep_api_key, keep_api_url): services_existing = requests.get( f"{keep_api_url}/topology", headers={"x-api-key": keep_api_key}, ) 
services_existing.raise_for_status() services_existing = services_existing.json() # Creating services if len(services_existing) == 0: process_topology( SINGLE_TENANT_UUID, services_to_create, "Prod-Datadog", "datadog" ) # Create application applications_existing = requests.get( f"{keep_api_url}/topology/applications", headers={"x-api-key": keep_api_key}, ) applications_existing.raise_for_status() applications_existing = applications_existing.json() if len(applications_existing) == 0: # Pull services again to get their ids services_existing = requests.get( f"{keep_api_url}/topology", headers={"x-api-key": keep_api_key}, ) services_existing.raise_for_status() services_existing = services_existing.json() # Update application_to_create with existing services ids for service in application_to_create["services"]: for existing_service in services_existing: if service["name"] == existing_service["display_name"]: service["id"] = existing_service["id"] # Check if any service does not have an id for service in application_to_create["services"]: if "id" not in service: logger.error( f"Service {service['name']} does not have an id. Application creation failed." 
def perform_demo_ai(keep_api_key, keep_api_url):
    """
    Drive the "AI correlation" part of the demo against the Keep API.

    Steps, all done with blocking HTTP calls:
      1. Find the manually created incident named "GPU Cluster issue", or
         create it.
      2. In roughly 90% of the calls, publish one random fake GPU alert.
      3. While the incident has fewer than 20 alerts, attach up to 10 of the
         existing fake GPU alerts to it.

    Args:
        keep_api_key: API key sent in the ``x-api-key`` header.
        keep_api_url: Base URL of the Keep API.

    Raises:
        requests.HTTPError: From ``raise_for_status`` on any failed call.
    """
    # Get or create manual Incident
    incidents_existing = requests.get(
        f"{keep_api_url}/incidents",
        headers={"x-api-key": keep_api_key},
    )
    incidents_existing.raise_for_status()
    incidents_existing = incidents_existing.json()["items"]

    MANUAL_INCIDENT_NAME = "GPU Cluster issue"

    incident_exists = None

    # Create incident if it doesn't exist
    for incident in incidents_existing:
        if incident["user_generated_name"] == MANUAL_INCIDENT_NAME:
            incident_exists = incident

    if incident_exists is None:
        response = requests.post(
            f"{keep_api_url}/incidents",
            headers={"x-api-key": keep_api_key},
            json={
                "user_generated_name": MANUAL_INCIDENT_NAME,
                "user_summary": "While two other incidents are created because of correlation rules, this incident is created manually and only a few alerts are added to it. AI will correlated it with the rest of alerts automatically.",
                "severity": "critical",
                "status": "open",
                "environment": "prod",
                "service": "api",
                "application": "Main App",
                "description": "This is a manual incident.",
            },
        )
        response.raise_for_status()
        # Keep the created incident: its id is needed further down. Before,
        # `incident_exists` stayed None here and the later subscript raised.
        # NOTE(review): assumes the create endpoint echoes the incident
        # including its "id" - confirm against the API.
        incident_exists = response.json()

    random_number = random.randint(1, 100)
    if random_number > 90:
        return

    # Publish alert
    FAKE_ALERT_NAMES = [
        "HighGPUConsumption",
        "NotMuchGPUMemoryLeft",
        "GPUServiceError",
    ]

    name = random.choice(FAKE_ALERT_NAMES)

    DESCRIPTIONS = {
        "HighGPUConsumption": "GPU usage is high",
        "NotMuchGPUMemoryLeft": "GPU memory latency is high",
        "GPUServiceError": "GPU service is probably unreachable",
    }

    response = requests.post(
        f"{keep_api_url}/alerts/event",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "X-API-KEY": keep_api_key,
        },
        json={
            "name": name,
            "source": ["prometheus"],
            "description": DESCRIPTIONS[name],
            "fingerprint": str(uuid4()),
        },
    )
    response.raise_for_status()

    incident_id = incident_exists.get("id") if isinstance(incident_exists, dict) else None
    if not incident_id:
        # Nothing to correlate against; the incident will be found in the
        # listing on the next call.
        logger.warning("Manual demo incident has no id, skipping correlation")
        return

    # If incident has not many alerts, correlate
    alerts_in_incident = requests.get(
        f"{keep_api_url}/incidents/{incident_id}/alerts",
        headers={"x-api-key": keep_api_key},
    )
    alerts_in_incident.raise_for_status()
    alerts_in_incident = alerts_in_incident.json()

    if len(alerts_in_incident["items"]) < 20:
        alerts_existing = requests.get(
            f"{keep_api_url}/alerts",
            headers={"x-api-key": keep_api_key},
        )
        alerts_existing.raise_for_status()
        alerts_existing = alerts_existing.json()

        fingerprints_to_add = [
            alert["fingerprint"]
            for alert in alerts_existing
            if alert["name"] in FAKE_ALERT_NAMES
        ]

        if len(fingerprints_to_add) > 0:
            # Attach at most 10 alerts per call.
            fingerprints_to_add = fingerprints_to_add[:10]

            response = requests.post(
                f"{keep_api_url}/incidents/{incident_id}/alerts",
                headers={"x-api-key": keep_api_key},
                json=fingerprints_to_add,
            )
            response.raise_for_status()
def simulate_alerts(*args, **kwargs):
    """
    Run the alert simulation on a fresh event loop; blocks in ``run_forever``.

    Two supervised workers are scheduled through ``safe_run_async_worker``:
    the sender (``simulate_alerts_worker`` with worker id 0, no rate limit and
    the ``keep_api_key`` keyword argument) and the producer
    (``simulate_alerts_async``, which receives all positional and keyword
    arguments unchanged).
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    sender = safe_run_async_worker(
        simulate_alerts_worker,
        worker_id=0,
        keep_api_key=kwargs.get("keep_api_key"),
        rps=0,
    )
    producer = safe_run_async_worker(simulate_alerts_async, *args, **kwargs)

    # Schedule in the same order as before: sender first, then producer.
    for coroutine in (sender, producer):
        event_loop.create_task(coroutine)

    event_loop.run_forever()
existing_providers_to_their_ids = {} for existing_provider in existing_installed_providers: if existing_provider["type"] in providers: existing_providers_to_their_ids[existing_provider["type"]] = ( existing_provider["id"] ) logger.info( f"Existing installed existing_providers_to_their_ids: {existing_providers_to_their_ids}" ) if demo_correlation_rules: logger.info("Creating correlation rules...") get_or_create_correlation_rules(keep_api_key, keep_api_url) logger.info("Correlation rules created.") if demo_topology: logger.info("Creating topology...") get_or_create_topology(keep_api_key, keep_api_url) logger.info("Topology created.") shoot = 1 while True: if count is not None: count -= 1 if count < 0: break try: logger.info("Looping to send alerts...") if demo_ai: perform_demo_ai(keep_api_key, keep_api_url) # If we want to make stress-testing, we want to prepare more data for faster requesting in workers if target_rps: shoot = target_rps * 100 for _ in range(shoot): send_alert_url_params = {} # choose provider based on weights provider_type = random.choices( providers, weights=normalized_weights, k=1 )[0] send_alert_url = "{}/alerts/event/{}".format( keep_api_url, provider_type ) if provider_type in existing_providers_to_their_ids: send_alert_url_params["provider_id"] = ( existing_providers_to_their_ids[provider_type] ) logger.info( f"Provider type: {provider_type}, send_alert_url_params now are: {send_alert_url_params}" ) provider = provider_classes[provider_type] alert = provider.simulate_alert() if provider_type in providers_to_randomize_fingerprint_for: send_alert_url_params["fingerprint"] = str(uuid4()) # Determine number of times to send the same alert num_iterations = 1 if GENERATE_DEDUPLICATIONS: num_iterations = random.randint(1, 3) env = random.choice(["production", "staging", "development"]) if "provider_id" not in send_alert_url_params: send_alert_url_params["provider_id"] = f"{provider_type}-{env}" else: alert["environment"] = random.choice( ["prod-01", 
"prod-02", "prod-03"] ) for _ in range(num_iterations): prepared_request = PreparedRequest() prepared_request.prepare_url(send_alert_url, send_alert_url_params) await REQUESTS_QUEUE.put((prepared_request.url, alert)) if not target_rps: await asyncio.sleep(sleep_interval) # Wait until almost prepopulated data was consumed while not REQUESTS_QUEUE.empty(): await asyncio.sleep(sleep_interval) except Exception as e: logger.exception( "Error in simulate_alerts", extra={"exception_str": str(e)} ) logger.info( "Sleeping for {} seconds before next iteration".format(sleep_interval) ) def launch_demo_mode_thread( keep_api_url=None, keep_api_key=None ) -> threading.Thread | None: if not KEEP_LIVE_DEMO_MODE: logger.info("Not launching the demo mode.") return logger.info("Launching demo mode.") if keep_api_key is None: with get_session_sync() as session: keep_api_key = get_or_create_api_key( session=session, tenant_id=SINGLE_TENANT_UUID, created_by="system", unique_api_key_id="simulate_alerts", system_description="Simulate Alerts API key", ) sleep_interval = 5 thread = threading.Thread( target=simulate_alerts, kwargs={ "keep_api_key": keep_api_key, "keep_api_url": keep_api_url, "sleep_interval": sleep_interval, "demo_correlation_rules": True, "demo_topology": True, "clean_old_incidents": True, "demo_ai": True, }, ) thread.daemon = True thread.start() logger.info("Demo mode launched.") return thread async def simulate_alerts_worker(worker_id, keep_api_key, rps=1): headers = {"x-api-key": keep_api_key, "Content-type": "application/json"} async with aiohttp.ClientSession() as session: total_start = time.time() total_requests = 0 while True: start = time.time() url, alert = await REQUESTS_QUEUE.get() async with session.post(url, json=alert, headers=headers) as response: response_time = time.time() - start total_requests += 1 if not response.ok: logger.error("Failed to send alert: {}".format(response.text)) else: logger.info( f"Alert sent successfully in {response_time:.3f} seconds" 
async def extract_generic_body(request: Request) -> dict | bytes | FormData:
    """
    Extracts the body of the request based on the content type.

    Form submissions (url-encoded or multipart) are returned as parsed form
    data. Anything else is first parsed as JSON; if that fails, the raw body
    bytes are returned instead.

    Args:
        request (Request): The request object.

    Returns:
        dict | bytes | FormData: The body of the request.
    """
    content_type = request.headers.get("Content-Type")

    # Media types are case-insensitive and may carry parameters
    # (e.g. "application/x-www-form-urlencoded; charset=utf-8"), so only the
    # bare, lower-cased type is compared. An exact string comparison would
    # send such requests down the JSON/raw-bytes path.
    media_type = ""
    if isinstance(content_type, str):
        media_type = content_type.split(";", 1)[0].strip().lower()

    if media_type == "application/x-www-form-urlencoded" or media_type.startswith(
        "multipart/form-data"
    ):
        return await request.form()

    try:
        logger.debug("Parsing body as json")
        body = await request.json()
        logger.debug("Parsed body as json")
        return body
    except Exception:
        # Best effort: a body that is not valid JSON is handed back untouched.
        logger.debug("Failed to parse body as json, returning raw body")
        return await request.body()
class ElasticClient:
    """
    Wrapper around the Elasticsearch client for indexing and querying alerts.

    Whether the client is active is decided in __init__ from ELASTIC_ENABLED
    and, for tenants other than the single-tenant id, from the tenant's
    "search_mode" configuration. When `self.enabled` is False every public
    method returns early without touching Elasticsearch.
    """

    def __init__(
        self,
        tenant_id,
        api_key=None,
        hosts: list[str] = None,
        basic_auth=None,
        **kwargs,
    ):
        """
        Args:
            tenant_id: Tenant the client works for; also selects the index name.
            api_key: Elastic API key; falls back to ELASTIC_API_KEY.
            hosts: Elastic hosts; falls back to the comma separated ELASTIC_HOSTS.
            basic_auth: (user, password) pair; falls back to
                ELASTIC_USER / ELASTIC_PASSWORD.
            **kwargs: Passed through to the Elasticsearch constructor.

        Raises:
            ValueError: If Elastic is enabled but hosts (or, for the single
                tenant, ELASTIC_INDEX_SUFFIX) are not configured.
        """
        self.tenant_id = tenant_id
        self.tenant_configuration = TenantConfiguration()
        self.logger = logging.getLogger(__name__)

        enabled = os.environ.get("ELASTIC_ENABLED", "false").lower() == "true"

        # single tenant deployment: the env flag alone decides
        if tenant_id == SINGLE_TENANT_UUID:
            self.enabled = enabled
        # multi tenant deployment with elastic globally off
        elif not enabled:
            self.enabled = False
        # else, per-tenant configuration
        else:
            # if elastic is disabled for the tenant, return
            if not self.tenant_configuration.get_configuration(
                tenant_id, "search_mode"
            ):
                self.enabled = False
                self.logger.debug(f"Elastic is disabled for tenant {tenant_id}")
                return
            else:
                self.enabled = True

        # if elastic is disabled, return
        if not self.enabled:
            return

        self.refresh_strategy = os.environ.get("ELASTIC_REFRESH_STRATEGY", "true")
        self.api_key = api_key or os.environ.get("ELASTIC_API_KEY")

        # ELASTIC_HOSTS may be unset: calling .split() on None raised
        # AttributeError and hid the configuration error raised below.
        env_hosts = os.environ.get("ELASTIC_HOSTS")
        self.hosts = hosts or (env_hosts.split(",") if env_hosts else None)
        self.verify_certs = (
            os.environ.get("ELASTIC_VERIFY_CERTS", "true").lower() == "true"
        )

        basic_auth = basic_auth or (
            os.environ.get("ELASTIC_USER"),
            os.environ.get("ELASTIC_PASSWORD"),
        )

        # NOTE(review): basic_auth is always a non-empty tuple at this point,
        # so the credentials half of this check never fires; only missing
        # hosts are effectively detected. Left as-is to avoid rejecting
        # deployments that connect without credentials - confirm intent.
        if not (self.api_key or basic_auth) or not self.hosts:
            raise ValueError(
                "No Elastic configuration found although Elastic is enabled"
            )

        # single tenant id should have an index suffix
        if tenant_id == SINGLE_TENANT_UUID and not os.environ.get(
            "ELASTIC_INDEX_SUFFIX"
        ):
            raise ValueError(
                "No Elastic index suffix found although Elastic is enabled for single tenant"
            )

        if any(basic_auth):
            self.logger.debug("Using basic auth for Elastic")
            self._client = Elasticsearch(
                basic_auth=basic_auth,
                hosts=self.hosts,
                verify_certs=self.verify_certs,
                **kwargs,
            )
        else:
            self.logger.debug("Using API key for Elastic")
            self._client = Elasticsearch(
                api_key=self.api_key,
                hosts=self.hosts,
                verify_certs=self.verify_certs,
                **kwargs,
            )

    @property
    def alerts_index(self):
        """Index name: keep-alerts-<ELASTIC_INDEX_SUFFIX> for the single tenant, keep-alerts-<tenant_id> otherwise."""
        if self.tenant_id == SINGLE_TENANT_UUID:
            suffix = os.environ.get("ELASTIC_INDEX_SUFFIX")
            return f"keep-alerts-{suffix}"
        else:
            return f"keep-alerts-{self.tenant_id}"

    def _construct_alert_dto_from_results(self, results):
        """Build AlertDto objects from a search response and apply stored enrichments by fingerprint."""
        if not results:
            return []
        alert_dtos = []

        fingerprints = [
            result["_source"]["fingerprint"] for result in results["hits"]["hits"]
        ]
        enrichments = get_enrichments(self.tenant_id, fingerprints)
        enrichments_by_fingerprint = {
            enrichment.alert_fingerprint: enrichment.enrichments
            for enrichment in enrichments
        }
        for result in results["hits"]["hits"]:
            alert = result["_source"]
            alert_dto = AlertDto(**alert)
            if alert_dto.fingerprint in enrichments_by_fingerprint:
                parse_and_enrich_deleted_and_assignees(
                    alert_dto, enrichments_by_fingerprint[alert_dto.fingerprint]
                )
            alert_dtos.append(alert_dto)

        return alert_dtos

    def run_query(self, query: str, limit: int = 1000):
        """
        Run a query through the Elasticsearch SQL endpoint.

        Returns the raw response, an empty list when the index does not exist
        yet, or None when Elastic is disabled.

        Raises:
            Exception: If the query fails for any other reason.
        """
        if not self.enabled:
            return

        # preprocess severity
        query = preprocess_cel_expression(query)

        try:
            # TODO - handle source (array)
            # TODO - https://www.elastic.co/guide/en/elasticsearch/reference/current/sql-limitations.html#_array_type_of_fields
            results = self._client.sql.query(
                body={
                    "query": query,
                    "field_multi_value_leniency": True,
                    "fetch_size": limit,
                }
            )
            return results
        except BadRequestError as e:
            # means no index. if no alert was indexed, the index does not exist
            if "Unknown index" in str(e):
                self.logger.warning("Index does not exist yet.")
                return []
            else:
                self.logger.exception(
                    f"Failed to run query in Elastic: {e}",
                    extra={
                        "tenant_id": self.tenant_id,
                    },
                )
                raise Exception(f"Failed to run query in Elastic: {e}")
        except Exception as e:
            self.logger.exception(
                f"Failed to run query in Elastic: {e}",
                extra={
                    "tenant_id": self.tenant_id,
                },
            )
            raise Exception(f"Failed to run query in Elastic: {e}")

    def search_alerts(self, query: str, limit: int) -> list[AlertDto]:
        """
        Search alerts by translating the SQL query to DSL and running it on the alerts index.

        Returns an empty list when Elastic is disabled or the index does not exist yet.

        Raises:
            Exception: If translation or search fails for any other reason.
        """
        if not self.enabled:
            return []

        try:
            # Shahar: due to limitation in Elasticsearch array fields, we translate the SQL to DSL
            #         this is not 100% efficient since there are two requests (translate + query) instead of one but this could be improved with
            #         either:
            #           1. get the ES query from the client (react query builder support it)
            #           2. use the translate when keeping the preset in the db since it does not change (only for presets, not general queries)
            #           3. wait for ES to support array fields in SQL
            # TODO - https://www.elastic.co/guide/en/elasticsearch/reference/current/sql-limitations.html#_array_type_of_fields
            # preprocess severity
            query = preprocess_cel_expression(query)
            dsl_query = self._client.sql.translate(
                body={"query": query, "fetch_size": limit}
            )
            # get all fields
            dsl_query = dict(dsl_query)
            dsl_query["_source"] = True
            dsl_query["fields"] = ["*"]

            raw_alerts = self._client.search(index=self.alerts_index, body=dsl_query)
            alerts_dtos = self._construct_alert_dto_from_results(raw_alerts)
            return alerts_dtos
        except BadRequestError as e:
            # means no index. if no alert was indexed, the index does not exist
            if "Unknown index" in str(e):
                self.logger.warning("Index does not exist yet.")
                return []
            else:
                self.logger.error(f"Failed to run query in Elastic: {e}")
                raise Exception(f"Failed to run query in Elastic: {e}")
        except Exception as e:
            self.logger.error(f"Failed to search alerts in Elastic: {e}")
            raise Exception(f"Failed to search alerts in Elastic: {e}")

    def index_alert(self, alert: AlertDto):
        """
        Index (or overwrite) a single alert, using its fingerprint as the document id.

        Raises:
            Exception: If indexing fails.
        """
        if not self.enabled:
            return

        try:
            # query
            alert_dict = alert.dict()
            alert_dict["dismissed"] = bool(alert_dict["dismissed"])
            # change severity to number so we can sort by it
            alert_dict["severity"] = AlertSeverity(alert.severity.lower()).order
            self._client.index(
                index=self.alerts_index,
                body=alert_dict,
                id=alert.fingerprint,  # we want to update the alert if it already exists so that elastic will have the latest version
                refresh=self.refresh_strategy,
            )
        # TODO: retry/pubsub
        except ApiError as e:
            self.logger.error(f"Failed to index alert to Elastic: {e} {e.errors}")
            raise Exception(f"Failed to index alert to Elastic: {e} {e.errors}")
        except Exception as e:
            self.logger.error(f"Failed to index alert to Elastic: {e}")
            raise Exception(f"Failed to index alert to Elastic: {e}")

    def index_alerts(self, alerts: list[AlertDto]):
        """
        Bulk index alerts, using each alert's fingerprint as the document id.

        Note: an alert's `incident_dto` entries are replaced in place by their
        JSON representation before the alert is serialized.

        Raises:
            Exception: If the bulk request fails.
        """
        if not self.enabled:
            return

        actions = []
        for alert in alerts:
            # The attribute may exist but be None/empty; iterating None raised
            # TypeError before anything was indexed.
            incident_dto = getattr(alert, "incident_dto", None)
            if incident_dto:
                alert.incident_dto = [incident.json() for incident in incident_dto]
            action = {
                "_index": self.alerts_index,
                "_id": alert.fingerprint,  # use fingerprint as the document ID
                "_source": alert.dict(),
            }
            # change severity to number so we can sort by it
            action["_source"]["severity"] = AlertSeverity(
                action["_source"]["severity"].lower()
            ).order
            actions.append(action)

        try:
            success, failed = bulk(self._client, actions, refresh=self.refresh_strategy)
            self.logger.info(
                f"Successfully indexed {success} alerts. Failed to index {failed} alerts."
            )
        except BulkIndexError as e:
            self.logger.error(f"Failed to index alerts to Elastic: {e} {e.errors}")
            raise Exception(f"Failed to index alerts to Elastic: {e} {e.errors}")
        except ApiError as e:
            self.logger.error(f"Failed to index alerts to Elastic: {e} {e.errors}")
            raise Exception(f"Failed to index alerts to Elastic: {e} {e.errors}")
        except Exception as e:
            self.logger.exception(f"Failed to index alerts to Elastic: {e}")
            raise Exception(f"Failed to index alerts to Elastic: {e}")

    def enrich_alert(self, alert_fingerprint: str, alert_enrichments: dict):
        """Fetch the indexed alert by fingerprint, merge the enrichments into it and index it again."""
        if not self.enabled:
            return

        self.logger.debug(f"Enriching alert {alert_fingerprint}")
        # get the alert, enrich it and index it
        alert = self._client.get(index=self.alerts_index, id=alert_fingerprint)
        if not alert:
            self.logger.error(f"Alert with fingerprint {alert_fingerprint} not found")
            return

        # enrich the alert
        alert["_source"].update(alert_enrichments)
        enriched_alert = AlertDto(**alert["_source"])
        # index the enriched alert
        self.index_alert(enriched_alert)
        self.logger.debug(f"Alert {alert_fingerprint} enriched and indexed")

    def drop_index(self):
        """Delete the tenant's alerts index."""
        if not self.enabled:
            return

        self._client.indices.delete(index=self.alerts_index)
def map_facet_option_value(value, data_type: DataType):
    """
    Maps the value to the appropriate data type.

    Values that cannot be converted to the requested numeric type are
    returned unchanged instead of raising.

    Args:
        value: The value to be mapped.
        data_type: The data type to map the value to.

    Returns:
        The mapped value.
    """
    if data_type == DataType.INTEGER:
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            # TypeError: value is None or another non-numeric object.
            # OverflowError: value is an infinite float.
            return value
    elif data_type == DataType.FLOAT:
        try:
            return float(value)
        except (ValueError, TypeError):
            return value
    elif data_type == DataType.BOOLEAN:
        # Compare on the lower-cased text form so native True / 1 are treated
        # like the strings "true" / "1". Presumably some drivers return native
        # booleans for non-JSON columns - verify against the query builders.
        return str(value).lower() in ("true", "1")
    else:
        return value
""" invalid_facets = [] valid_facets = [] for facet in facets: if properties_metadata.get_property_metadata_for_str(facet.property_path): valid_facets.append(facet) continue invalid_facets.append(facet) result_dict: dict[str, list[FacetOptionDto]] = {} if valid_facets: with Session(engine) as session: try: db_query = get_facets_query_builder( properties_metadata ).build_facets_data_query( base_query_factory=base_query_factory, entity_id_column=entity_id_column, facets=valid_facets, facet_options_query=facet_options_query, ) data = session.exec(db_query).all() except OperationalError as e: logger.warning( f"""Failed to execute query for facet options. Facet options: {json.dumps(facet_options_query.dict())} Error: {e} """ ) return {facet.id: [] for facet in facets} grouped_by_id_dict = {} for facet_data in data: if facet_data.facet_id not in grouped_by_id_dict: grouped_by_id_dict[facet_data.facet_id] = [] # This is to limit the number of options per facet # It's done mostly for sqlite, because in sqlite we can't use limit in the subquery if ( engine.dialect.name == "sqlite" and len(grouped_by_id_dict[facet_data.facet_id]) >= OPTIONS_PER_FACET ): continue grouped_by_id_dict[facet_data.facet_id].append(facet_data) for facet in facets: facet_key = get_facet_key( facet.property_path, facet_options_query.cel, facet_options_query.facet_queries[facet.id], ) property_mapping = properties_metadata.get_property_metadata_for_str( facet.property_path ) result_dict.setdefault(facet.id, []) if facet_key in grouped_by_id_dict: result_dict[facet.id] = [ FacetOptionDto( display_name=str(facet_value), value=map_facet_option_value( facet_value, property_mapping.data_type ), matches_count=0 if matches_count is None else matches_count, ) for facet_id, facet_value, matches_count in grouped_by_id_dict[ facet_key ] ] if property_mapping is None: result_dict[facet.id] = [] continue if property_mapping.enum_values: if facet.id in result_dict: values_with_zero_matches = [ enum_value for 
def delete_facet(tenant_id: str, entity_type: str, facet_id: str) -> bool:
    """
    Deletes a facet from the database for a given tenant.

    Args:
        tenant_id (str): The ID of the tenant.
        entity_type (str): The entity type the facet belongs to.
        facet_id (str): The ID of the facet to be deleted (UUID string).

    Returns:
        bool: True if the facet was successfully deleted, False otherwise.
    """
    with Session(engine) as session:
        row = session.exec(
            select(Facet)
            .where(Facet.tenant_id == tenant_id)
            .where(Facet.id == UUID(facet_id))
            .where(Facet.entity_type == entity_type)
        ).first()

        # first() yields None when nothing matches. Indexing that result
        # directly raised TypeError, so the False return was unreachable.
        if row is None:
            return False

        facet = row[0]  # result returned as tuple
        if facet:
            session.delete(facet)
            session.commit()
            return True
        return False
facets handlers. """ def __init__( self, properties_metadata: PropertiesMetadata, cel_to_sql: BaseCelToSqlProvider ): self.properties_metadata = properties_metadata self.cel_to_sql = cel_to_sql def build_facets_data_query( self, base_query_factory: lambda facet_property_path, involved_fields, select_statement: Any, entity_id_column: any, facets: list[FacetDto], facet_options_query: FacetOptionsQueryDto, ): """ Builds a SQL query to extract and count facet data based on the provided parameters. Args: dialect (str): The SQL dialect to use (e.g., 'postgresql', 'mysql'). base_query: The base SQLAlchemy query object to build upon. facets (list[FacetDto]): A list of facet data transfer objects specifying the facets to be queried. properties_metadata (PropertiesMetadata): Metadata about the properties to be used in the query. cel (str): A CEL (Common Expression Language) string to filter the base query. Returns: sqlalchemy.sql.Selectable: A SQLAlchemy selectable object representing the constructed query. 
""" # Main Query: JSON Extraction and Counting union_queries = [] # prevents duplicate queries for the same facet property path and its cel combination visited_facets = set() for facet in facets: facet_cel = facet_options_query.facet_queries.get(facet.id, "") facet_key = get_facet_key( facet_property_path=facet.property_path, filter_cel=facet_options_query.cel, facet_cel=facet_cel, ) if facet_key in visited_facets: continue cel_queries = [ facet_options_query.cel, facet_options_query.facet_queries.get(facet.id, None), ] final_cel = " && ".join(filter(lambda cel: cel, cel_queries)) facet_sub_query = self.build_facet_subquery( facet_key=facet_key, entity_id_column=entity_id_column, base_query_factory=base_query_factory, facet_property_path=facet.property_path, facet_cel=final_cel, ) union_queries.append(facet_sub_query) visited_facets.add(facet_key) query = None if len(union_queries) > 1: query = union_queries[0].union_all(*union_queries[1:]) else: query = union_queries[0] return query def build_facet_select(self, entity_id_column, facet_key: str, facet_property_path): property_metadata = self.properties_metadata.get_property_metadata_for_str( facet_property_path ) return [ literal(facet_key).label("facet_id"), self._get_select_for_column(property_metadata).label("facet_value"), func.count(func.distinct(entity_id_column)).label("matches_count"), ] def build_facet_subquery( self, facet_key: str, entity_id_column, base_query_factory: lambda facet_property_path, involved_fields, select_statement: Any, facet_property_path: str, facet_cel: str, ): metadata = self.properties_metadata.get_property_metadata_for_str( facet_property_path ) involved_fields = [] sql_filter = None if facet_cel: cel_to_sql_result = self.cel_to_sql.convert_to_sql_str_v2(facet_cel) involved_fields = cel_to_sql_result.involved_fields sql_filter = cel_to_sql_result.sql base_query = base_query_factory( facet_property_path, involved_fields, self.build_facet_select( entity_id_column=entity_id_column, 
facet_property_path=facet_property_path, facet_key=facet_key, ), ) if sql_filter: base_query = base_query.filter(text(sql_filter)) if metadata.data_type == DataType.ARRAY: facet_source_subquery = self._build_facet_subquery_for_json_array( base_query, metadata, ) else: facet_source_subquery = base_query if isinstance(facet_source_subquery, CTE): return select( literal_column("facet_id"), literal_column("facet_value"), literal_column("matches_count"), ).select_from(facet_source_subquery) return facet_source_subquery.group_by( literal_column("facet_id"), literal_column("facet_value") ) def _get_select_for_column(self, property_metadata: PropertyMetadataInfo): coalecense_args = [] should_cast = False for field_mapping in property_metadata.field_mappings: if isinstance(field_mapping, JsonFieldMapping): should_cast = True coalecense_args.append(self._handle_json_mapping(field_mapping)) elif isinstance(field_mapping, SimpleFieldMapping): coalecense_args.append(self._handle_simple_mapping(field_mapping)) select_expression = self._coalesce(coalecense_args) if should_cast: return self._cast_column(select_expression, property_metadata.data_type) return select_expression def _cast_column( self, column, data_type: DataType, ): return column def _build_facet_subquery_for_json_array( self, base_query, metadata: PropertyMetadataInfo, ): raise NotImplementedError("This method should be implemented in subclasses.") def _handle_simple_mapping(self, field_mapping: SimpleFieldMapping): return literal_column(field_mapping.map_to) def _coalesce(self, args: list): if len(args) == 1: return args[0] return func.coalesce(*args) def _handle_json_mapping(self, field_mapping: JsonFieldMapping): raise NotImplementedError("This method should be implemented in subclasses.") ================================================ FILE: keep/api/core/facets_query_builder/get_facets_query_builder.py ================================================ from keep.api.core.cel_to_sql.properties_metadata import 
def get_facets_query_builder_for_dialect(
    dialect_name: str,
    properties_metadata: PropertiesMetadata,
) -> BaseFacetsQueryBuilder:
    """
    Return the facets query builder matching the given SQL dialect.

    Args:
        dialect_name (str): One of "sqlite", "mysql" or "postgresql".
        properties_metadata (PropertiesMetadata): Metadata handed to the
            builder and to the CEL-to-SQL provider it is constructed with.

    Returns:
        BaseFacetsQueryBuilder: The dialect specific builder instance.

    Raises:
        ValueError: If the dialect is not supported.
    """
    builders = {
        "sqlite": SqliteFacetsHandler,
        "mysql": MySqlFacetsQueryBuilder,
        "postgresql": PostgreSqlFacetsQueryBuilder,
    }
    builder_cls = builders.get(dialect_name)
    if builder_cls is None:
        # Report the dialect that was actually requested, not the engine's.
        raise ValueError(f"Unsupported dialect: {dialect_name}")

    return builder_cls(
        properties_metadata, get_cel_to_sql_provider(properties_metadata)
    )
facet_property_path, involved_fields, select_statement: Any, facet_property_path: str, facet_cel: str, ): return ( super() .build_facet_subquery( facet_key=facet_key, entity_id_column=entity_id_column, base_query_factory=base_query_factory, facet_property_path=facet_property_path, facet_cel=facet_cel, ) .limit(50) # Limit number of returned options per facet by 50 ) def _cast_column(self, column, data_type: DataType): if data_type == DataType.BOOLEAN: return case( (func.lower(column) == "true", literal("true")), (func.lower(column) == "false", literal("false")), (cast(column, Integer) >= 1, literal("true")), (column != "", literal("true")), else_=literal("false"), ) return super()._cast_column(column, data_type) def _get_select_for_column(self, property_metadata: PropertyMetadataInfo): if property_metadata.data_type == DataType.ARRAY: return literal_column(property_metadata.field_name + "_array").collate( "utf8mb4_0900_ai_ci" ) return super()._get_select_for_column(property_metadata) def _build_facet_subquery_for_json_array( self, base_query, metadata: PropertyMetadataInfo ): column_name = metadata.field_mappings[0].map_to json_table_join = func.json_table( literal_column(column_name), Column(metadata.field_name + "_array", String(127)), ).table_valued("value") base_query = base_query.outerjoin(json_table_join, true()) return base_query.group_by( literal_column("facet_id"), literal_column("facet_value") ).cte(f"{column_name}_facet_subquery") def _handle_json_mapping(self, field_mapping: JsonFieldMapping): built_json_path = "$." 
class PostgreSqlFacetsQueryBuilder(BaseFacetsQueryBuilder):
    """Facets query builder producing PostgreSQL specific SQL expressions."""

    def _get_select_for_column(self, property_metadata: PropertyMetadataInfo):
        """Select expression for the facet value column of the given property."""
        data_type = property_metadata.data_type

        if data_type == DataType.ARRAY:
            # Array values are read from the lateral join alias created in
            # _build_facet_subquery_for_json_array.
            array_alias = property_metadata.field_name.replace("_", "") + "_array"
            return literal_column(f'"{array_alias}".value')

        # UUID properties, and properties with at least one non-JSON mapping,
        # are cast to String.
        needs_string_cast = data_type == DataType.UUID or any(
            not isinstance(mapping, JsonFieldMapping)
            for mapping in property_metadata.field_mappings
        )

        select_expression = super()._get_select_for_column(property_metadata)
        if needs_string_cast:
            return cast(select_expression, String)
        return select_expression

    def build_facet_subquery(
        self,
        facet_key: str,
        entity_id_column,
        base_query_factory: lambda facet_property_path, involved_fields, select_statement: Any,
        facet_property_path: str,
        facet_cel: str,
    ):
        """Build the base facet subquery and cap it at 50 rows."""
        facet_subquery = super().build_facet_subquery(
            facet_key=facet_key,
            entity_id_column=entity_id_column,
            base_query_factory=base_query_factory,
            facet_property_path=facet_property_path,
            facet_cel=facet_cel,
        )
        # Limit number of returned options per facet by 50
        return facet_subquery.limit(50)

    def _cast_column(self, column, data_type: DataType):
        """Normalize boolean-like column values to the literals "true"/"false"; other types use the base behavior."""
        if data_type != DataType.BOOLEAN:
            return super()._cast_column(column, data_type)

        lowered = func.lower(column)
        # Digit-only values are compared numerically: >= 1 means true.
        numeric_truthiness = case(
            (cast(column, Integer) >= 1, literal("true")),
            else_=literal("false"),
        )
        return case(
            (lowered == "true", literal("true")),
            (lowered == "false", literal("false")),
            (column.op("~")("^[0-9]+$"), numeric_truthiness),
            (column != "", literal("true")),
            else_=literal("false"),
        )

    def _build_facet_subquery_for_json_array(
        self, base_query, metadata: PropertyMetadataInfo
    ):
        """Outer-join the base query with a lateral expansion of the JSON array column's elements."""
        source_column = metadata.field_mappings[0].map_to
        join_alias = metadata.field_name.replace("_", "") + "_array"

        array_elements = func.jsonb_array_elements_text(
            cast(literal_column(source_column), JSONB)
        ).label("value")
        elements_join = lateral(select(array_elements))

        return base_query.outerjoin(elements_join.alias(join_alias), true())

    def _handle_json_mapping(self, field_mapping: JsonFieldMapping):
        """Build a `(col -> 'a' -> 'b') ->> 'leaf'` expression for a JSON field mapping."""
        path_parts = [field_mapping.json_prop]
        path_parts.extend(f"'{item}'" for item in field_mapping.prop_in_json)

        *parent_parts, leaf_part = path_parts
        parent_path = " -> ".join(parent_parts)
        return literal_column(f"({parent_path}) ->> {leaf_part}")
def get_facet_key(facet_property_path: str, filter_cel, facet_cel: str) -> str:
    """
    Build the key identifying a facet for a given pair of CEL expressions.

    The key is the property path followed by the SHA-1 hex digest of the
    filter CEL concatenated with the facet CEL.

    Args:
        facet_property_path (str): The property path of the facet.
        filter_cel: The CEL expression filtering the entities; a falsy value
            is treated as an empty string.
        facet_cel (str): The CEL expression associated with the facet; a
            falsy value is treated as an empty string.

    Returns:
        str: The property path with the 40-character hex digest appended.
    """
    combined_cel = "".join((filter_cel or "", facet_cel or ""))
    digest = hashlib.sha1(combined_cel.encode("utf-8")).hexdigest()
    return facet_property_path + digest
data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="assignee", map_to="incident.assignee", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="severity", map_to="incident.severity", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="status", map_to=["JSON(incidentenrichment.enrichments).*", "incident.status"], data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="creation_time", map_to="incident.creation_time", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="start_time", map_to="incident.start_time", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="end_time", map_to="incident.end_time", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="last_seen_time", map_to="incident.last_seen_time", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="is_predicted", map_to="incident.is_predicted", data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="is_candidate", map_to="incident.is_candidate", data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="is_visible", map_to="incident.is_visible", data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="alerts_count", map_to="incident.alerts_count", data_type=DataType.INTEGER, ), FieldMappingConfiguration( map_from_pattern="merged_at", map_to="incident.merged_at", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="merged_by", map_to="incident.merged_by", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="hasLinkedIncident", map_to="addional_incident_fields.incident_has_linked_incident", data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="alert.providerType", map_to="alert.provider_type", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="sources", map_to="incident.sources", 
def __build_base_incident_query(
    tenant_id: str,
    select_args: list,
    cel=None,
    force_fetch_alerts=False,
    force_fetch_has_linked_incident=False,
):
    """
    Build the base SELECT over incidents shared by listing, counting and
    facet queries.

    Args:
        tenant_id (str): Tenant whose incidents are queried.
        select_args (list): Expressions placed in the SELECT clause.
        cel (str, optional): CEL filter converted to SQL and applied as WHERE.
        force_fetch_alerts (bool): Join the alert tables even when the CEL
            does not reference an ``alert.`` field.
        force_fetch_has_linked_incident (bool): Join the linked-incident
            subquery even when the CEL does not reference ``hasLinkedIncident``.

    Returns:
        dict: ``query`` (the statement), ``involved_fields`` (fields the CEL
        referenced) and ``fetch_alerts`` (whether the CEL itself referenced
        an ``alert.`` field; the force flag is not reflected here).
    """
    provider = get_cel_to_sql_provider(properties_metadata)
    sql_filter = None
    involved_fields = []

    if cel:
        conversion = provider.convert_to_sql_str_v2(cel)
        sql_filter = conversion.sql
        involved_fields = conversion.involved_fields

    # With no CEL, involved_fields is empty and all three flags are False.
    fetch_alerts = any(
        field.field_name.startswith("alert.") for field in involved_fields
    )
    fetch_has_linked_incident = any(
        field.field_name == "hasLinkedIncident" for field in involved_fields
    )
    is_visible_filter_present = any(
        field.field_name == "is_visible" for field in involved_fields
    )

    sql_query = select(*select_args).select_from(Incident)

    if fetch_alerts or force_fetch_alerts:
        # Incident -> LastAlertToIncident -> LastAlert -> Alert -> AlertEnrichment
        sql_query = sql_query.outerjoin(
            LastAlertToIncident,
            and_(
                LastAlertToIncident.incident_id == Incident.id,
                LastAlertToIncident.tenant_id == tenant_id,
            ),
        )
        sql_query = sql_query.outerjoin(
            LastAlert,
            and_(
                LastAlert.tenant_id == tenant_id,
                LastAlert.fingerprint == LastAlertToIncident.fingerprint,
            ),
        )
        sql_query = sql_query.outerjoin(
            Alert,
            and_(LastAlert.alert_id == Alert.id, LastAlert.tenant_id == tenant_id),
        )
        sql_query = sql_query.outerjoin(
            AlertEnrichment,
            and_(
                AlertEnrichment.alert_fingerprint == Alert.fingerprint,
                AlertEnrichment.tenant_id == tenant_id,
            ),
        )

    # Incident enrichments are matched through the aliased enrichment table,
    # comparing the incident id (cast to text) with alert_fingerprint.
    enrichment_join_condition = and_(
        Incident.tenant_id == incident_enrichment.tenant_id,
        cast(col(Incident.id), String)
        == foreign(incident_enrichment.alert_fingerprint),
    )
    sql_query = sql_query.outerjoin(incident_enrichment, enrichment_join_condition)

    if fetch_has_linked_incident or force_fetch_has_linked_incident:
        has_linked_incident = case(
            (
                Incident.same_incident_in_the_past_id.isnot(None),
                True,
            ),
            else_=False,
        ).label("incident_has_linked_incident")
        # The subquery name must match the map_to prefix used in the
        # hasLinkedIncident field configuration.
        additional_incident_fields = (
            select(Incident.id, has_linked_incident)
            .select_from(Incident)
            .subquery("addional_incident_fields")
        )
        sql_query = sql_query.join(
            additional_incident_fields, Incident.id == additional_incident_fields.c.id
        )

    sql_query = sql_query.filter(Incident.tenant_id == tenant_id)

    if not is_visible_filter_present:
        # Only visible incidents unless the CEL filters on is_visible itself.
        sql_query = sql_query.filter(Incident.is_visible == True)  # noqa: E712

    if sql_filter:
        sql_query = sql_query.where(text(sql_filter))

    return {
        "query": sql_query,
        "involved_fields": involved_fields,
        "fetch_alerts": fetch_alerts,
    }
Optional[List[str]] = None, ): """ Builds a SQL query to retrieve the last incidents based on various filters and sorting options. Args: dialect (str): The SQL dialect to use. tenant_id (str): The tenant ID to filter incidents. limit (int, optional): The maximum number of incidents to return. Defaults to 25. offset (int, optional): The number of incidents to skip before starting to return results. Defaults to 0. timeframe (int, optional): The number of days to look back from the current date for incidents. Defaults to None. upper_timestamp (datetime, optional): The upper bound timestamp for filtering incidents. Defaults to None. lower_timestamp (datetime, optional): The lower bound timestamp for filtering incidents. Defaults to None. is_candidate (bool, optional): Filter for confirmed incidents. Defaults to False. sorting (Optional[IncidentSorting], optional): The sorting criteria for the incidents. Defaults to IncidentSorting.creation_time. is_predicted (bool, optional): Filter for predicted incidents. Defaults to None. cel (str, optional): The CEL (Common Expression Language) string to convert to SQL. Defaults to None. allowed_incident_ids (Optional[List[str]], optional): List of allowed incident IDs to filter. Defaults to None. Returns: sqlalchemy.sql.selectable.Select: The constructed SQL query. """ fetch_alerts = cel and "alert." 
def __build_last_incidents_query(
    tenant_id: str,
    limit: int = 25,
    offset: int = 0,
    timeframe: int = None,
    upper_timestamp: datetime = None,
    lower_timestamp: datetime = None,
    is_candidate: bool = False,
    sorting: Optional[IncidentSorting] = IncidentSorting.creation_time,
    is_predicted: bool = None,
    cel: str = None,
    allowed_incident_ids: Optional[List[str]] = None,
):
    """
    Builds a SQL query to retrieve the last incidents based on various filters and sorting options.

    Args:
        tenant_id (str): The tenant ID to filter incidents.
        limit (int, optional): The maximum number of incidents to return. Defaults to 25.
        offset (int, optional): The number of incidents to skip before starting to return results. Defaults to 0.
        timeframe (int, optional): The number of days to look back from the current date, compared against start_time. Defaults to None.
        upper_timestamp (datetime, optional): The upper bound for last_seen_time. Defaults to None.
        lower_timestamp (datetime, optional): The lower bound for last_seen_time. Defaults to None.
        is_candidate (bool, optional): Value that Incident.is_candidate must equal. Defaults to False.
        sorting (Optional[IncidentSorting], optional): The sorting criteria; a value containing "-" sorts descending.
            None falls back to IncidentSorting.creation_time.
        is_predicted (bool, optional): When not None, value that Incident.is_predicted must equal. Defaults to None.
        cel (str, optional): The CEL (Common Expression Language) string to convert to SQL. Defaults to None.
        allowed_incident_ids (Optional[List[str]], optional): List of allowed incident IDs to filter. Defaults to None.

    Returns:
        sqlalchemy.sql.selectable.Select: The constructed SQL query.
    """
    if sorting is None:
        # The parameter is Optional; without this guard an explicit None
        # would fail on `sorting.value` below.
        sorting = IncidentSorting.creation_time

    sort_dir = "DESC" if "-" in sorting.value else "ASC"
    sort_by = sorting.value.replace("-", "")
    sort_options: list[SortOptionsDto] = [
        SortOptionsDto(sort_by=sort_by, sort_dir=sort_dir)
    ]

    cel_to_sql_instance = get_cel_to_sql_provider(properties_metadata)
    sort_by_exp = cel_to_sql_instance.get_order_by_expression(
        [(sort_option.sort_by, sort_option.sort_dir) for sort_option in sort_options]
    )
    # The sort expressions are also used as DISTINCT columns further down.
    distinct_columns = [
        text(cel_to_sql_instance.get_field_expression(sort_option.sort_by))
        for sort_option in sort_options
    ]

    built_query_result = __build_base_incident_query(
        tenant_id=tenant_id,
        cel=cel,
        select_args=[Incident, incident_enrichment],
    )
    sql_query = built_query_result["query"]
    fetch_alerts = built_query_result["fetch_alerts"]

    sql_query = sql_query.order_by(text(sort_by_exp))
    sql_query = sql_query.filter(Incident.is_candidate == is_candidate)

    if allowed_incident_ids:
        sql_query = sql_query.filter(Incident.id.in_(allowed_incident_ids))

    if is_predicted is not None:
        sql_query = sql_query.filter(Incident.is_predicted == is_predicted)

    if timeframe:
        sql_query = sql_query.filter(
            Incident.start_time
            >= datetime.now(tz=timezone.utc) - timedelta(days=timeframe)
        )

    if upper_timestamp and lower_timestamp:
        sql_query = sql_query.filter(
            col(Incident.last_seen_time).between(lower_timestamp, upper_timestamp)
        )
    elif upper_timestamp:
        sql_query = sql_query.filter(Incident.last_seen_time <= upper_timestamp)
    elif lower_timestamp:
        sql_query = sql_query.filter(Incident.last_seen_time >= lower_timestamp)

    if fetch_alerts:
        # Joining alerts can yield several rows per incident; collapse them
        # by the sort expression plus the incident id.
        sql_query = sql_query.distinct(*(distinct_columns + [Incident.id]))

    # Apply pagination to the ordered result
    sql_query = sql_query.limit(limit).offset(offset)

    return sql_query
def get_incident_facets_data(
    tenant_id: str,
    allowed_incident_ids: list[str],
    facet_options_query: FacetOptionsQueryDto,
) -> dict[str, list[FacetOptionDto]]:
    """
    Retrieves incident facet options for a given tenant.

    Args:
        tenant_id (str): The ID of the tenant.
        allowed_incident_ids (list[str]): When non-empty, only these incidents
            contribute to the facet options.
        facet_options_query (FacetOptionsQueryDto): Facet options request. The
            keys of its ``facet_queries`` select the facets to load; when it
            is empty the static facets are used.

    Returns:
        dict[str, list[FacetOptionDto]]: A dictionary where the keys are facet
        ids and the values are lists of FacetOptionDto objects.
    """
    facets = static_facets
    if facet_options_query and facet_options_query.facet_queries:
        facets = get_incident_facets(
            tenant_id, facet_options_query.facet_queries.keys()
        )

    def base_query_factory(
        facet_property_path: str,
        involved_fields: list[PropertyMetadataInfo],
        select_statement,
    ):
        # Alert tables are joined when either the facet path or any field
        # involved in the query mentions "alert".
        force_fetch_alerts = "alert" in facet_property_path or any(
            "alert" in item.field_name for item in involved_fields
        )
        # Same rule for the linked-incident subquery.
        force_fetch_has_linked_incident = (
            "hasLinkedIncident" in facet_property_path
            or any("hasLinkedIncident" in item.field_name for item in involved_fields)
        )

        built = __build_base_incident_query(
            tenant_id,
            select_statement,
            force_fetch_alerts=force_fetch_alerts,
            force_fetch_has_linked_incident=force_fetch_has_linked_incident,
        )
        base_query = built["query"]

        if not allowed_incident_ids:
            return base_query

        return base_query.filter(Incident.id.in_(allowed_incident_ids))

    return get_facet_options(
        base_query_factory=base_query_factory,
        entity_id_column=Incident.id,
        facets=facets,
        facet_options_query=facet_options_query,
        properties_metadata=properties_metadata,
    )
def get_incident_potential_facet_fields(tenant_id: str) -> list[str]:
    """
    List the property paths that can be used as incident facets.

    Incident-level fields come first (every configured pattern except the
    ``alert.*`` wildcard), followed by the tenant's alert facet fields
    prefixed with ``alert.``. Duplicates are dropped, keeping the first
    occurrence.

    Args:
        tenant_id (str): The ID of the tenant.

    Returns:
        list[str]: De-duplicated field paths in first-seen order.
    """
    alert_fields = [
        f"alert.{field_name}"
        for field_name in get_alert_potential_facet_fields(tenant_id)
    ]
    incident_fields = [
        configuration.map_from_pattern
        for configuration in incident_field_configurations
        if not configuration.map_from_pattern.startswith("alert.*")
    ]

    # dict keys keep insertion order, so this removes duplicates while
    # preserving the position of the first occurrence.
    return list(dict.fromkeys(incident_fields + alert_fields))
"Total number of events received", ) events_out_counter = Counter( f"{METRIC_PREFIX}events_processed_total", "Total number of events processed", ) events_error_counter = Counter( f"{METRIC_PREFIX}events_error_total", "Total number of events with error", ) processing_time_summary = Summary( f"{METRIC_PREFIX}processing_time_seconds", "Average time spent processing events", ) running_tasks_gauge = Gauge( f"{METRIC_PREFIX}running_tasks_current", "Current number of running tasks", multiprocess_mode="livesum", ) running_tasks_by_process_gauge = Gauge( f"{METRIC_PREFIX}running_tasks_by_process", "Current number of running tasks per process", labelnames=["pid"], multiprocess_mode="livesum", ) ### WORKFLOWS METRIC_PREFIX = "keep_workflows_" # Workflow execution metrics workflow_executions_total = Counter( f"{METRIC_PREFIX}executions_total", "Total number of workflow executions", labelnames=["tenant_id", "workflow_id", "trigger_type"], ) workflow_execution_errors_total = Counter( f"{METRIC_PREFIX}execution_errors_total", "Total number of workflow execution errors", labelnames=["tenant_id", "workflow_id", "error_type"], ) workflow_execution_status = Counter( f"{METRIC_PREFIX}execution_status_total", "Total number of workflow executions by status", labelnames=["tenant_id", "workflow_id", "status"], ) # Workflow performance metrics workflow_execution_duration = Histogram( f"{METRIC_PREFIX}execution_duration_seconds", "Time spent executing workflows", labelnames=["tenant_id", "workflow_id"], buckets=(1, 5, 10, 30, 60, 120, 300, 600), # 1s, 5s, 10s, 30s, 1m, 2m, 5m, 10m ) workflow_execution_step_duration = Histogram( f"{METRIC_PREFIX}execution_step_duration_seconds", "Time spent executing individual workflow steps", labelnames=["tenant_id", "workflow_id", "step_name"], buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60), ) # Workflow state metrics workflows_running = Gauge( f"{METRIC_PREFIX}running", "Number of currently running workflows", labelnames=["tenant_id"], 
async def report_uptime_to_posthog():
    """
    Reports uptime and current version to PostHog every hour.
    Should be launched in a separate thread.

    Runs forever. A failure while collecting or sending one report is logged
    and the loop continues with the next cycle, so a transient error does not
    stop uptime reporting for the rest of the process lifetime.
    """
    while True:
        try:
            # Time the activity report so its duration is sent with the event.
            start_time = time.time()
            properties = {
                "status": "up",
                "keep_version": KEEP_VERSION,
                **get_activity_report(),
            }
            end_time = time.time()
            properties["db_request_duration_ms"] = int((end_time - start_time) * 1000)
            properties["uptime_hours"] = round(
                ((datetime.now() - LAUNCH_TIME).total_seconds()) / 3600
            )

            # The API URL is attached only when EE_ENABLED is "true".
            ee_enabled = os.environ.get("EE_ENABLED", "false").lower() == "true"
            if ee_enabled:
                properties["api_url"] = os.environ.get("KEEP_API_URL")

            posthog_client.capture(
                get_or_creat_posthog_instance_id(),
                "backend_status",
                properties=properties,
            )
            posthog_client.flush()
            logger.info("Uptime reported to PostHog.", extra=properties)
        except Exception:
            # asyncio.CancelledError derives from BaseException and is not
            # caught here, so cancellation still propagates.
            logger.exception("Failed to report uptime to PostHog.")

        await asyncio.sleep(UPTIME_REPORTING_CADENCE)
class TenantConfiguration:
    """
    Process-wide holder of tenants configurations.

    ``TenantConfiguration()`` always returns the same ``_TenantConfiguration``
    instance, which keeps the configurations in memory and reloads them after
    ``TENANT_CONFIGURATION_RELOAD_TIME`` minutes.
    """

    _instance = None

    class _TenantConfiguration:
        def __init__(self):
            self.logger = logging.getLogger(__name__)
            self.configurations = self._load_tenant_configurations()
            self.last_loaded = datetime.now()
            # Reload interval in minutes.
            self.reload_time = config(
                "TENANT_CONFIGURATION_RELOAD_TIME", default=5, cast=int
            )

        def _load_tenant_configurations(self):
            """Load all tenants configurations and record the load time."""
            self.logger.debug("Loading tenants configurations")
            tenants_configuration = get_tenants_configurations()
            self.logger.debug(
                "Tenants configurations loaded",
                extra={
                    "number_of_tenants": len(tenants_configuration),
                },
            )
            self.last_loaded = datetime.now()
            return tenants_configuration

        def _reload_if_needed(self):
            """Reload the configurations when the reload interval has elapsed."""
            if datetime.now() - self.last_loaded > timedelta(minutes=self.reload_time):
                self.logger.info("Reloading tenants configurations")
                updated_configurations = self._load_tenant_configurations()
                if updated_configurations:
                    self.configurations = updated_configurations
                    self.logger.info("Tenants configurations reloaded")
                else:
                    # Keep the previous data rather than replacing it with
                    # an empty result.
                    self.logger.warning(
                        "No tenants configurations found in db, maybe error"
                    )

        def get_configuration(self, tenant_id, config_name=None):
            """
            Return the configuration of a tenant, or a single entry of it.

            Args:
                tenant_id: Identifier of the tenant.
                config_name: Optional key inside the tenant configuration.

            Returns:
                The whole tenant configuration when ``config_name`` is None,
                otherwise the value stored under ``config_name`` (None when
                the key is absent).

            Raises:
                HTTPException: 401 when the tenant is unknown even after
                    reloading the configurations.
            """
            self._reload_if_needed()

            # Reload only when the tenant is really missing. Testing the
            # truthiness of the stored value would reload every tenant on
            # each call for a tenant whose configuration is empty.
            if tenant_id not in self.configurations:
                self.logger.debug(f"Tenant {tenant_id} not found in memory, loading it")
                self.configurations = self._load_tenant_configurations()

                if tenant_id not in self.configurations:
                    # Logger.exception is meant for exception handlers only;
                    # there is no active exception here, so log a plain error.
                    self.logger.error(
                        f"Tenant not found [id: {tenant_id}]",
                        extra={
                            "tenant_id": tenant_id,
                        },
                    )
                    raise HTTPException(
                        status_code=401, detail=f"Tenant not found [id: {tenant_id}]"
                    )

            tenant_config = self.configurations[tenant_id]

            if config_name is None:
                return tenant_config

            return (tenant_config or {}).get(config_name, None)

    def __new__(cls):
        # Create the shared instance on first use and hand it out afterwards.
        if not cls._instance:
            cls._instance = cls._TenantConfiguration()
        return cls._instance
================================================ FILE: keep/api/core/workflows.py ================================================ """ Keep main database module. This module contains the CRUD database functions for Keep. """ from datetime import datetime, timedelta, timezone from typing import TypedDict, Tuple from sqlalchemy import and_, case, desc, func, literal_column, select, text from sqlmodel import Session from keep.api.core.cel_to_sql.properties_metadata import ( FieldMappingConfiguration, PropertiesMetadata, PropertyMetadataInfo, ) from keep.api.core.cel_to_sql.sql_providers.get_cel_to_sql_provider_for_dialect import ( get_cel_to_sql_provider, ) from keep.api.core.db import existed_or_new_session from keep.api.core.facets import get_facet_options, get_facets from keep.api.models.db.facet import FacetType from keep.api.models.db.workflow import Workflow, WorkflowExecution from keep.api.models.facet import FacetDto, FacetOptionDto, FacetOptionsQueryDto from keep.api.core.cel_to_sql.ast_nodes import DataType workflow_field_configurations = [ FieldMappingConfiguration( map_from_pattern="name", map_to="workflow.name", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="description", map_to="workflow.description", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="started", map_to="started", data_type=DataType.DATETIME ), FieldMappingConfiguration( map_from_pattern="last_execution_status", map_to="status", data_type=DataType.STRING, ), FieldMappingConfiguration( map_from_pattern="last_execution_time", map_to="execution_time", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="disabled", map_to="workflow.is_disabled", data_type=DataType.BOOLEAN, ), FieldMappingConfiguration( map_from_pattern="last_updated", map_to="workflow.last_updated", data_type=DataType.DATETIME, ), FieldMappingConfiguration( map_from_pattern="created_at", map_to="workflow.creation_time", data_type=DataType.DATETIME, ), 
def __build_workflow_executions_query(tenant_id: str):
    """
    Build the SELECT that ranks a tenant's recent workflow executions.

    Only non-test executions started within the last 30 days are considered.
    Every row carries ``row_num``: the position of the execution inside its
    workflow when ordered by ``started`` descending (1 = most recent), so
    callers can keep the N latest executions per workflow by filtering on it.

    Args:
        tenant_id: tenant whose executions are selected.

    Returns:
        A SQLAlchemy ``select`` exposing workflow_id, execution_id, started,
        execution_time, status and row_num.
    """
    # Lower bound of the look-back window, computed when the query is built
    oldest_allowed_start = datetime.now(tz=timezone.utc) - timedelta(days=30)

    # Per-workflow recency rank of each execution
    recency_rank = (
        func.row_number()
        .over(
            partition_by=WorkflowExecution.workflow_id,
            order_by=desc(WorkflowExecution.started),
        )
        .label("row_num")
    )

    return (
        select(
            WorkflowExecution.workflow_id,
            WorkflowExecution.id.label("execution_id"),
            WorkflowExecution.started,
            WorkflowExecution.execution_time,
            WorkflowExecution.status,
            recency_rank,
        )
        .where(WorkflowExecution.tenant_id == tenant_id)
        # "== False" is required here: it builds a SQL expression, not a Python bool
        .where(WorkflowExecution.is_test_run == False)
        .where(WorkflowExecution.started >= oldest_allowed_start)
    )
def build_workflows_total_count_query(tenant_id: str, cel: str):
    """
    Build the query that counts the distinct workflows matching a CEL filter.

    Args:
        tenant_id: tenant whose workflows are counted.
        cel: CEL filter expression; when empty/None no filter is applied.

    Returns:
        A SQLAlchemy select producing a single ``count(distinct workflow.id)`` value.
    """
    distinct_workflow_count = func.count(func.distinct(Workflow.id))
    count_query = __build_base_query(
        tenant_id=tenant_id, select_statements=[distinct_workflow_count]
    )

    if not cel:
        return count_query.distinct()

    # Translate the CEL expression into a raw SQL predicate for the current dialect
    sql_predicate = get_cel_to_sql_provider(properties_metadata).convert_to_sql_str(
        cel
    )
    return count_query.filter(text(sql_predicate)).distinct()
def get_workflows_with_last_executions_v2(
    tenant_id: str,
    cel: str,
    limit: int,
    offset: int,
    sort_by: str,
    sort_dir: str,
    fetch_last_executions: int = 15,
    session: Session = None,
) -> Tuple[list[WorkflowWithLastExecutions], int]:
    """
    Fetch one page of workflows together with their most recent executions.

    Three queries are issued: the total count of workflows matching ``cel``,
    the requested page of workflows joined with their latest execution, and
    up to ``fetch_last_executions`` executions for each workflow on the page.

    Args:
        tenant_id: tenant whose workflows are queried.
        cel: CEL filter expression (may be empty).
        limit: page size.
        offset: page offset.
        sort_by: field to sort by.
        sort_dir: sort direction.
        fetch_last_executions: how many executions to attach per workflow.
        session: optional existing DB session to reuse.

    Returns:
        ``(workflows, total_count)``; ``workflows`` is empty when nothing matches.
    """
    with existed_or_new_session(session) as session:
        count = session.exec(
            build_workflows_total_count_query(tenant_id=tenant_id, cel=cel)
        ).one()[0]

        if count == 0:
            return [], count

        # Page of workflows, each joined with (at most) its single latest execution
        page_rows = session.exec(
            build_workflows_query(
                tenant_id=tenant_id,
                cel=cel,
                limit=limit,
                offset=offset,
                sort_by=sort_by,
                sort_dir=sort_dir,
                fetch_last_executions=1,
            )
        ).all()

        executions_query = build_workflow_executions_query(
            tenant_id=tenant_id,
            workflow_ids=[row[0].id for row in page_rows],
            limit_per_workflow=fetch_last_executions,
        )

        # Group the recent executions by the workflow they belong to
        executions_by_workflow = {}
        for (
            owner_id,
            run_id,
            run_started,
            run_duration,
            run_status,
        ) in session.exec(executions_query).all():
            executions_by_workflow.setdefault(owner_id, []).append(
                {
                    "id": run_id,
                    "started": run_started,
                    "execution_time": run_duration,
                    "status": run_status,
                }
            )

        result = [
            {
                "workflow": workflow,
                "workflow_last_run_started": started,
                "workflow_last_run_time": execution_time,
                # workaround for filter. In query status is empty string if it is NULL in DB
                "workflow_last_run_status": None if status == "" else status,
                "workflow_last_executions": executions_by_workflow.get(
                    workflow.id, []
                ),
            }
            for workflow, started, execution_time, status, _execution_id in page_rows
        ]

        return result, count
class WorkflowContextFilter(logging.Filter):
    """
    Filter used in the root logger configuration.

    A record passes only when the emitting thread carries a ``workflow_id``
    attribute. Records that pass are annotated with the workflow context
    stored on the thread.
    """

    def filter(self, record):
        current = threading.current_thread()

        # No workflow context on this thread -> drop the record
        workflow_id = getattr(current, "workflow_id", None)
        if not workflow_id:
            return False

        # DEBUG records are kept only when the thread enabled workflow debugging
        debug_enabled = getattr(current, "workflow_debug", False)
        if record.levelname == "DEBUG" and not debug_enabled:
            return False

        if not hasattr(record, "extra"):
            record.extra = {}

        # Mirror the thread's context onto the record, skipping unset values
        record.workflow_id = workflow_id
        for attr_name in ("workflow_execution_id", "tenant_id", "provider_type"):
            attr_value = getattr(current, attr_name, None)
            if attr_value is not None:
                setattr(record, attr_name, attr_value)

        # A step id on the thread replaces any context already on the record
        step_id = getattr(current, "step_id", None)
        if step_id is not None:
            record.context = {"step_id": step_id}

        # An "event" passed by the caller is folded into the context dict
        if "event" in record.__dict__:
            if not hasattr(record, "context"):
                record.context = {}
            record.context["event"] = record.event

        return True
class ProviderDBHandler(logging.Handler):
    """
    Logging handler that buffers provider log records and writes them to the
    database as ``ProviderExecutionLog`` rows.

    Only records carrying a truthy ``provider_id`` attribute are buffered.
    Flushing is debounced: every buffered record cancels the pending timer and
    schedules a new one, so the flush runs ``flush_interval`` seconds after the
    most recent record.
    """

    def __init__(self, flush_interval: int = 2):
        """
        Args:
            flush_interval: seconds to wait after the last buffered record
                before flushing to the database.
        """
        super().__init__()
        self.records = []
        self.flush_interval = flush_interval
        self._flush_timer = None

    def emit(self, record):
        """Buffer a provider record and (re)schedule the delayed flush."""
        # Only store provider logs
        if hasattr(record, "provider_id") and record.provider_id:
            self.records.append(record)

            # Cancel existing timer if any
            if self._flush_timer:
                self._flush_timer.cancel()

            # Start new timer
            self._flush_timer = Timer(self.flush_interval, self.flush)
            self._flush_timer.start()

    def flush(self):
        """
        Write all buffered records to the database.

        Failures are logged and swallowed; the drained records are not
        re-queued, so a failed flush drops that batch.
        """
        if not self.records:
            return

        # Detach the buffer under the handler lock. logging.Handler.handle()
        # holds the same lock around emit(), so a record appended from another
        # thread cannot fall between "copy" and "reset" and get lost.
        self.acquire()
        try:
            _records, self.records = self.records, []
        finally:
            self.release()

        try:
            # The context manager closes the session even when add_all/commit
            # raises; previously close() was skipped on failure.
            with Session(next(get_session()).bind) as session:
                log_entries = []

                for record in _records:
                    # if record have execution_id use it, but mostly for future use
                    execution_id = getattr(record, "execution_id", None)
                    entry = ProviderExecutionLog(
                        id=str(uuid.uuid4()),
                        tenant_id=record.tenant_id,
                        provider_id=record.provider_id,
                        timestamp=datetime.fromtimestamp(record.created),
                        log_message=record.getMessage(),
                        log_level=record.levelname,
                        context=getattr(record, "extra", {}),
                        execution_id=execution_id,
                    )
                    log_entries.append(entry)

                session.add_all(log_entries)
                session.commit()
        except Exception as e:
            # Use the parent logger to avoid infinite recursion
            logging.getLogger(__name__).error(
                f"Failed to flush provider logs: {str(e)}"
            )
        finally:
            # Clear the timer reference
            self._flush_timer = None

    def close(self):
        """Cancel timer and flush remaining logs when handler is closed"""
        if self._flush_timer:
            self._flush_timer.cancel()
            self._flush_timer = None
        self.flush()
        super().close()
def get_worker_type():
    """
    Determine if this is a uvicorn or arq worker.

    The decision is made from the command line: "arqworker" when any argument
    contains "arq" (case-insensitive), otherwise "uvicorn" when any argument
    contains "uvicorn", otherwise None.
    """
    import sys

    def _argv_mentions(keyword):
        # Case-insensitive substring search that stops at the first match
        for argument in sys.argv:
            if keyword in argument.lower():
                return True
        return False

    # Order matters: "arq" takes precedence over "uvicorn"
    for keyword, worker_type in (("arq", "arqworker"), ("uvicorn", "uvicorn")):
        if _argv_mentions(keyword):
            return worker_type
    return None
"logging.googleapis.com/trace_sampled", }, }, "dev_terminal": { "()": DevTerminalFormatter, "format": "%(asctime)s - %(thread)s %(otelTraceID)s %(threadName)s %(levelname)s - %(message)s", }, "uvicorn_access": { # Add new formatter for uvicorn.access "format": "%(asctime)s - %(otelTraceID)s - %(threadName)s - %(message)s" }, }, "handlers": { "default": { "level": LOG_LEVEL, "formatter": ( "json" if LOG_FORMAT == LOG_FORMAT_OPEN_TELEMETRY else "dev_terminal" ), "class": "logging.StreamHandler", "stream": "ext://sys.stdout", }, "workflowhandler": { "level": "DEBUG", "formatter": ( "json" if LOG_FORMAT == LOG_FORMAT_OPEN_TELEMETRY else "dev_terminal" ), "class": "keep.api.logging.WorkflowDBHandler", "filters": ["thread_context"], # Add filter here }, "uvicorn_access": { # Add new handler for uvicorn.access "class": "logging.StreamHandler", "formatter": "uvicorn_access", }, }, "filters": { # Add filters section "thread_context": {"()": "keep.api.logging.WorkflowContextFilter"} }, "loggers": { "": { "handlers": ["workflowhandler", "default"], "level": "DEBUG", "propagate": False, }, "slowapi": { "handlers": ["default"], "level": LOG_LEVEL, "propagate": False, }, "uvicorn.access": { # Add uvicorn.access logger configuration "handlers": ["uvicorn_access"], "level": get_gunicorn_log_level(), "propagate": False, }, "uvicorn.error": { # Add uvicorn.error logger configuration "()": "CustomizedUvicornLogger", # Use custom logger class "handlers": ["default"], "level": get_gunicorn_log_level(), "propagate": False, }, "opentelemetry.context": { "handlers": [], "level": "CRITICAL", "propagate": False, }, "Evaluator": { "handlers": [], "level": "CRITICAL", "propagate": False, }, "NameContainer": { "handlers": [], "level": "CRITICAL", "propagate": False, }, "evaluation": { "handlers": [], "level": "CRITICAL", "propagate": False, }, "Environment": { "handlers": [], "level": "CRITICAL", "propagate": False, }, "httpx": { "handlers": [], "level": "ERROR", "propagate": False, }, }, } 
class CustomizedUvicornLogger(logging.Logger):
    """
    Logger that enriches records with the trace id and tenant id of the
    request being served.

    The values are read from the ``scope["state"]`` of the ``self`` object
    found in the nearest ``run_asgi`` frame on the call stack.
    """

    def makeRecord(
        self,
        name,
        level,
        fn,
        lno,
        msg,
        args,
        exc_info,
        func=None,
        extra=None,
        sinfo=None,
    ):
        # Take otelTraceID out of `extra` before the base class copies `extra`
        # onto the record, then set it on the record explicitly afterwards.
        trace_id = extra.pop("otelTraceID", None) if extra else None

        record = super().makeRecord(
            name, level, fn, lno, msg, args, exc_info, func, extra, sinfo
        )
        if trace_id:
            record.__dict__["otelTraceID"] = trace_id
        return record

    def _log(
        self,
        level,
        msg,
        args,
        exc_info=None,
        extra=None,
        stack_info=False,
        stacklevel=1,
    ):
        # Walk up the call stack, starting at our caller, looking for run_asgi
        frame = inspect.currentframe().f_back
        while frame:
            if frame.f_code.co_name == "run_asgi":
                state = frame.f_locals.get("self").scope.get("state", {})

                discovered = {}
                trace_id = state.get("trace_id", 0)
                if trace_id:
                    discovered["otelTraceID"] = trace_id
                tenant_id = state.get("tenant_id", 0)
                if tenant_id:
                    discovered["tenant_id"] = tenant_id

                # Stop at the first run_asgi frame that yields anything;
                # otherwise keep searching further up the stack.
                if discovered:
                    if extra is None:
                        extra = {}
                    extra.update(discovered)
                    break
            frame = frame.f_back

        # Call the original _log function to handle the logging with trace_id
        logging.Logger._log(
            self, level, msg, args, exc_info, extra, stack_info, stacklevel
        )
def _extract_identity(request: Request, attribute="email") -> str:
    """
    Best-effort extraction of the caller identity from the request headers.

    Resolution order:
      1. ``Authorization`` header: the second space-separated part is decoded
         as a JWT *without signature verification* and the claim named by
         ``attribute`` is returned.
      2. If there is no ``Authorization`` header (``.get`` returns None, so
         ``.split`` raises AttributeError), the ``x-api-key`` header is used
         and the tenant id of the matching API key is returned.
      3. Anything else yields "anonymous".

    Args:
        request: incoming request.
        attribute: JWT claim to return (defaults to "email").

    Returns:
        The claim value, the API key's tenant id, or "anonymous".
        NOTE(review): ``decoded_token.get(attribute)`` yields None when the
        claim is absent, so the result is not always a ``str``.
    """
    try:
        # A header without a space (no "<scheme> <token>" form) raises
        # IndexError, which lands in the generic handler below -> "anonymous"
        token = request.headers.get("Authorization").split(" ")[1]
        # The signature is NOT verified here, so the claim is untrusted input
        decoded_token = jwt.decode(token, options={"verify_signature": False})
        return decoded_token.get(attribute)
    # case api key: reached when the Authorization header is missing
    except AttributeError:
        # try api key
        api_key = request.headers.get("x-api-key")
        if not api_key:
            return "anonymous"

        # allow disabling the extraction of the identity from the api key
        # for high performance scenarios
        if KEEP_EXTRACT_IDENTITY:
            # Exceptions raised by this lookup are not caught by the
            # `except Exception` clause below; they propagate to the caller
            api_key = get_api_key(api_key)
            if api_key:
                return api_key.tenant_id
        return "anonymous"
    except Exception:
        # Malformed header or undecodable token
        return "anonymous"
class ActionType(enum.Enum):
    """
    Kinds of actions on alerts and incidents.

    Each member's value is a human-readable description of the action.
    Values must be unique: ``enum.Enum`` turns a member whose value duplicates
    an earlier one into an alias of that earlier member.
    """

    # the alert was triggered
    # (member name is misspelled; kept as-is for backward compatibility)
    TIGGERED = "alert was triggered"
    # someone acknowledged the alert
    ACKNOWLEDGE = "alert acknowledged"
    # the alert was resolved automatically
    AUTOMATIC_RESOLVE = "alert automatically resolved"
    API_AUTOMATIC_RESOLVE = "alert automatically resolved by API"
    # the alert was resolved manually
    MANUAL_RESOLVE = "alert manually resolved"
    # the alert status was changed
    MANUAL_STATUS_CHANGE = "alert status manually changed"
    API_STATUS_CHANGE = "alert status changed by API"
    STATUS_UNENRICH = "alert status undone"
    # the alert was enriched by a workflow or a rule
    WORKFLOW_ENRICH = "alert enriched by workflow"
    MAPPING_RULE_ENRICH = "alert enriched by mapping rule"
    EXTRACTION_RULE_ENRICH = "alert enriched by extraction rule"
    # the alert was deduplicated
    DEDUPLICATED = "alert was deduplicated"
    # a ticket was assigned to / unassigned from the alert
    TICKET_ASSIGNED = "alert was assigned with ticket"
    TICKET_UNASSIGNED = "alert was unassigned from ticket"
    # a ticket was updated
    TICKET_UPDATED = "alert ticket was updated"
    # disposing enriched alert
    DISPOSE_ENRICHED_ALERT = "alert enrichments disposed"
    # delete alert
    DELETE_ALERT = "alert deleted"
    # generic enrichment
    GENERIC_ENRICH = "alert enriched"
    GENERIC_UNENRICH = "alert un-enriched"
    # commented
    COMMENT = "a comment was added to the alert"
    UNCOMMENT = "a comment was removed from the alert"
    # maintenance windows and dismissal
    MAINTENANCE = "Alert is in maintenance window"
    MAINTENANCE_EXPIRED = "Alert has been removed from maintenance window"
    DISMISSAL_EXPIRED = "Alert dismissal expired"
    # incident actions
    INCIDENT_COMMENT = "A comment was added to the incident"
    INCIDENT_ENRICH = "Incident enriched"
    INCIDENT_STATUS_CHANGE = "Incident status changed"
    INCIDENT_ASSIGN = "Incident assigned"
    # Previously "Incident enriched", which made this member an alias of
    # INCIDENT_ENRICH, so un-enrich actions could not be told apart from enrich.
    INCIDENT_UNENRICH = "Incident un-enriched"
""" from keep.api.utils.tenant_utils import get_or_create_api_key from keep.api.core.db import get_session if self.last_time_reminded and (datetime.now() - self._last_time_reminded).total_seconds() < 30: logger.info(f"Skipping reminder about the client for {self.name} as it was reminded recently.") return else: self.last_time_reminded = datetime.now() if self.api_url is None or self.api_key is None: logger.error(f"API URL or API Key is missing for {self.name}. Skipping reminder.") return self.last_time_reminded = datetime.now() back_api_key = get_or_create_api_key( session=next(get_session()), tenant_id=tenant_id, created_by="system", unique_api_key_id=self.name.lower().replace(" ", "_") ) try: response = requests.post( self.api_url + "/remind_about_the_client", json={ "api_key": self.api_key, "tenant_id": tenant_id, "back_api_key": back_api_key, "back_api_url": os.environ.get("KEEP_API_URL"), }, timeout=0.5 # intentionally short because it's blocking and we don't care about response. ) response.raise_for_status() except Exception as e: logger.error(f"Failed to remind about the client for {self.name}. 
def get_fingerprint(fingerprint, values):
    """
    Return the fingerprint to use for an alert.

    Args:
        fingerprint: fingerprint supplied with the alert, or None.
        values: mapping of the alert's fields; used only when ``fingerprint``
            is None.

    Returns:
        * the supplied fingerprint truncated to 255 characters, or
        * when none was supplied, the SHA-256 hex digest of the alert name, or
        * when the name is missing/empty, the SHA-256 hex digest of the
          JSON-serialized ``values``.
    """
    # if its none, use the name
    if fingerprint is None:
        fingerprint_payload = values.get("name")
        # if the alert name is None, than use the entire payload
        if not fingerprint_payload:
            logger.warning("No name to alert, using the entire payload")
            # default=str: `values` can hold objects json cannot serialize
            # natively (enums, datetimes, URLs). Without it this fallback
            # raised TypeError instead of producing a fingerprint. The output
            # is unchanged for payloads that serialized before.
            fingerprint_payload = json.dumps(values, default=str)
        fingerprint = hashlib.sha256(fingerprint_payload.encode()).hexdigest()
    # take only the first 255 characters
    else:
        fingerprint = fingerprint[:255]
    return fingerprint
# NOTE(review): judging by the field names this describes a provider event
# that could not be turned into an alert — confirm against the callers.
# Comments are used instead of a docstring because pydantic exposes a model's
# docstring as its schema description.
class AlertErrorDto(BaseModel):
    id: str
    provider_type: str
    event: dict  # presumably the raw event payload; verify against caller
    error_message: Optional[str] = None  # optional, defaults to None
    timestamp: datetime.datetime
if alert triggered multiple times, it will be the time of the first trigger (calculated on querying) ) isNoisy: bool = False # Whether the alert is noisy enriched_fields: list = [] incident: str | None = None def __str__(self) -> str: # Convert the model instance to a dictionary model_dict = self.dict() return json.dumps(model_dict, indent=4, default=str) def __eq__(self, other): if isinstance(other, AlertDto): # Convert both instances to dictionaries dict_self = self.dict() dict_other = other.dict() # Fields to exclude from comparison since they are bit different in different db's # todo: solve it in a better way exclude_fields = {"lastReceived", "startedAt", "event_id"} # Remove excluded fields from both dictionaries for field in exclude_fields: dict_self.pop(field, None) dict_other.pop(field, None) # Compare the dictionaries return dict_self == dict_other return False def __ne__(self, other): return not self.__eq__(other) @validator("fingerprint", pre=True, always=True) def assign_fingerprint_if_none(cls, fingerprint, values): return get_fingerprint(fingerprint, values) @validator("deleted", pre=True, always=True) def validate_deleted(cls, deleted, values): if isinstance(deleted, bool): return deleted if isinstance(deleted, list): return values.get("lastReceived") in deleted @validator("url", pre=True) def prepend_https(cls, url): if not isinstance(url, str): return url url = url.strip() # If the URL is empty, return None to avoid validation errors if not url: return None if not url.startswith("http"): # @tb: in some cases we drop the event because of invalid url with no scheme # invalid or missing URL scheme (type=value_error.url.scheme) url = f"https://{url}" return urllib.parse.quote(url, safe="/:?=&") @validator("lastReceived", pre=True, always=True) def validate_last_received(cls, last_received): def convert_to_iso_format(date_string): try: dt = datetime.datetime.fromisoformat(date_string) dt_utc = dt.astimezone(pytz.UTC) return 
dt_utc.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" except ValueError: return None def parse_unix_timestamp(timestamp_string): try: # Remove trailing 'Z' if present timestamp_string = timestamp_string.rstrip("Z") # Convert string to float timestamp = float(timestamp_string) # Create datetime from timestamp dt = datetime.datetime.fromtimestamp( timestamp, tz=datetime.timezone.utc ) return dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" except (ValueError, TypeError): return None if not last_received: return datetime.datetime.now(datetime.timezone.utc).isoformat() # Try to convert the date to iso format # see: https://github.com/keephq/keep/issues/1397 iso_date = convert_to_iso_format(last_received) if iso_date: return iso_date # Try to parse as UNIX timestamp unix_date = parse_unix_timestamp(last_received) if unix_date: return unix_date raise ValueError(f"Invalid date format: {last_received}") @validator("dismissed", pre=True, always=True) def validate_dismissed(cls, dismissed, values): # normzlize dismissed value if isinstance(dismissed, str): dismissed = dismissed.lower() == "true" # if dismissed is False, return False if not dismissed: return dismissed # else, validate dismissedUntil dismiss_until = values.get("dismissUntil") # if there's no dismissUntil, return just return dismissed if not dismiss_until or dismiss_until == "forever": return dismissed # if there's dismissUntil, validate it dismiss_until_datetime = datetime.datetime.strptime( dismiss_until, "%Y-%m-%dT%H:%M:%S.%fZ" ).replace(tzinfo=datetime.timezone.utc) dismissed = ( datetime.datetime.now(datetime.timezone.utc) < dismiss_until_datetime ) return dismissed @validator("description_format") def validate_description_format(cls, description_format): if description_format is None: return None valid_formats = ["markdown", "html"] if description_format not in valid_formats: raise ValueError(f"description_format must be one of {valid_formats}") return description_format @root_validator(pre=True) def 
set_default_values(cls, values: Dict[str, Any]) -> Dict[str, Any]: # Check and set id: if not values.get("id"): values["id"] = str(uuid.uuid4()) # Check and set default severity severity = values.get("severity") try: # if severity is int, convert it to AlertSeverity if isinstance(severity, int): values["severity"] = AlertSeverity.from_number(severity) else: values["severity"] = AlertSeverity(severity) except ValueError: logging.warning( f"Invalid severity value: {severity}, setting default.", extra={"event": values}, ) values["severity"] = AlertSeverity.INFO # Check and set default status status = values.get("status") try: values["status"] = AlertStatus(status) except ValueError: logging.warning( f"Invalid status value: {status}, setting default.", extra={"event": values}, ) values["status"] = AlertStatus.FIRING # this is code duplication of enrichment_helpers.py and should be refactored lastReceived = values.get("lastReceived", None) if not lastReceived: lastReceived = datetime.datetime.now(datetime.timezone.utc).isoformat() values["lastReceived"] = lastReceived assignees = values.pop("assignees", None) # In some cases (for example PagerDuty) the assignees is list of dicts and we don't handle it atm. 
if assignees and isinstance(assignees, dict): dt = datetime.datetime.fromisoformat(lastReceived) dt.isoformat(timespec="milliseconds").replace("+00:00", "Z") assignee = assignees.get(lastReceived) or assignees.get(dt) values["assignee"] = assignee values.pop("deletedAt", None) return values # after root_validator to ensure that the values are set @root_validator(pre=False) def validate_status(cls, values: Dict[str, Any]) -> Dict[str, Any]: # if dismissed, change status to SUPPRESSED # note this is happen AFTER validate_dismissed which already consider # dismissed + dismissUntil # if values.get("dismissed"): # values["status"] = AlertStatus.SUPPRESSED return values class Config: extra = Extra.allow schema_extra = { "examples": [ { "id": "1234", "name": "Pod 'api-service-production' lacks memory", "status": "firing", "lastReceived": "2021-01-01T00:00:00.000Z", "environment": "production", "duplicateReason": None, "service": "backend", "source": ["prometheus"], "message": "The pod 'api-service-production' lacks memory causing high error rate", "description": "Due to the lack of memory, the pod 'api-service-production' is experiencing high error rate", "severity": "critical", "pushed": True, "url": "https://www.keephq.dev?alertId=1234", "labels": { "pod": "api-service-production", "region": "us-east-1", "cpu": "88", "memory": "100Mi", }, "ticket_url": "https://www.keephq.dev?enrichedTicketId=456", "fingerprint": "1234", } ] } use_enum_values = True json_encoders = { # Converts enums to their values for JSON serialization Enum: lambda v: v.value, } class AlertWithIncidentLinkMetadataDto(AlertDto): is_created_by_ai: bool = False @classmethod def from_db_instance(cls, db_alert, db_alert_to_incident): return cls( is_created_by_ai=db_alert_to_incident.is_created_by_ai, **db_alert.event, ) class DeleteRequestBody(BaseModel): fingerprint: str lastReceived: str restore: bool = False class DismissRequestBody(BaseModel): fingerprint: str dismissUntil: str dismissComment: str 
class AlertAuditDto(BaseModel):
    """API representation of a single alert audit entry."""

    id: str
    timestamp: datetime
    fingerprint: str
    action: ActionType
    user_id: str
    description: str
    mentions: Optional[List[CommentMentionDto]] = None

    @classmethod
    def from_orm(cls, alert_audit: AlertAudit) -> "AlertAuditDto":
        """Build a DTO from one ``AlertAudit`` row, including its mentions."""
        mentions_data = None
        if hasattr(alert_audit, "mentions") and alert_audit.mentions:
            mentions_data = [
                CommentMentionDto(mentioned_user_id=mention.mentioned_user_id)
                for mention in alert_audit.mentions
            ]

        return cls(
            id=str(alert_audit.id),
            timestamp=alert_audit.timestamp,
            fingerprint=alert_audit.fingerprint,
            action=alert_audit.action,
            user_id=alert_audit.user_id,
            description=alert_audit.description,
            mentions=mentions_data,
        )

    @classmethod
    def from_orm_list(cls, alert_audits: list[AlertAudit]) -> list["AlertAuditDto"]:
        """Convert audit rows to DTOs, collapsing consecutive duplicates.

        Consecutive rows with the same ``user_id``, ``action`` and
        ``description`` are merged into the first row of the run, whose
        description gets an `` x<count>`` suffix. The suffix is applied to
        the DTO only: the ORM rows are never modified, so nothing is marked
        dirty on the session and repeated calls give the same result.
        """

        def to_dto(event: AlertAudit, repetitions: int) -> "AlertAuditDto":
            dto = cls.from_orm(event)
            if repetitions > 1:
                dto.description = f"{dto.description} x{repetitions}"
            return dto

        grouped_events = []
        previous_event = None
        count = 1

        for event in alert_audits:
            # Check if the current event is similar to the previous event
            if previous_event and (
                event.user_id == previous_event.user_id
                and event.action == previous_event.action
                and event.description == previous_event.description
            ):
                # Increment the count if the events are similar
                count += 1
            else:
                # The run ended: emit the event that started it
                if previous_event:
                    grouped_events.append(to_dto(previous_event, count))
                # Update the previous event to the current event and reset the count
                previous_event = event
                count = 1

        # Add the last event to the grouped events
        if previous_event:
            grouped_events.append(to_dto(previous_event, count))
        return grouped_events
class ExternalAI(BaseModel):
    """
    Description of an external algorithm: its identity, where to reach it
    and its default configuration (a JSON document).
    """

    name: str = None
    description: str = None
    version: int = None
    api_url: str = None
    api_key: str = None
    config_default: Json = None

    @property
    def unique_id(self):
        """Identifier built as ``<name>_<version>``."""
        version_part = str(self.version)
        return self.name + "_" + version_part
class ExternalAIConfigAndMetadata(SQLModel, table=True):
    """
    Per-tenant settings and metadata for an external algorithm.
    """

    id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True)
    algorithm_id: str = Field(nullable=False)
    # NOTE(review): ForeignKey(...) is passed positionally here, which looks
    # like it lands in Field's `default` slot rather than declaring a foreign
    # key - confirm against the SQLModel Field signature and the migrations.
    tenant_id: str = Field(ForeignKey("tenant.id"), nullable=False)
    settings: str = Field(
        sa_column=Column(JSON),
    )
    settings_proposed_by_algorithm: str = Field(
        sa_column=Column(JSON),
    )
    feedback_logs: str = Field(sa_column=Column(Text))

    @property
    def algorithm(self) -> ExternalAI:
        """The registered algorithm whose ``unique_id`` equals ``algorithm_id``, or ``None``."""
        candidates = [
            registered
            for registered in EXTERNAL_AIS
            if registered.unique_id == self.algorithm_id
        ]
        if candidates:
            return candidates[0]
        return None

    # NOTE(review): takes no self/cls, so it is presumably invoked on the
    # class itself - verify against callers.
    def from_external_ai(tenant_id: str, algorithm: ExternalAI):
        """Create the settings row for ``tenant_id`` from the algorithm's defaults."""
        return ExternalAIConfigAndMetadata(
            algorithm_id=algorithm.unique_id,
            tenant_id=tenant_id,
            settings=json.dumps(algorithm.config_default),
        )
class AISuggestion(SQLModel, table=True):
    """A suggestion produced by an AI model for a tenant's user.

    Stores the user's input, a hash of that input, the suggestion type and
    content, and the name of the model used. Related ``AIFeedback`` rows are
    reachable through ``feedbacks``.
    """

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id", index=True)
    user_id: str = Field(index=True)
    # the input that the user provided to the AI
    suggestion_input: Dict = Field(sa_column=Column(JSON))
    # the hash of the suggestion input to allow for duplicate suggestions with the same input
    suggestion_input_hash: str = Field(index=True)
    # the type of suggestion
    suggestion_type: AISuggestionType = Field(index=True)
    # the content of the suggestion
    suggestion_content: Dict = Field(sa_column=Column(JSON))
    # the model that was used to generate the suggestion
    model: str = Field()
    # the date and time when the suggestion was created
    # (datetime.utcnow returns a naive datetime)
    created_at: datetime = Field(default_factory=datetime.utcnow)

    # paired with AIFeedback.suggestion via back_populates
    feedbacks: List["AIFeedback"] = Relationship(back_populates="suggestion")

    class Config:
        arbitrary_types_allowed = True
class AlertToIncident(SQLModel, table=True):
    """Link row between one alert and one incident.

    The primary key is the combination of ``alert_id``, ``incident_id`` and
    ``deleted_at``.
    """

    tenant_id: str = Field(foreign_key="tenant.id")
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    alert_id: UUID = Field(foreign_key="alert.id", primary_key=True)
    # link rows are removed with their incident (ondelete="CASCADE")
    incident_id: UUID = Field(
        sa_column=Column(
            UUIDType(binary=False),
            ForeignKey("incident.id", ondelete="CASCADE"),
            primary_key=True,
        )
    )
    is_created_by_ai: bool = Field(default=False)

    # Part of the primary key, so it defaults to the NULL_FOR_DELETED_AT
    # sentinel (imported from keep.api.models.db.helpers) instead of NULL.
    # NOTE(review): default_factory=None alongside default= looks redundant - confirm.
    deleted_at: datetime = Field(
        default_factory=None,
        nullable=True,
        primary_key=True,
        default=NULL_FOR_DELETED_AT,
    )
def _utcnow_truncated_to_milliseconds() -> datetime:
    """Return the current naive UTC time truncated to whole milliseconds.

    The clock is read once, so the truncated sub-second part always belongs
    to the same instant as the rest of the timestamp.
    """
    now = datetime.utcnow()
    return now.replace(microsecond=now.microsecond // 1000 * 1000)


class Alert(SQLModel, table=True):
    """A stored alert event received from a provider for a tenant."""

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id")
    tenant: Tenant = Relationship()
    # index=True added because we query top 1000 alerts order by timestamp.
    # On a large dataset, this will be slow without an index.
    #            with 1M alerts, we see queries goes from >30s to 0s with the index
    #            todo: on MSSQL, the index is "nonclustered" index which cannot be controlled by SQLModel
    timestamp: datetime = Field(
        sa_column=Column(DATETIME_COLUMN_TYPE, index=True, nullable=False),
        default_factory=_utcnow_truncated_to_milliseconds,
    )
    provider_type: str
    provider_id: str | None
    event: dict = Field(sa_column=Column(JSON))
    fingerprint: str = Field(index=True)  # Add the fingerprint field with an index

    # alert_hash is different than fingerprint, it is a hash of the alert itself
    #            and it is used for deduplication.
    #            alert can be different but have the same fingerprint (e.g. different "firing" and "resolved" will have the same fingerprint but not the same alert_hash)
    alert_hash: str | None

    # Define a one-to-one relationship to AlertEnrichment using alert_fingerprint
    alert_enrichment: "AlertEnrichment" = Relationship(
        sa_relationship_kwargs={
            "primaryjoin": "and_(Alert.fingerprint == foreign(AlertEnrichment.alert_fingerprint), Alert.tenant_id == AlertEnrichment.tenant_id)",
            "uselist": False,
        }
    )

    # Read-only relationship joined on the alert id (cast to string) rather
    # than on the fingerprint.
    alert_instance_enrichment: "AlertEnrichment" = Relationship(
        sa_relationship_kwargs={
            "primaryjoin": "and_(cast(Alert.id, String) == foreign(AlertEnrichment.alert_fingerprint), Alert.tenant_id == AlertEnrichment.tenant_id)",
            "uselist": False,
            "viewonly": True,
        },
    )

    _incidents: List[Incident] = PrivateAttr(default_factory=list)

    __table_args__ = (
        Index(
            "ix_alert_tenant_fingerprint_timestamp",
            "tenant_id",
            "fingerprint",
            "timestamp",
        ),
        Index("idx_fingerprint_timestamp", "fingerprint", "timestamp"),
        Index(
            "idx_alert_tenant_timestamp_fingerprint",
            "tenant_id",
            "timestamp",
            "fingerprint",
        ),
        # Index to optimize linked provider queries (is_linked_provider function)
        # These queries look for alerts with specific tenant_id and provider_id combinations
        # where the provider doesn't exist in the provider table
        # Without this index, the query scans 400k+ rows and takes ~2s
        # With this index, the query takes ~0.4s
        Index(
            "idx_alert_tenant_provider",
            "tenant_id",
            "provider_id",
        ),
    )

    class Config:
        arbitrary_types_allowed = True
class AlertDeduplicationRule(SQLModel, table=True):
    """A per-tenant deduplication rule.

    Lists the fields used to build the fingerprint (``fingerprint_fields``)
    and the fields to ignore (``ignore_fields``), both stored as JSON.
    """

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id")
    name: str = Field(index=True)
    description: str
    provider_id: str | None = Field(default=None)  # None for default rules
    provider_type: str
    # audit columns; datetime.utcnow returns a naive datetime
    last_updated: datetime = Field(default_factory=datetime.utcnow)
    last_updated_by: str
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str
    enabled: bool = Field(default=True)
    fingerprint_fields: list[str] = Field(sa_column=Column(JSON), default=[])
    full_deduplication: bool = Field(default=False)
    ignore_fields: list[str] = Field(sa_column=Column(JSON), default=[])
    priority: int = Field(default=0)  # for future use
    is_provisioned: bool = Field(default=False)

    class Config:
        arbitrary_types_allowed = True
class AlertRaw(SQLModel, table=True):
    """A raw alert payload stored as JSON for a tenant.

    Also carries an error flag with an optional message, and dismissal
    bookkeeping (``dismissed``, ``dismissed_at``, ``dismissed_by``).
    """

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id", index=True)
    raw_alert: dict = Field(sa_column=Column(JSON))
    # datetime.utcnow returns a naive datetime
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    provider_type: str | None = Field(default=None)
    error: bool = Field(default=False, index=True)
    error_message: str | None = Field(default=None)
    dismissed: bool = Field(default=False)
    dismissed_at: datetime | None = Field(default=None)
    dismissed_by: str | None = Field(default=None)

    # composite indexes over (tenant_id, error) and (tenant_id, timestamp)
    __table_args__ = (
        Index("ix_alert_raw_tenant_id_error", "tenant_id", "error"),
        Index("ix_alert_raw_tenant_id_timestamp", "tenant_id", "timestamp"),
    )

    class Config:
        arbitrary_types_allowed = True
Field(default_factory=datetime.utcnow, nullable=False) # who user_id: str = Field(nullable=False) # what action: str = Field(nullable=False) description: str = Field(sa_column=Column(TEXT)) mentions: list["CommentMention"] = Relationship( back_populates="alert_audit", sa_relationship_kwargs={"lazy": "selectin"} ) __table_args__ = ( Index("ix_alert_audit_tenant_id", "tenant_id"), Index("ix_alert_audit_fingerprint", "fingerprint"), Index("ix_alert_audit_tenant_id_fingerprint", "tenant_id", "fingerprint"), Index("ix_alert_audit_timestamp", "timestamp"), ) class CommentMention(SQLModel, table=True): """Many-to-many relationship table for users mentioned in comments.""" id: UUID = Field(default_factory=uuid4, primary_key=True) comment_id: UUID = Field( sa_column=Column( UUIDType(binary=False), ForeignKey("alertaudit.id", ondelete="CASCADE"), nullable=False ) ) mentioned_user_id: str = Field(nullable=False) tenant_id: str = Field(foreign_key="tenant.id", nullable=False) created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) alert_audit: AlertAudit = Relationship( back_populates="mentions", sa_relationship_kwargs={"lazy": "selectin"} ) __table_args__ = ( Index("ix_comment_mention_comment_id", "comment_id"), Index("ix_comment_mention_mentioned_user_id", "mentioned_user_id"), Index("ix_comment_mention_tenant_id", "tenant_id"), UniqueConstraint("comment_id", "mentioned_user_id", name="uq_comment_mention"), ) ================================================ FILE: keep/api/models/db/dashboard.py ================================================ from datetime import datetime from uuid import uuid4 from sqlalchemy import UniqueConstraint from sqlalchemy.dialects.postgresql import JSON from sqlmodel import Column, Field, SQLModel class Dashboard(SQLModel, table=True): id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) tenant_id: str = Field(foreign_key="tenant.id") dashboard_name: str = Field(index=True) # Index for faster uniqueness 
class EnrichmentEvent(SQLModel, table=True):
    """One enrichment attempt (mapping or extraction) against an alert.

    ``enriched_fields`` holds the resulting fields as JSON. ``date_hour`` is
    the creation time truncated to the hour and is indexed together with
    ``tenant_id``.
    """

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id", index=True)
    timestamp: datetime = Field(
        sa_column=Column(DATETIME_COLUMN_TYPE, nullable=False),
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
    # Declared once, at its original (first) position, so field/column order
    # is unchanged; an identical second declaration used to follow alert_id.
    enriched_fields: dict = Field(sa_column=Column(JSON), default_factory=dict)
    status: str
    enrichment_type: str = Field()  # 'mapping' or 'extraction'
    rule_id: int | None = Field(default=None)  # ID of the mapping/extraction rule
    alert_id: UUID = Field(
        sa_column=Column(
            UUIDType(binary=False),
            nullable=False,
        )
    )
    date_hour: datetime = Field(
        sa_column=Column(DATETIME_COLUMN_TYPE),
        default_factory=lambda: datetime.now(tz=timezone.utc).replace(
            minute=0, second=0, microsecond=0
        ),
    )

    __table_args__ = (
        Index(
            "ix_enrichment_event_status",
            "status",
        ),
        Index(
            "ix_enrichment_event_tenant_id_date_hour",
            "tenant_id",
            "date_hour",
        ),
        Index(
            "ix_enrichment_event_alert_id",
            "alert_id",
        ),
        Index(
            "ix_enrichment_event_rule_id",
            "rule_id",
        ),
    )

    class Config:
        arbitrary_types_allowed = True
class Facet(SQLModel, table=True):
    """A facet definition for a tenant.

    Records the entity type it applies to, the property path it reads, its
    type, a display name, and who created it and when.
    """

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    entity_type: str = Field(nullable=False, max_length=50)
    property_path: str = Field(nullable=False, max_length=255)
    type: str = Field(nullable=False)
    name: str = Field(max_length=255, nullable=False)
    description: Optional[str] = Field(max_length=2048)
    tenant_id: str = Field(foreign_key="tenant.id", nullable=False)
    # when (datetime.utcnow returns a naive datetime)
    timestamp: datetime = Field(default_factory=datetime.utcnow, nullable=False)
    # who
    user_id: str = Field(nullable=False)

    __table_args__ = (
        Index("ix_facet_tenant_id", "tenant_id"),  # we need to be able to query facets by tenant_id quickly
        Index("ix_entity_type", "entity_type"),  # we need to be able to query facets by entity_type quickly
    )
logger = logging.getLogger(__name__)

# We want to include the deleted_at field in the primary key,
# but we also want to allow it to be nullable. MySQL doesn't allow nullable fields in primary keys, so:
NULL_FOR_DELETED_AT = datetime(1000, 1, 1, 0, 0)

# Read once at import time; None when DATABASE_CONNECTION_STRING is not configured.
DB_CONNECTION_STRING = config("DATABASE_CONNECTION_STRING", default=None)

# Pick the SQLAlchemy datetime column type used by the models, based on the
# deployment flavour / database dialect. This runs as an import-time side effect.
# managed (mysql)
if RUNNING_IN_CLOUD_RUN or DB_CONNECTION_STRING == "impersonate":
    # Millisecond precision
    DATETIME_COLUMN_TYPE = MySQL_DATETIME(fsp=3)
# self hosted (mysql, sql server, sqlite / postgres)
else:
    try:
        # NOTE(review): when the connection string is unset this is called with
        # None; presumably that raises and lands in the fallback below -- confirm.
        url = make_url(DB_CONNECTION_STRING)
        dialect = url.get_dialect().name
        if dialect == "mssql":
            # Millisecond precision
            DATETIME_COLUMN_TYPE = MSSQL_DATETIME2(precision=3)
        elif dialect == "mysql":
            # Millisecond precision
            DATETIME_COLUMN_TYPE = MySQL_DATETIME(fsp=3)
        else:
            # Any other dialect uses the generic DateTime type.
            DATETIME_COLUMN_TYPE = DateTime
    except Exception:
        # Best effort: any failure while parsing the URL or resolving the
        # dialect is logged and the generic type is used instead.
        logger.warning(
            "Could not determine the database dialect, falling back to default datetime column type"
        )
        # give it a default
        DATETIME_COLUMN_TYPE = DateTime
class IncidentSeverity(SeverityBaseInterface):
    # Each member is declared as a (name, order) pair; `order` is the numeric
    # rank that `from_number` matches against (CRITICAL is the highest, 5).
    CRITICAL = ("critical", 5)
    HIGH = ("high", 4)
    WARNING = ("warning", 3)
    INFO = ("info", 2)
    LOW = ("low", 1)

    @staticmethod
    def from_number(n):
        """Return the severity whose ``order`` equals ``n``.

        Declared as a static method so it can be called both on the class
        (``IncidentSeverity.from_number(3)``) and on a member; without the
        decorator a call on a member would pass the member as ``n`` and fail
        with a TypeError.

        Args:
            n: numeric order to look up.

        Returns:
            The matching ``IncidentSeverity`` member.

        Raises:
            ValueError: if no member has the given order.
        """
        for severity in IncidentSeverity:
            if severity.order == n:
                return severity
        raise ValueError(f"No IncidentSeverity with order {n}")
Field(default_factory=datetime.utcnow) # Start/end should be calculated from first/last alerts # But I suppose to have this fields as cache, to prevent extra requests start_time: datetime | None end_time: datetime | None last_seen_time: datetime | None is_predicted: bool = Field(default=False) is_candidate: bool = Field(default=False) is_visible: bool = Field(default=True) alerts_count: int = Field(default=0) affected_services: list = Field(sa_column=Column(JSON), default_factory=list) sources: list = Field(sa_column=Column(JSON), default_factory=list) rule_id: UUID | None = Field( sa_column=Column( UUIDType(binary=False), ForeignKey("rule.id", ondelete="CASCADE"), nullable=True, ), ) # Note: IT IS NOT A UNIQUE IDENTIFIER (as in alerts) rule_fingerprint: str = Field(default="", sa_column=Column(TEXT)) # This is the fingerprint of the incident generated by the underlying tool # It's not a unique identifier in the DB (constraint), but when we have the same incident from some tools, we can use it to detect duplicates fingerprint: str | None = Field(default=None, sa_column=Column(TEXT)) incident_type: str = Field(default=IncidentType.MANUAL.value) # for topology incidents incident_application: UUID | None = Field(default=None) resolve_on: str = ResolveOn.ALL.value same_incident_in_the_past_id: UUID | None = Field( sa_column=Column( UUIDType(binary=False), ForeignKey("incident.id", ondelete="SET NULL"), nullable=True, ), ) same_incident_in_the_past: Optional["Incident"] = Relationship( back_populates="same_incidents_in_the_future", sa_relationship_kwargs=dict( remote_side="Incident.id", foreign_keys="[Incident.same_incident_in_the_past_id]", ), ) same_incidents_in_the_future: List["Incident"] = Relationship( back_populates="same_incident_in_the_past", sa_relationship_kwargs=dict( foreign_keys="[Incident.same_incident_in_the_past_id]", ), ) merged_into_incident_id: UUID | None = Field( sa_column=Column( UUIDType(binary=False), ForeignKey("incident.id", ondelete="SET 
@retry(exceptions=(IntegrityError,), tries=3, delay=0.1, backoff=2, jitter=(0, 0.1))
def get_next_running_number(session, tenant_id: str) -> int:
    """Compute the running number to assign to a tenant's next incident.

    Looks up the highest ``running_number`` stored for ``tenant_id`` and
    returns it plus one, or 1 when the tenant has no numbered incident yet.
    On ``IntegrityError`` the session is rolled back and expired before the
    error is re-raised, which lets the ``retry`` decorator try again.
    """
    max_number_query = select(func.max(Incident.running_number)).where(
        Incident.tenant_id == tenant_id
    )
    try:
        current_max = session.exec(max_number_query).first()
    except IntegrityError:
        session.rollback()
        # Drop cached state so the next attempt re-reads from the database
        session.expire_all()
        raise

    # No (or zero) maximum means this is the tenant's first numbered incident
    if not current_max:
        return 1
    return current_max + 1
# Table model for a tenant's maintenance-window rule.
class MaintenanceWindowRule(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    name: str
    tenant_id: str = Field(foreign_key="tenant.id")
    description: Optional[str] = None
    created_by: str
    # NOTE(review): presumably a CEL expression selecting the alerts the window
    # applies to -- the evaluation code is not visible here; confirm.
    cel_query: str
    start_time: datetime
    end_time: datetime
    duration_seconds: Optional[int] = None
    # Timezone-aware timestamp maintained by the database: defaults to now()
    # on insert and is refreshed to now() on every update.
    updated_at: Optional[datetime] = Field(
        sa_column=Column(
            DateTime(timezone=True),
            name="updated_at",
            onupdate=func.now(),
            server_default=func.now(),
        )
    )
    suppress: bool = False
    enabled: bool = True
    # Stored as a JSON column; defaults to an empty list.
    ignore_statuses: list = Field(sa_column=Column(JSON), default_factory=list)

    # Indexes for lookups by tenant, and by tenant together with end_time.
    __table_args__ = (
        Index("ix_maintenance_rule_tenant_id", "tenant_id"),
        Index("ix_maintenance_rule_tenant_id_end_time", "tenant_id", "end_time"),
    )
# Read DTO for a maintenance-window rule; its fields mirror the
# MaintenanceWindowRule table model except for tenant_id, which is not exposed.
class MaintenanceRuleRead(BaseModel):
    id: int
    name: str
    description: Optional[str]
    created_by: str
    cel_query: str
    start_time: datetime
    end_time: datetime
    duration_seconds: Optional[int]
    updated_at: Optional[datetime]
    suppress: bool = False
    enabled: bool = True
    # Defaults to the module-level list of resolved + acknowledged statuses.
    ignore_statuses: list[str] = DEFAULT_ALERT_STATUSES_TO_IGNORE
class MappRuleDtoBase(BaseModel):
    # Fields shared by the mapping-rule DTOs (in, update and out variants).
    name: str
    description: Optional[str] = None
    file_name: Optional[str] = None
    priority: int = 0
    # Groups of attribute names to match against, e.g. [["service", "region"], ["pod"]].
    matchers: list[list[str]]
    type: Literal["csv", "topology"] = "csv"
    is_multi_level: bool = False
    new_property_name: Optional[str] = None
    prefix_to_remove: Optional[str] = None

    # Validators only see fields declared BEFORE the one being validated in
    # `values`. `matchers` is declared before `is_multi_level`, so the
    # single-matcher-group rule has to be checked while validating
    # `is_multi_level` (when `matchers` is already available) -- checking it
    # from a `matchers` validator can never see the flag.
    @validator("is_multi_level")
    def validate_multi_level_matchers(cls, v, values):
        # `matchers` is missing from `values` when it failed its own validation;
        # in that case its error is reported and there is nothing to check here.
        if v and len(values.get("matchers") or []) > 1:
            raise ValueError("Multi-level mapping can only have one matcher group")
        return v

    # always=True: the rule must also fire when new_property_name is omitted
    # from the payload, otherwise the default (None) skips validation entirely.
    @validator("new_property_name", always=True)
    def validate_new_property_name(cls, v, values):
        if values.get("is_multi_level") and not v:
            raise ValueError(
                "new_property_name is required when is_multi_level is True"
            )
        return v
import asyncio import os from logging.config import fileConfig from alembic import context from alembic.script import ScriptDirectory from sqlalchemy.future import Connection from sqlmodel import SQLModel import keep.api.logging from keep.api.core.db_utils import create_db_engine from keep.api.models.db.action import * from keep.api.models.db.ai_suggestion import * from keep.api.models.db.alert import * from keep.api.models.db.dashboard import * from keep.api.models.db.extraction import * from keep.api.models.db.facet import * from keep.api.models.db.maintenance_window import * from keep.api.models.db.mapping import * from keep.api.models.db.preset import * from keep.api.models.db.provider import * from keep.api.models.db.secret import * from keep.api.models.db.rule import * from keep.api.models.db.statistics import * from keep.api.models.db.tenant import * from keep.api.models.db.topology import * from keep.api.models.db.user import * from keep.api.models.db.workflow import * target_metadata = SQLModel.metadata # this is the Alembic Config object, which provides # access to the values within the .ini file in use. config = context.config # Interpret the config file for Python logging. # This line sets up loggers basically. if config.config_file_name is not None: # backup the current config logging_config = config.get_section("loggers") fileConfig(config.config_file_name) async def run_migrations_offline() -> None: """Run migrations in 'offline' mode. This configures the context with just a URL and not an Engine, though an Engine is acceptable here as well. By skipping the Engine creation we don't even need a DBAPI to be available. Calls to context.execute() here emit the given string to the script output. 
async def run_migrations_online() -> None:
    """
    Run migrations in 'online' mode.

    Creates the database engine, opens a connection and runs the migrations
    on it. The connection is always closed when the run finishes, whether it
    succeeded or failed.

    Raises:
        Exception: whatever the connection attempt or the migration run
            raised; before re-raising, the known migrations are printed to
            help identify the failing one.
    """
    connectable = create_db_engine()
    try:
        # Context manager guarantees the connection is released even on failure
        with connectable.connect() as connection:
            do_run_migrations(connection)
    except Exception:
        # print all migrations so we will know what failed
        list_migrations(connectable)
        # bare raise keeps the original traceback intact
        raise
""" try: # Get the script directory from the alembic context script_directory = ScriptDirectory.from_config(config) current_rev = script_directory.get_current_head() # List all available migrations pid = os.getpid() print(f"[{pid}] Available migrations:") try: for script in script_directory.walk_revisions(): status = ( "PENDING" if current_rev and script.revision > current_rev else "APPLIED" ) print(f" - {script.revision}: {script.doc} ({status})") except Exception as exc: logger.exception(f"Failed to list migrations: {exc}") except Exception as exc: logger.exception(f"Failed to process migration information: {exc}") loop = asyncio.get_event_loop() if context.is_offline_mode(): task = run_migrations_offline() else: task = run_migrations_online() loop.run_until_complete(task) # SHAHAR: set back the logs to the default after alembic is done keep.api.logging.setup_logging() ================================================ FILE: keep/api/models/db/migrations/script.py.mako ================================================ """${message} Revision ID: ${up_revision} Revises: ${down_revision | comma,n} Create Date: ${create_date} """ from alembic import op import sqlalchemy as sa import sqlmodel import sqlalchemy_utils ${imports if imports else ""} # revision identifiers, used by Alembic. revision = ${repr(up_revision)} down_revision = ${repr(down_revision)} branch_labels = ${repr(branch_labels)} depends_on = ${repr(depends_on)} def upgrade() -> None: ${upgrades if upgrades else "pass"} def downgrade() -> None: ${downgrades if downgrades else "pass"} ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-11-17-10_54c1252b2c8a.py ================================================ """First migration Revision ID: 54c1252b2c8a Revises: Create Date: 2024-07-11 17:10:10.815182 """ import logging import sqlalchemy as sa import sqlalchemy_utils import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "54c1252b2c8a" down_revision = None branch_labels = None depends_on = None logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) def _upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "tenant", sa.Column("configuration", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.PrimaryKeyConstraint("id"), ) op.create_table( "user", sa.Column("id", sa.Integer(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("username", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("password_hash", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("role", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("last_sign_in", sa.DateTime(), nullable=True), sa.Column("created_at", sa.DateTime(), nullable=False), sa.PrimaryKeyConstraint("id"), ) op.create_index(op.f("ix_user_username"), "user", ["username"], unique=True) op.create_table( "action", sa.Column("action_raw", sa.TEXT(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("use", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("installed_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("installation_time", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("tenant_id", "name", "use"), ) op.create_table( "alert", sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("event", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), 
nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("provider_type", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("alert_hash", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_index( op.f("ix_alert_fingerprint"), "alert", ["fingerprint"], unique=False ) op.create_index(op.f("ix_alert_timestamp"), "alert", ["timestamp"], unique=False) op.create_table( "alertdeduplicationfilter", sa.Column("fields", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("matcher_cel", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "alertenrichment", sa.Column("enrichments", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column( "alert_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("alert_fingerprint"), ) op.create_table( "alertraw", sa.Column("raw_alert", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "dashboard", sa.Column( "dashboard_config", sqlmodel.sql.sqltypes.AutoString(), 
nullable=True ), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("dashboard_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("updated_by", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("updated_at", sa.DateTime(), nullable=False), sa.Column("is_active", sa.Boolean(), nullable=False), sa.Column("is_private", sa.Boolean(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint( "tenant_id", "dashboard_name", name="unique_dashboard_name_per_tenant" ), ) op.create_index( op.f("ix_dashboard_dashboard_name"), "dashboard", ["dashboard_name"], unique=False, ) op.create_table( "extractionrule", sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=True, ), sa.Column("id", sa.Integer(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("priority", sa.Integer(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), sa.Column( "description", sqlmodel.sql.sqltypes.AutoString(length=2048), nullable=True ), sa.Column( "created_by", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=True ), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column( "updated_by", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=True ), sa.Column("disabled", sa.Boolean(), nullable=False), sa.Column("pre", sa.Boolean(), nullable=False), sa.Column( "condition", sqlmodel.sql.sqltypes.AutoString(length=2000), nullable=True ), sa.Column( "attribute", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False ), sa.Column( "regex", sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=False ), 
sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "mappingrule", sa.Column("matchers", sa.JSON(), nullable=True), sa.Column("rows", sa.JSON(), nullable=True), sa.Column("id", sa.Integer(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("priority", sa.Integer(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), sa.Column( "description", sqlmodel.sql.sqltypes.AutoString(length=2048), nullable=True ), sa.Column( "file_name", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=True ), sa.Column( "created_by", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=True ), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("disabled", sa.Boolean(), nullable=False), sa.Column("override", sa.Boolean(), nullable=False), sa.Column( "condition", sqlmodel.sql.sqltypes.AutoString(length=2000), nullable=True ), sa.Column( "updated_by", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=True ), sa.Column("last_updated_at", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "preset", sa.Column("options", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("is_private", sa.Boolean(), nullable=True), sa.Column("is_noisy", sa.Boolean(), nullable=True), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("name"), sa.UniqueConstraint("tenant_id", "name"), ) op.create_index( op.f("ix_preset_created_by"), "preset", ["created_by"], unique=False ) op.create_index(op.f("ix_preset_tenant_id"), 
"preset", ["tenant_id"], unique=False) op.create_table( "provider", sa.Column("validatedScopes", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("type", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("installed_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("installation_time", sa.DateTime(), nullable=False), sa.Column( "configuration_key", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("consumer", sa.Boolean(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("tenant_id", "name"), ) op.create_table( "rule", sa.Column("definition", sa.JSON(), nullable=True), sa.Column("grouping_criteria", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("definition_cel", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timeframe", sa.Integer(), nullable=False), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("creation_time", sa.DateTime(), nullable=False), sa.Column("updated_by", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("update_time", sa.DateTime(), nullable=True), sa.Column( "group_description", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), sa.Column( "item_description", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "tenantapikey", sa.Column("tenant_id", 
sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("reference_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("key_hash", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("is_system", sa.Boolean(), nullable=False), sa.Column("is_deleted", sa.Boolean(), nullable=False), sa.Column( "system_description", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("role", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("last_used", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.UniqueConstraint("tenant_id", "reference_id"), sa.PrimaryKeyConstraint("key_hash"), ) op.create_table( "tenantinstallation", sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("bot_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("installed", sa.Boolean(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "workflow", sa.Column("name", sa.TEXT(), nullable=True), sa.Column("created_by", sa.TEXT(), nullable=True), sa.Column("workflow_raw", sa.TEXT(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("updated_by", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("creation_time", sa.DateTime(), nullable=False), sa.Column("interval", sa.Integer(), nullable=True), sa.Column("is_deleted", sa.Boolean(), nullable=False), sa.Column("revision", sa.Integer(), nullable=False), sa.Column("last_updated", sa.DateTime(), nullable=False), 
sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "group", sa.Column( "rule_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True ), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("creation_time", sa.DateTime(), nullable=False), sa.Column( "group_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.ForeignKeyConstraint(["rule_id"], ["rule.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "workflowexecution", sa.Column("triggered_by", sa.TEXT(), nullable=True), sa.Column("status", sa.TEXT(), nullable=True), sa.Column("results", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("workflow_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("started", sa.DateTime(), nullable=False), sa.Column("is_running", sa.Integer(), nullable=False), sa.Column("timeslot", sa.Integer(), nullable=False), sa.Column("execution_number", sa.Integer(), nullable=False), sa.Column( "error", sqlmodel.sql.sqltypes.AutoString(length=10240), nullable=True ), sa.Column("execution_time", sa.Integer(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.ForeignKeyConstraint( ["workflow_id"], ["workflow.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint( "workflow_id", "execution_number", "is_running", "timeslot" ), ) op.create_table( "alerttogroup", sa.Column( "group_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False, ), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("alert_id", sqlmodel.sql.sqltypes.types.Uuid(), 
nullable=False), sa.ForeignKeyConstraint( ["alert_id"], ["alert.id"], ), sa.ForeignKeyConstraint(["group_id"], ["group.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("group_id", "alert_id"), ) op.create_table( "workflowexecutionlog", sa.Column("message", sa.TEXT(), nullable=True), sa.Column("context", sa.JSON(), nullable=True), sa.Column("id", sa.Integer(), nullable=False), sa.Column( "workflow_execution_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["workflow_execution_id"], ["workflowexecution.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "workflowtoalertexecution", sa.Column("id", sa.Integer(), nullable=False), sa.Column( "workflow_execution_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column( "alert_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("event_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["workflow_execution_id"], ["workflowexecution.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("workflow_execution_id", "alert_fingerprint"), ) # ### end Alembic commands ### def upgrade() -> None: """ This migration is special because it creates the tables from scratch, and should tolerate the case where the tables already exist. """ try: _upgrade() except Exception as e: if "already exists" in str(e): logging.warning(str(e)) logging.warning( "Table already exists, which most likely means that tables has already been created before the migration mechanism was introduced. It's ok!" ) else: raise e def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.drop_table("workflowtoalertexecution") op.drop_table("workflowexecutionlog") op.drop_table("alerttogroup") op.drop_table("workflowexecution") op.drop_table("group") op.drop_table("workflow") op.drop_table("tenantinstallation") op.drop_table("tenantapikey") op.drop_table("rule") op.drop_table("provider") op.drop_index(op.f("ix_preset_tenant_id"), table_name="preset") op.drop_index(op.f("ix_preset_created_by"), table_name="preset") op.drop_table("preset") op.drop_table("mappingrule") op.drop_table("extractionrule") op.drop_index(op.f("ix_dashboard_dashboard_name"), table_name="dashboard") op.drop_table("dashboard") op.drop_table("alertraw") op.drop_table("alertenrichment") op.drop_table("alertdeduplicationfilter") op.drop_index(op.f("ix_alert_timestamp"), table_name="alert") op.drop_index(op.f("ix_alert_fingerprint"), table_name="alert") op.drop_table("alert") op.drop_table("action") op.drop_index(op.f("ix_user_username"), table_name="user") op.drop_table("user") op.drop_table("tenant") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-15-15-10_c37ec8f6db3e.py ================================================ """Adding alertaudit table Revision ID: c37ec8f6db3e Revises: 54c1252b2c8a Create Date: 2024-07-15 15:10:51.175030 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "c37ec8f6db3e" down_revision = "54c1252b2c8a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "alertaudit", sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("user_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("action", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sa.Text(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_index( "ix_alert_audit_fingerprint", "alertaudit", ["fingerprint"], unique=False ) op.create_index( "ix_alert_audit_tenant_id", "alertaudit", ["tenant_id"], unique=False ) op.create_index( "ix_alert_audit_tenant_id_fingerprint", "alertaudit", ["tenant_id", "fingerprint"], unique=False, ) op.create_index( "ix_alert_audit_timestamp", "alertaudit", ["timestamp"], unique=False ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_index("ix_alert_audit_timestamp", table_name="alertaudit") op.drop_index("ix_alert_audit_tenant_id_fingerprint", table_name="alertaudit") op.drop_index("ix_alert_audit_tenant_id", table_name="alertaudit") op.drop_index("ix_alert_audit_fingerprint", table_name="alertaudit") op.drop_table("alertaudit") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-16-12-16_37019ca3eb2e.py ================================================ """Incident related tables Revision ID: 37019ca3eb2e Revises: c37ec8f6db3e Create Date: 2024-07-16 12:16:01.837477 """ import sqlalchemy as sa import sqlmodel from alembic import op from sqlalchemy_utils import UUIDType # revision identifiers, used by Alembic. 
revision = "37019ca3eb2e" down_revision = "c37ec8f6db3e" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "incident", sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("assignee", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("creation_time", sa.DateTime(), nullable=False), sa.Column("start_time", sa.DateTime(), nullable=True), sa.Column("end_time", sa.DateTime(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "alerttoincident", sa.Column( "incident_id", UUIDType(binary=False), nullable=False, ), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("alert_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.ForeignKeyConstraint( ["alert_id"], ["alert.id"], ), sa.ForeignKeyConstraint(["incident_id"], ["incident.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("incident_id", "alert_id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.drop_table("alerttoincident") op.drop_table("incident") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-17-16-46_dcbd2873dcfd.py ================================================ """Add is_predicted and is_confirmed flags to Incident model Revision ID: dcbd2873dcfd Revises: 37019ca3eb2e Create Date: 2024-07-17 16:46:59.386127 """ import sqlalchemy as sa from alembic import op from sqlalchemy.sql import expression # revision identifiers, used by Alembic. revision = "dcbd2873dcfd" down_revision = "37019ca3eb2e" branch_labels = None depends_on = None def upgrade() -> None: op.add_column( "incident", sa.Column( "is_confirmed", sa.Boolean(), nullable=False, default=False, server_default=expression.false(), ), ) op.add_column( "incident", sa.Column( "is_predicted", sa.Boolean(), nullable=False, default=False, server_default=expression.false(), ), ) def downgrade() -> None: op.drop_column("incident", "is_confirmed") op.drop_column("incident", "is_predicted") ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-24-13-39_9ba0aeecd4d0.py ================================================ """For AI Revision ID: 9ba0aeecd4d0 Revises: dcbd2873dcfd Create Date: 2024-07-24 13:39:10.576538 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "9ba0aeecd4d0" down_revision = "dcbd2873dcfd" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "pmimatrix", sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("fingerprint_i", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("fingerprint_j", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("pmi", sa.Float(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("fingerprint_i", "fingerprint_j"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_table("pmimatrix") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-25-17-13_67f1efb93c99.py ================================================ """Add fields for prepopulated data from alerts Revision ID: 67f1efb93c99 Revises: dcbd2873dcfd Create Date: 2024-07-25 17:13:04.428633 """ import warnings import sqlalchemy as sa from alembic import op from pydantic import BaseModel from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import Session from sqlalchemy import exc as sa_exc # revision identifiers, used by Alembic. revision = "67f1efb93c99" down_revision = "9ba0aeecd4d0" branch_labels = None depends_on = None # Define a completely separate metadata for the migration migration_metadata = sa.MetaData() # Direct table definition for AlertToIncident alert_to_incident_table = sa.Table( 'alerttoincident', migration_metadata, sa.Column('alert_id', UUID(as_uuid=False), sa.ForeignKey('alert.id', ondelete='CASCADE'), primary_key=True), sa.Column('incident_id', UUID(as_uuid=False), sa.ForeignKey('incident.id', ondelete='CASCADE'), primary_key=True) ) # The following code will shoow SA warning about dialect, so we suppress it. 
with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sa_exc.SAWarning) # Direct table definition for Incident incident_table = sa.Table( 'incident', migration_metadata, sa.Column('id', UUID(as_uuid=False), primary_key=True), sa.Column('alerts_count', sa.Integer, default=0), sa.Column('affected_services', sa.JSON, default_factory=list), sa.Column('sources', sa.JSON, default_factory=list) ) # Direct table definition for Alert alert_table = sa.Table( 'alert', migration_metadata, sa.Column('id', UUID(as_uuid=False), primary_key=True), sa.Column('provider_type', sa.String), sa.Column('event', sa.JSON) ) class AlertDtoLocal(BaseModel): service: str | None = None source: list[str] | None = [] def populate_db(): session = Session(op.get_bind()) incidents = session.execute(sa.select(incident_table)).fetchall() for incident in incidents: stmt = ( sa.select(alert_table).select_from(alert_table) .join(alert_to_incident_table, alert_table.c.id == alert_to_incident_table.c.alert_id) .where(alert_to_incident_table.c.incident_id == str(incident.id)) ) alerts = session.execute(stmt).all() alerts_dto = [AlertDtoLocal(**alert.event) for alert in alerts] stmt = ( sa.update(incident_table).where(incident_table.c.id == incident.id).values( sources=list(set([source for alert_dto in alerts_dto for source in alert_dto.source])), affected_services=list(set([alert.service for alert in alerts_dto if alert.service is not None])), alerts_count=len(alerts) ) ) session.execute(stmt) session.commit() def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column("incident", sa.Column("affected_services", sa.JSON(), nullable=True)) op.add_column("incident", sa.Column("sources", sa.JSON(), nullable=True)) op.add_column("incident", sa.Column("alerts_count", sa.Integer(), nullable=False, server_default="0")) populate_db() # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.drop_column("incident", "alerts_count") op.drop_column("incident", "sources") op.drop_column("incident", "affected_services") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-28-16-24_8e5942040de6.py ================================================ """Summaries added Revision ID: 8e5942040de6 Revises: 9ba0aeecd4d0 Create Date: 2024-07-28 16:24:58.364281 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "8e5942040de6" down_revision = "67f1efb93c99" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column( "incident", sa.Column("user_summary", sqlmodel.sql.sqltypes.AutoString(), nullable=True), ) op.add_column( "incident", sa.Column( "generated_summary", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_column("incident", "generated_summary") op.drop_column("incident", "user_summary") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-29-12-51_c91b348b94f2.py ================================================ """Description replaced w/ user_summary Revision ID: c91b348b94f2 Revises: 8e5942040de6 Create Date: 2024-07-29 12:51:24.496126 """ import sqlalchemy as sa from alembic import op from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import Session # revision identifiers, used by Alembic. 
revision = "c91b348b94f2" down_revision = "8e5942040de6" branch_labels = None depends_on = None # Define a completely separate metadata for the migration migration_metadata = sa.MetaData() # Direct table definition for Incident incident_table = sa.Table( "incident", migration_metadata, sa.Column("id", UUID(as_uuid=False), primary_key=True), sa.Column("description", sa.String), sa.Column("user_summary", sa.String), ) def populate_db(session): # we need to populate the user_summary field with the description session.execute( sa.update(incident_table).values(user_summary=incident_table.c.description) ) session.commit() def depopulate_db(session): # we need to populate the description field with the user_summary session.execute( sa.update(incident_table).values(description=incident_table.c.user_summary) ) session.commit() def upgrade() -> None: # First ensure data is copied session = Session(op.get_bind()) populate_db(session) # Then drop the column using batch_alter_table for SQLite compatibility with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_column("description") def downgrade() -> None: # First add the description column back with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "description", sa.VARCHAR(), nullable=False, server_default="" ) ) # Copy the data from user_summary to description session = Session(op.get_bind()) depopulate_db(session) # Finally drop the user_summary column with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_column("user_summary") ================================================ FILE: keep/api/models/db/migrations/versions/2024-07-29-18-10_92f4f93f2140.py ================================================ """Topology Migrations Revision ID: 92f4f93f2140 Revises: dcbd2873dcfd Create Date: 2024-07-29 18:10:37.723465 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "92f4f93f2140" down_revision = "c91b348b94f2" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "topologyservice", sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("tags", sa.JSON(), nullable=True), sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=True, ), sa.Column("id", sa.Integer(), nullable=False), sa.Column( "source_provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("repository", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("service", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("environment", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("display_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("team", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("application", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("email", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("slack", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "topologyservicedependency", sa.Column("service_id", sa.Integer(), nullable=True), sa.Column("depends_on_service_id", sa.Integer(), nullable=True), sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=True, ), sa.Column("id", sa.Integer(), nullable=False), sa.Column("protocol", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["depends_on_service_id"], ["topologyservice.id"], ondelete="CASCADE" ), sa.ForeignKeyConstraint( ["service_id"], ["topologyservice.id"], ondelete="CASCADE" ), sa.PrimaryKeyConstraint("id"), ) # ### end Alembic 
commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_table("topologyservicedependency") op.drop_table("topologyservice") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-05-13-09_4147d9e706c0.py ================================================ """Provider last pull time Revision ID: 4147d9e706c0 Revises: 92f4f93f2140 Create Date: 2024-08-05 13:09:18.851721 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "4147d9e706c0" down_revision = "92f4f93f2140" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column("provider", sa.Column("last_pull_time", sa.DateTime(), nullable=True)) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_column("provider", "last_pull_time") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-11-17-38_9453855f3ba0.py ================================================ """Add tags Revision ID: 9453855f3ba0 Revises: 42098785763c Create Date: 2024-08-11 17:38:26.085168 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "9453855f3ba0" down_revision = "4147d9e706c0" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "tag", sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("name"), ) op.create_table( "presettaglink", sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("preset_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tag_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["preset_id"], ["preset.id"], ), sa.ForeignKeyConstraint( ["tag_id"], ["tag.id"], ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("tenant_id", "preset_id", "tag_id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_table("presettaglink") op.drop_table("tag") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-13-19-22_0832e0d9889a.py ================================================ """add last_seen_time field to incident Revision ID: 0832e0d9889a Revises: 005efc57cc1c Create Date: 2024-08-13 19:22:35.873850 """ import sqlalchemy as sa from alembic import op from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import Session # revision identifiers, used by Alembic. 
revision = "0832e0d9889a" down_revision = "9453855f3ba0" branch_labels = None depends_on = None # Define a completely separate metadata for the migration migration_metadata = sa.MetaData() # Direct table definition for AlertToIncident alert_to_incident_table = sa.Table( 'alerttoincident', migration_metadata, sa.Column('alert_id', UUID(as_uuid=False), sa.ForeignKey('alert.id', ondelete='CASCADE'), primary_key=True), sa.Column('incident_id', UUID(as_uuid=False), sa.ForeignKey('incident.id', ondelete='CASCADE'), primary_key=True) ) # Direct table definition for Incident incident_table = sa.Table( 'incident', migration_metadata, sa.Column('id', UUID(as_uuid=False), primary_key=True), sa.Column('start_time', sa.DateTime, nullable=True), sa.Column('last_seen_time', sa.DateTime, nullable=True), ) # Direct table definition for Alert alert_table = sa.Table( 'alert', migration_metadata, sa.Column('id', UUID(as_uuid=False), primary_key=True), sa.Column('timestamp', sa.DateTime), ) def populate_db(): session = Session(op.get_bind()) incidents = session.execute(sa.select(incident_table)).fetchall() for incident in incidents: stmt = ( sa.select([sa.func.min(alert_table.c.timestamp), sa.func.max(alert_table.c.timestamp)]) .select_from(alert_table) .join(alert_to_incident_table, alert_table.c.id == alert_to_incident_table.c.alert_id) .where(alert_to_incident_table.c.incident_id == str(incident.id)) ) started_at, last_seen_at = session.execute(stmt).one() stmt = ( sa.update(incident_table).where(incident_table.c.id == incident.id).values( start_time=started_at, last_seen_time=last_seen_at ) ) session.execute(stmt) session.commit() def upgrade() -> None: op.add_column("incident", sa.Column("last_seen_time", sa.DateTime(), nullable=True)) populate_db() def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.drop_column("incident", "last_seen_time") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-14-18-30_87594ea6d308.py ================================================ """add rules-related fields to the incident Revision ID: 87594ea6d308 Revises: 0832e0d9889a Create Date: 2024-08-14 18:30:09.052273 """ import sqlalchemy as sa import sqlalchemy_utils import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "87594ea6d308" down_revision = "0832e0d9889a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "rule_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True, ) ) batch_op.add_column( sa.Column( "rule_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False, default="", server_default="" ) ) batch_op.add_column( sa.Column("severity", sa.Integer(), nullable=False, server_default=sa.text("(5)"), default=5) ) batch_op.create_foreign_key( "incident_rule_id_fk", "rule", ["rule_id"], ["id"], ondelete="CASCADE" ) with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column( sa.Column( "require_approve", sa.Boolean(), nullable=False, server_default=sa.text("(FALSE)"), ) ) # op.drop_table("alerttogroup") # op.drop_table("group") with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.add_column(sa.Column("timestamp", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp())) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_constraint("incident_rule_id_fk", type_="foreignkey") batch_op.drop_column("rule_fingerprint") batch_op.drop_column("rule_id") batch_op.drop_column("severity") with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("require_approve") with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.drop_column("timestamp") op.create_table( "group", sa.Column("rule_id", sa.VARCHAR(length=32), nullable=True), sa.Column("id", sa.VARCHAR(length=32), nullable=False), sa.Column("tenant_id", sa.VARCHAR(length=32), nullable=False), sa.Column("creation_time", sa.DATETIME(), nullable=False), sa.Column("group_fingerprint", sa.VARCHAR(length=32), nullable=False), sa.ForeignKeyConstraint(["rule_id"], ["rule.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "alerttogroup", sa.Column("group_id", sa.CHAR(length=32), nullable=False), sa.Column("tenant_id", sa.VARCHAR(length=32), nullable=False), sa.Column("timestamp", sa.DATETIME(), nullable=False), sa.Column("alert_id", sa.VARCHAR(length=32), nullable=False), sa.ForeignKeyConstraint( ["alert_id"], ["alert.id"], ), sa.ForeignKeyConstraint(["group_id"], ["group.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("group_id", "alert_id"), ) ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-25-16-40_4ef2c767664c.py ================================================ """alter rule_fingerprint to text Revision ID: 4ef2c767664c Revises: 87594ea6d308 Create Date: 2024-08-25 16:40:38.661553 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. 
revision = "4ef2c767664c" down_revision = "87594ea6d308" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.alter_column( "rule_fingerprint", existing_type=sa.VARCHAR(), type_=sa.TEXT(), nullable=True, existing_server_default=sa.text("('')"), ) def downgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.alter_column( "rule_fingerprint", existing_type=sa.TEXT(), type_=sa.VARCHAR(), nullable=False, existing_server_default=sa.text("('')"), ) ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-25-16-48_1c650a429672.py ================================================ """Modify summary column types Revision ID: 1c650a429672 Revises: 4ef2c767664c Create Date: 2024-08-25 16:08:06.271696 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "1c650a429672" down_revision = "4ef2c767664c" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.alter_column( "user_summary", existing_type=sa.VARCHAR(), type_=sa.TEXT(), existing_nullable=True, ) batch_op.alter_column( "generated_summary", existing_type=sa.VARCHAR(), type_=sa.TEXT(), existing_nullable=True, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.alter_column( "generated_summary", existing_type=sa.TEXT(), type_=sa.VARCHAR(), existing_nullable=True, ) batch_op.alter_column( "user_summary", existing_type=sa.TEXT(), type_=sa.VARCHAR(), existing_nullable=True, ) # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-08-30-09-34_7ed12220a0d3.py ================================================ """Added is_disabled to workflows Revision ID: 7ed12220a0d3 Revises: 1c650a429672 Create Date: 2024-08-30 09:34:41.782797 """ import sqlalchemy as sa import yaml from alembic import op from keep.parser.parser import Parser # revision identifiers, used by Alembic. revision = "7ed12220a0d3" down_revision = "1c650a429672" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("workflow", schema=None) as batch_op: batch_op.add_column(sa.Column("is_disabled", sa.Boolean(), nullable=False, server_default=sa.false())) connection = op.get_bind() workflows = connection.execute(sa.text("SELECT id, workflow_raw FROM workflow")).fetchall() updates = [] for workflow in workflows: try: workflow_yaml = yaml.safe_load(workflow.workflow_raw) # If, by any chance, the existing workflow YAML's "disabled" value resolves to true, # we need to update the database to set `is_disabled` to `True` if Parser.parse_disabled(workflow_yaml): updates.append({ 'id': workflow.id, 'is_disabled': True }) except Exception as e: print(f"Failed to parse workflow_raw for workflow id {workflow.id}: {e}") continue if updates: connection.execute( sa.text( "UPDATE workflow SET is_disabled = :is_disabled WHERE id = :id" ), updates ) def downgrade() -> None: with op.batch_alter_table("workflow", schema=None) as batch_op: batch_op.drop_column("is_disabled") connection = op.get_bind() workflows = connection.execute(sa.text("SELECT id, workflow_raw FROM workflow")).fetchall() updates = [] 
for workflow in workflows: try: workflow_yaml = yaml.safe_load(workflow.workflow_raw) if 'disabled' in workflow_yaml: workflow_yaml.pop('disabled', None) updated_workflow_raw = yaml.safe_dump(workflow_yaml) updates.append({ 'id': workflow.id, 'workflow_raw': updated_workflow_raw }) except Exception as e: print(f"Failed to parse workflow_raw for workflow id {workflow.id}: {e}") continue if updates: connection.execute( sa.text( "UPDATE workflow SET workflow_raw = :workflow_raw WHERE id = :id" ), updates ) ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-01-14-04_94886bc59c11.py ================================================ """user_generated_name and ai_generated_name separation for incident model added Revision ID: 94886bc59c11 Revises: 1c650a429672 Create Date: 2024-09-01 14:04:52.407708 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "94886bc59c11" down_revision = "7ed12220a0d3" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column("user_generated_name", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.add_column( sa.Column( "ai_generated_name", sqlmodel.sql.sqltypes.AutoString(), nullable=True ) ) with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.execute(sa.text("UPDATE incident SET user_generated_name = name")) batch_op.drop_column("name") # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column(sa.Column("name", sa.VARCHAR(), nullable=False)) with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.execute(sa.text("UPDATE incident SET name = user_generated_name")) batch_op.drop_column("ai_generated_name") batch_op.drop_column("user_generated_name") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-02-12-07_70671c95028e.py ================================================ """Maintenance Windows Revision ID: 70671c95028e Revises: 94886bc59c11 Create Date: 2024-09-02 12:07:09.147349 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "70671c95028e" down_revision = "94886bc59c11" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "maintenancewindowrule", sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=True, ), sa.Column("id", sa.Integer(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("cel_query", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("start_time", sa.DateTime(), nullable=False), sa.Column("end_time", sa.DateTime(), nullable=False), sa.Column("duration_seconds", sa.Integer(), nullable=True), sa.Column("enabled", sa.Boolean(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.create_index( "ix_maintenance_rule_tenant_id", 
["tenant_id"], unique=False ) batch_op.create_index( "ix_maintenance_rule_tenant_id_end_time", ["tenant_id", "end_time"], unique=False, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.drop_index("ix_maintenance_rule_tenant_id_end_time") batch_op.drop_index("ix_maintenance_rule_tenant_id") op.drop_table("maintenancewindowrule") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-03-10-08_49e7c02579db.py ================================================ """add suppress to mw Revision ID: 49e7c02579db Revises: 70671c95028e Create Date: 2024-09-03 10:08:21.612949 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "49e7c02579db" down_revision = "70671c95028e" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.add_column(sa.Column("suppress", sa.Boolean(), nullable=False)) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.drop_column("suppress") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-03-16-24_1a5eb7069f9a.py ================================================ """more topology data Revision ID: 1a5eb7069f9a Revises: 49e7c02579db Create Date: 2024-09-03 16:24:25.791272 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "1a5eb7069f9a" down_revision = "49e7c02579db" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.add_column( sa.Column("ip_address", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.add_column( sa.Column("mac_address", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.add_column( sa.Column("category", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.add_column( sa.Column("manufacturer", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.drop_column("manufacturer") batch_op.drop_column("category") batch_op.drop_column("mac_address") batch_op.drop_column("ip_address") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-04-13-09_e6653be70b62.py ================================================ """mapping type Revision ID: e6653be70b62 Revises: 1a5eb7069f9a Create Date: 2024-09-04 13:09:14.958740 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "e6653be70b62" down_revision = "1a5eb7069f9a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("mappingrule", schema=None) as batch_op: batch_op.add_column( sa.Column("type", sqlmodel.sql.sqltypes.AutoString(), nullable=False) ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("mappingrule", schema=None) as batch_op: batch_op.drop_column("type") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-08-17-51_1aacee84447e.py ================================================ """Store timeunit for Rule for better UX Revision ID: 1aacee84447e Revises: 1c650a429672 Create Date: 2024-08-26 17:01:21.263004 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "1aacee84447e" down_revision = "e6653be70b62" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column( sa.Column("timeunit", sqlmodel.sql.sqltypes.AutoString(), nullable=False, server_default="seconds") ) def downgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("timeunit") ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-13-10-48_938b1aa62d5c.py ================================================ """Provisioned Revision ID: 938b1aa62d5c Revises: 710b4ff1d19e Create Date: 2024-09-13 10:48:16.112419 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "938b1aa62d5c" down_revision = "1aacee84447e" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column( "provider", sa.Column( "provisioned", sa.Boolean(), nullable=False, server_default=sa.false() ), ) op.add_column( "workflow", sa.Column( "provisioned", sa.Boolean(), nullable=False, server_default=sa.false() ), ) op.add_column( "workflow", sa.Column( "provisioned_file", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("workflow", schema=None) as batch_op: batch_op.drop_column("provisioned") with op.batch_alter_table("provider", schema=None) as batch_op: batch_op.drop_column("provisioned") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-17-23-30_c5443d9deb0f.py ================================================ """Add status to Incident model Revision ID: c5443d9deb0f Revises: 710b4ff1d19e Create Date: 2024-09-11 23:30:04.308017 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "c5443d9deb0f" down_revision = "938b1aa62d5c" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False, default="firing", server_default="firing") ) batch_op.create_index( batch_op.f("ix_incident_status"), ["status"], unique=False ) def downgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_incident_status")) batch_op.drop_column("status") ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-18-02-05_772790c2e50a.py ================================================ """add WorkflowToIncidentExecution Revision ID: 772790c2e50a Revises: 49e7c02579db Create Date: 2024-09-08 02:05:42.739163 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "772790c2e50a" down_revision = "c5443d9deb0f" branch_labels = None depends_on = None def upgrade() -> None: op.create_table( "workflowtoincidentexecution", sa.Column("id", sa.Integer(), nullable=False), sa.Column( "workflow_execution_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("incident_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["workflow_execution_id"], ["workflowexecution.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("workflow_execution_id", "incident_id"), ) def downgrade() -> None: op.drop_table("workflowtoincidentexecution") ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py ================================================ """mappingrule type default value Revision ID: 5d7ae55efc6a Revises: 938b1aa62d5c Create Date: 2024-09-18 14:08:49.363483 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "5d7ae55efc6a" down_revision = "772790c2e50a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("mappingrule", schema=None) as batch_op: batch_op.alter_column( "type", existing_type=sa.VARCHAR(length=255), nullable=False, server_default="csv", ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("mappingrule", schema=None) as batch_op: batch_op.alter_column( "type", existing_type=sa.VARCHAR(length=255), nullable=True ) # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-19-15-26_493f217af6b6.py ================================================ """Dedup Revision ID: 493f217af6b6 Revises: 5d7ae55efc6a Create Date: 2024-09-19 15:26:21.564118 """ import sqlalchemy as sa import sqlmodel from alembic import op from sqlalchemy.dialects import sqlite # revision identifiers, used by Alembic. revision = "493f217af6b6" down_revision = "5d7ae55efc6a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "alertdeduplicationevent", sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("date_hour", sa.DateTime(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column( "deduplication_rule_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False ), sa.Column( "deduplication_type", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("provider_type", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_index( "ix_alert_deduplication_event_provider_id", "alertdeduplicationevent", ["provider_id"], unique=False, ) op.create_index( "ix_alert_deduplication_event_provider_id_date_hour", "alertdeduplicationevent", ["provider_id", "date_hour"], unique=False, ) op.create_index( "ix_alert_deduplication_event_provider_type", "alertdeduplicationevent", ["provider_type"], unique=False, ) op.create_index( "ix_alert_deduplication_event_provider_type_date_hour", "alertdeduplicationevent", 
["provider_type", "date_hour"], unique=False, ) op.create_index( op.f("ix_alertdeduplicationevent_tenant_id"), "alertdeduplicationevent", ["tenant_id"], unique=False, ) op.create_table( "alertdeduplicationrule", sa.Column("fingerprint_fields", sa.JSON(), nullable=True), sa.Column("ignore_fields", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("provider_type", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("last_updated", sa.DateTime(), nullable=False), sa.Column( "last_updated_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("created_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("enabled", sa.Boolean(), nullable=False), sa.Column("full_deduplication", sa.Boolean(), nullable=False), sa.Column("priority", sa.Integer(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_index( op.f("ix_alertdeduplicationrule_name"), "alertdeduplicationrule", ["name"], unique=False, ) op.create_table( "alertfield", sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("field_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("provider_type", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("tenant_id", "field_name", name="uq_tenant_field"), ) 
op.create_index( "ix_alert_field_provider_id_provider_type", "alertfield", ["provider_id", "provider_type"], unique=False, ) op.create_index( "ix_alert_field_tenant_id", "alertfield", ["tenant_id"], unique=False ) op.create_index( "ix_alert_field_tenant_id_field_name", "alertfield", ["tenant_id", "field_name"], unique=False, ) op.create_index( op.f("ix_alertfield_field_name"), "alertfield", ["field_name"], unique=False ) op.create_index( op.f("ix_alertfield_provider_id"), "alertfield", ["provider_id"], unique=False ) op.create_index( op.f("ix_alertfield_provider_type"), "alertfield", ["provider_type"], unique=False, ) op.create_index( op.f("ix_alertfield_tenant_id"), "alertfield", ["tenant_id"], unique=False ) op.drop_table("alertdeduplicationfilter") # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "alertdeduplicationfilter", sa.Column("fields", sqlite.JSON(), nullable=True), sa.Column("id", sa.CHAR(length=32), nullable=False), sa.Column("tenant_id", sa.VARCHAR(), nullable=False), sa.Column("matcher_cel", sa.VARCHAR(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.drop_index(op.f("ix_alertfield_tenant_id"), table_name="alertfield") op.drop_index(op.f("ix_alertfield_provider_type"), table_name="alertfield") op.drop_index(op.f("ix_alertfield_provider_id"), table_name="alertfield") op.drop_index(op.f("ix_alertfield_field_name"), table_name="alertfield") op.drop_index("ix_alert_field_tenant_id_field_name", table_name="alertfield") op.drop_index("ix_alert_field_tenant_id", table_name="alertfield") op.drop_index("ix_alert_field_provider_id_provider_type", table_name="alertfield") op.drop_table("alertfield") op.drop_index( op.f("ix_alertdeduplicationrule_name"), table_name="alertdeduplicationrule" ) op.drop_table("alertdeduplicationrule") op.drop_index( op.f("ix_alertdeduplicationevent_tenant_id"), 
table_name="alertdeduplicationevent", ) op.drop_index( "ix_alert_deduplication_event_provider_type_date_hour", table_name="alertdeduplicationevent", ) op.drop_index( "ix_alert_deduplication_event_provider_type", table_name="alertdeduplicationevent", ) op.drop_index( "ix_alert_deduplication_event_provider_id_date_hour", table_name="alertdeduplicationevent", ) op.drop_index( "ix_alert_deduplication_event_provider_id", table_name="alertdeduplicationevent" ) op.drop_table("alertdeduplicationevent") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-09-22-14-16_01ebe17218c0.py ================================================ """Topology applications Revision ID: 01ebe17218c0 Revises: 493f217af6b6 Create Date: 2024-09-22 14:16:17.078591 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "01ebe17218c0" down_revision = "493f217af6b6" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "topologyapplication", sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) op.create_table( "topologyserviceapplication", sa.Column("service_id", sa.Integer(), nullable=False), sa.Column("application_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.ForeignKeyConstraint( ["application_id"], ["topologyapplication.id"], ), sa.ForeignKeyConstraint( ["service_id"], ["topologyservice.id"], ), sa.PrimaryKeyConstraint("service_id", "application_id"), ) with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.drop_column("application") # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.add_column(sa.Column("application", sa.VARCHAR(), nullable=True)) op.drop_table("topologyserviceapplication") op.drop_table("topologyapplication") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-05-18-37_017d759805d9.py ================================================ """Add resolve_on action to Rule Revision ID: 017d759805d9 Revises: 01ebe17218c0 Create Date: 2024-10-05 18:37:45.152090 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "017d759805d9" down_revision = "01ebe17218c0" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column( sa.Column("resolve_on", sqlmodel.sql.sqltypes.AutoString(), nullable=False, default="never", server_default="never") ) def downgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("resolve_on") ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-08-10-47_bf756df80e9d.py ================================================ """Incident linking to each other Revision ID: bf756df80e9d Revises: 017d759805d9 Create Date: 2024-10-08 10:47:25.326327 """ import sqlalchemy as sa import sqlalchemy_utils from alembic import op # revision identifiers, used by Alembic. revision = "bf756df80e9d" down_revision = "017d759805d9" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "same_incident_in_the_past_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True, ) ) batch_op.create_foreign_key( "same_incident_in_the_past_id_fk", "incident", ["same_incident_in_the_past_id"], ["id"], ondelete="SET NULL", ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_constraint("same_incident_in_the_past_id_fk", type_="foreignkey") batch_op.drop_column("same_incident_in_the_past_id") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-14-08-34_83c1020be97d.py ================================================ """Alert To Incident link history Revision ID: 83c1020be97d Revises: bf756df80e9d Create Date: 2024-10-14 08:34:46.608806 """ from sqlalchemy import inspect from alembic import op import sqlalchemy as sa from sqlalchemy.sql import expression from contextlib import contextmanager # revision identifiers, used by Alembic. revision = "83c1020be97d" down_revision = "bf756df80e9d" branch_labels = None depends_on = None @contextmanager def drop_and_restore_f_keys(table_name, conn): inspector = inspect(conn) existing_f_keys = inspector.get_foreign_keys(table_name, schema=None) print(f"Existing foreign keys: {existing_f_keys}") # Drop all foreign keys for fk in existing_f_keys: try: op.drop_constraint(fk['name'], table_name, type_='foreignkey') print(f"Dropped foreign key: {fk['name']}") except NotImplementedError as e: if "No support for ALTER of constraints in SQLite dialect." in str(e): print("No support for ALTER of constraints in SQLite dialect, constraint should be overriden later so skipping") else: raise e try: yield finally: # Restore all foreign keys for fk in existing_f_keys: try: op.create_foreign_key( fk['name'], table_name, fk['referred_table'], fk['constrained_columns'], fk['referred_columns'], ondelete=fk['options'].get('ondelete') ) print(f"Restored foreign key: {fk['name']}") except NotImplementedError as e: if "No support for ALTER of constraints in SQLite dialect." 
in str(e): print("No support for ALTER of constraints in SQLite dialect, constraint should be overriden later so skipping") else: raise e def upgrade() -> None: with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.add_column(sa.Column( "is_created_by_ai", sa.Boolean(), nullable=False, server_default=expression.false() )) batch_op.add_column(sa.Column( "deleted_at", sa.DateTime(), nullable=False, server_default="1000-01-01 00:00:00", )) conn = op.get_bind() with drop_and_restore_f_keys("alerttoincident", conn): try: with op.batch_alter_table("alerttoincident", schema=None) as batch_op: inspector = inspect(conn) existing_primary_key = inspector.get_pk_constraint('alerttoincident', schema=None) batch_op.drop_constraint(existing_primary_key['name'], type_="primary") except ValueError as e: if "Constraint must have a name" in str(e): print("Constraint must have a name, constraint should be overriden later so skipping") else: raise e with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.create_primary_key( "alerttoincident_pkey", ["alert_id", "incident_id", "deleted_at"] ) def downgrade() -> None: conn = op.get_bind() inspector = inspect(conn) existing_primary_key = inspector.get_pk_constraint('alerttoincident', schema=None) with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.drop_column("deleted_at") batch_op.drop_column("is_created_by_ai") with drop_and_restore_f_keys("alerttoincident", conn): with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.drop_constraint(existing_primary_key['name'], type_="primary") with op.batch_alter_table("alerttoincident", schema=None) as batch_op: batch_op.create_primary_key( "alerttoincident_pkey", ["alert_id", "incident_id"] ) ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-22-10-38_8438f041ee0e.py ================================================ """add pulling_enabled Revision 
ID: 8438f041ee0e Revises: 83c1020be97d Create Date: 2024-10-22 10:38:29.857284 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "8438f041ee0e" down_revision = "83c1020be97d" branch_labels = None depends_on = None def is_sqlite(): """Check if we're running on SQLite""" bind = op.get_bind() return bind.engine.name == "sqlite" def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### if is_sqlite(): # SQLite specific implementation with op.batch_alter_table("provider", schema=None) as batch_op: # First add the column as nullable with a default value batch_op.add_column( sa.Column( "pulling_enabled", sa.Boolean(), server_default=sa.true(), nullable=True, ) ) # Then make it not nullable if needed with op.batch_alter_table("provider", schema=None) as batch_op: batch_op.alter_column("pulling_enabled", nullable=False) else: # PostgreSQL and other databases implementation # 1. Add the column as nullable op.add_column( "provider", sa.Column("pulling_enabled", sa.Boolean(), nullable=True) ) # 2. Set default value for existing rows op.execute( "UPDATE provider SET pulling_enabled = true WHERE pulling_enabled IS NULL" ) # 3. Make it non-nullable with default op.alter_column( "provider", "pulling_enabled", existing_type=sa.Boolean(), nullable=False, server_default=sa.true(), ) def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("provider", schema=None) as batch_op: batch_op.drop_column("pulling_enabled") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-23-15-21_89b4d3905d26.py ================================================ """Merge Incidents Revision ID: 89b4d3905d26 Revises: 8438f041ee0e Create Date: 2024-10-21 20:48:40.151171 """ import sqlalchemy as sa import sqlalchemy_utils import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "89b4d3905d26" down_revision = "8438f041ee0e" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "merged_into_incident_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True, ) ) batch_op.add_column(sa.Column("merged_at", sa.DateTime(), nullable=True)) batch_op.add_column( sa.Column("merged_by", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.create_foreign_key( "fk_incident_merged_into_incident_id", "incident", ["merged_into_incident_id"], ["id"], ondelete="SET NULL", ) def downgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_constraint( "fk_incident_merged_into_incident_id", type_="foreignkey" ) batch_op.drop_column("merged_by") batch_op.drop_column("merged_at") batch_op.drop_column("merged_into_incident_id") ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-26-17-03_3f056d747d9e.py ================================================ """AI config Revision ID: 3f056d747d9e Revises: 192157fd5788 Create Date: 2024-10-26 17:03:02.383942 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. 
revision = "3f056d747d9e" down_revision = "192157fd5788" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "externalaiconfigandmetadata", sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("algorithm_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("settings", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("settings_proposed_by_algorithm", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("feedback_logs", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.PrimaryKeyConstraint("id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_table("externalaiconfigandmetadata") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-29-18-37_991b30bcf0b9.py ================================================ """Fix broken links between alerts and incidents Revision ID: 991b30bcf0b9 Revises: 89b4d3905d26 Create Date: 2024-10-29 18:37:28.668473 """ import sqlalchemy as sa from alembic import op import logging # revision identifiers, used by Alembic. revision = "991b30bcf0b9" down_revision = "89b4d3905d26" branch_labels = None depends_on = None logger = logging.getLogger(__name__) def upgrade() -> None: connection = op.get_bind() if connection.dialect.name == 'sqlite': logger.info("""Migration 83c1020be97d corrupted alert_to_incident.deleted_at at SQLite databases because server_default was set to \"1000-01-01 00:00:00\", not \"1000-01-01 00:00:00.000000\". Fixing the value in this migration.""") # Filtering only by deleted_at = '1000-01-01 00:00:00'. If deleted_at is different, it should be already formated well. 
result = connection.execute(sa.text("SELECT incident_id, alert_id, deleted_at FROM alerttoincident WHERE deleted_at = '1000-01-01 00:00:00'")) db_datetime_format = "%Y-%m-%d %H:%M:%S.%f" print(f"Database datetime format: {db_datetime_format}") for row in result: try: connection.execute( sa.text( "UPDATE alerttoincident SET deleted_at = '1000-01-01 00:00:00.000000' WHERE incident_id = :incident_id AND alert_id = :alert_id AND deleted_at = '1000-01-01 00:00:00'" ), {"incident_id": row["incident_id"], "alert_id": row["alert_id"]} ) print(f"Updated deleted_at for incident_id: {row['incident_id']}, alert_id: {row['alert_id']}") except sa.exc.IntegrityError as e: if "UNIQUE constraint failed: alerttoincident.alert_id, alerttoincident.incident_id, alerttoincident.deleted_at" in str(e): connection.execute( sa.text( "DELETE FROM alerttoincident WHERE incident_id = :incident_id AND alert_id = :alert_id AND deleted_at = '1000-01-01 00:00:00'" ), {"incident_id": row["incident_id"], "alert_id": row["alert_id"]} ) logger.warning(f"IntegrityError encountered for incident_id: {row['incident_id']}, alert_id: {row['alert_id']}. It's a duplicate. Deleted.") else: raise e else: logger.info("Skipping the fix since it's not SQLite.") # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### pass # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-10-31-18-01_273b29f368b7.py ================================================ """Adding AI tables Revision ID: 273b29f368b7 Revises: 991b30bcf0b9 Create Date: 2024-10-31 18:01:17.427403 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "273b29f368b7" down_revision = "991b30bcf0b9" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "aisuggestion", sa.Column("suggestion_input", sa.JSON(), nullable=True), sa.Column("suggestion_content", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("user_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column( "suggestion_input_hash", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column( "suggestion_type", sa.Enum( "INCIDENT_SUGGESTION", "SUMMARY_GENERATION", "OTHER", name="aisuggestiontype", ), nullable=False, ), sa.Column("model", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("created_at", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("aisuggestion", schema=None) as batch_op: batch_op.create_index( batch_op.f("ix_aisuggestion_suggestion_input_hash"), ["suggestion_input_hash"], unique=False, ) batch_op.create_index( batch_op.f("ix_aisuggestion_suggestion_type"), ["suggestion_type"], unique=False, ) batch_op.create_index( batch_op.f("ix_aisuggestion_tenant_id"), ["tenant_id"], unique=False ) batch_op.create_index( batch_op.f("ix_aisuggestion_user_id"), ["user_id"], unique=False ) op.create_table( "aifeedback", sa.Column("feedback_content", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("suggestion_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False), sa.Column("user_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("rating", sa.Integer(), nullable=True), sa.Column("comment", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("updated_at", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["suggestion_id"], ["aisuggestion.id"], ), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("aifeedback", schema=None) 
as batch_op: batch_op.create_index( batch_op.f("ix_aifeedback_suggestion_id"), ["suggestion_id"], unique=False ) batch_op.create_index( batch_op.f("ix_aifeedback_user_id"), ["user_id"], unique=False ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("aifeedback", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_aifeedback_user_id")) batch_op.drop_index(batch_op.f("ix_aifeedback_suggestion_id")) op.drop_table("aifeedback") with op.batch_alter_table("aisuggestion", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_aisuggestion_user_id")) batch_op.drop_index(batch_op.f("ix_aisuggestion_tenant_id")) batch_op.drop_index(batch_op.f("ix_aisuggestion_suggestion_type")) batch_op.drop_index(batch_op.f("ix_aisuggestion_suggestion_input_hash")) op.drop_table("aisuggestion") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-11-03-10-49_ef0b5b0df41c.py ================================================ """Adding new index on alert hash Revision ID: ef0b5b0df41c Revises: 273b29f368b7 Create Date: 2024-11-03 10:49:04.708264 """ from alembic import op # revision identifiers, used by Alembic. 
revision = "ef0b5b0df41c" down_revision = "273b29f368b7" branch_labels = None depends_on = None def upgrade() -> None: # Using batch operation to ensure compatibility with multiple databases with op.batch_alter_table("alert", schema=None) as batch_op: batch_op.create_index( "ix_alert_tenant_fingerprint_timestamp", ["tenant_id", "fingerprint", "timestamp"], unique=False, ) def downgrade() -> None: with op.batch_alter_table("alert", schema=None) as batch_op: batch_op.drop_index("ix_alert_tenant_fingerprint_timestamp") ================================================ FILE: keep/api/models/db/migrations/versions/2024-11-08-20-58_895fe80117aa.py ================================================ """Add timestamp and provider_type to alertraw Revision ID: 895fe80117aa Revises: ef0b5b0df41c Create Date: 2024-11-08 20:58:40.201477 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "895fe80117aa" down_revision = "ef0b5b0df41c" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.add_column( "alertraw", sa.Column( "timestamp", sa.DateTime(), nullable=False, server_default=sa.text("CURRENT_TIMESTAMP"), ), ) op.add_column( "alertraw", sa.Column("provider_type", sa.String(255), nullable=True), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.drop_column("alertraw", "provider_type") op.drop_column("alertraw", "timestamp") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-11-10-13-06_620b6c048091.py ================================================ """incident fingerprint Revision ID: 620b6c048091 Revises: 895fe80117aa Create Date: 2024-11-10 13:06:09.620665 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. 
revision = "620b6c048091" down_revision = "895fe80117aa" branch_labels = None depends_on = None def is_postgres(): """Check if we're running on PostgreSQL""" bind = op.get_bind() return bind.engine.name == "postgresql" def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "fingerprint", sa.TEXT(length=36) if not is_postgres() else sa.TEXT(), nullable=True, ) ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_column("fingerprint") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-11-20-15-50_192157fd5788.py ================================================ """system table Revision ID: 192157fd5788 Revises: 620b6c048091 Create Date: 2024-11-20 15:50:29.500867 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "192157fd5788" down_revision = "620b6c048091" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### op.create_table( "system", sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("value", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.PrimaryKeyConstraint("id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.drop_table("system") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-01-16-40_3ad5308e7200.py ================================================ """New types for AI config Revision ID: 3ad5308e7200 Revises: 3f056d747d9e Create Date: 2024-12-01 16:40:12.655642 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "3ad5308e7200" down_revision = "3f056d747d9e" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("externalaiconfigandmetadata", schema=None) as batch_op: batch_op.alter_column( "settings", existing_type=sa.VARCHAR(), type_=sa.JSON(), nullable=True, postgresql_using="settings::json" ) batch_op.alter_column( "settings_proposed_by_algorithm", existing_type=sa.VARCHAR(), type_=sa.JSON(), existing_nullable=True, postgresql_using="settings::json" ) batch_op.alter_column( "feedback_logs", existing_type=sa.VARCHAR(), type_=sa.Text(), existing_nullable=True, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
###
    # Revert the three columns to VARCHAR(255), in reverse order of the upgrade.
    with op.batch_alter_table("externalaiconfigandmetadata", schema=None) as batch_op:
        batch_op.alter_column(
            "feedback_logs",
            existing_type=sa.Text(),
            type_=sa.VARCHAR(length=255),
            existing_nullable=True,
        )
        batch_op.alter_column(
            "settings_proposed_by_algorithm",
            existing_type=sa.JSON(),
            type_=sa.VARCHAR(length=255),
            existing_nullable=True,
        )
        batch_op.alter_column(
            "settings",
            existing_type=sa.JSON(),
            type_=sa.VARCHAR(length=255),
            nullable=False,
        )
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2024-12-02-13-36_bdae8684d0b4.py
================================================
"""add lastalert and lastalerttoincident table

Revision ID: bdae8684d0b4
Revises: 3ad5308e7200
Create Date: 2024-11-05 22:48:04.733192

"""

import sqlalchemy as sa
import sqlalchemy_utils
import sqlmodel
from alembic import op
from sqlalchemy import text
from sqlalchemy.orm import Session

# revision identifiers, used by Alembic.
revision = "bdae8684d0b4"
down_revision = "3ad5308e7200"
branch_labels = None
depends_on = None

migration_metadata = sa.MetaData()


def populate_db():
    """Backfill ``lastalert`` and ``lastalerttoincident`` from existing rows.

    For every (tenant_id, fingerprint) pair the alert row carrying the maximum
    timestamp is selected; ``lastalert`` is filled from those rows and
    ``lastalerttoincident`` from the ``alerttoincident`` links that point at
    them. Two SQL variants are built, one for PostgreSQL and one for every
    other dialect.
    """
    session = Session(op.get_bind())

    if session.bind.dialect.name == "postgresql":
        # PostgreSQL: duplicate keys are ignored via "on conflict do nothing".
        migrate_lastalert_query = text(
            """
            insert into lastalert (tenant_id, fingerprint, alert_id, timestamp)
            select alert.tenant_id, alert.fingerprint, alert.id as alert_id, alert.timestamp
            from alert
            join (
                select
                    alert.tenant_id, alert.fingerprint, max(alert.timestamp) as last_received
                from alert
                group by fingerprint, tenant_id
            ) as a ON alert.fingerprint = a.fingerprint and alert.timestamp = a.last_received and alert.tenant_id = a.tenant_id
            on conflict do nothing
        """
        )

        migrate_lastalerttoincident_query = text(
            """
            insert into lastalerttoincident (incident_id, tenant_id, timestamp, fingerprint, is_created_by_ai, deleted_at)
            select ati.incident_id, ati.tenant_id, ati.timestamp, lf.fingerprint, ati.is_created_by_ai, ati.deleted_at
            from alerttoincident as ati
            join (
                select alert.tenant_id, alert.id, alert.fingerprint
                from alert
                join (
                    select
                        alert.tenant_id, alert.fingerprint, max(alert.timestamp) as last_received
                    from alert
                    group by fingerprint, tenant_id
                ) as a on alert.fingerprint = a.fingerprint and alert.timestamp = a.last_received and alert.tenant_id = a.tenant_id
            ) as lf on ati.alert_id = lf.id
            on conflict do nothing
        """
        )

    else:
        # Other dialects: there is no "on conflict" clause here, so the first
        # query collapses ties with GROUP BY + MAX(alert_id) and the second
        # uses REPLACE INTO.
        migrate_lastalert_query = text(
            """
            INSERT INTO lastalert (tenant_id, fingerprint, alert_id, timestamp)
            SELECT
                grouped_alerts.tenant_id,
                grouped_alerts.fingerprint,
                MAX(grouped_alerts.alert_id) as alert_id, -- Using MAX to consistently pick one alert_id
                grouped_alerts.timestamp
            FROM (
                select alert.tenant_id, alert.fingerprint, alert.id as alert_id, alert.timestamp
                from alert
                join (
                    select
                        alert.tenant_id, alert.fingerprint, max(alert.timestamp) as last_received
                    from alert
                    group by fingerprint, tenant_id
                ) as a ON alert.fingerprint = a.fingerprint
                    and alert.timestamp = a.last_received
                    and alert.tenant_id = a.tenant_id
            ) as grouped_alerts
            GROUP BY
                grouped_alerts.tenant_id,
                grouped_alerts.fingerprint,
                grouped_alerts.timestamp;
        """
        )

        migrate_lastalerttoincident_query = text(
            """
            REPLACE INTO lastalerttoincident (incident_id, tenant_id, timestamp, fingerprint, is_created_by_ai, deleted_at)
            select ati.incident_id, ati.tenant_id, ati.timestamp, lf.fingerprint, ati.is_created_by_ai, ati.deleted_at
            from alerttoincident as ati
            join (
                select alert.id, alert.fingerprint, alert.tenant_id
                from alert
                join (
                    select
                        alert.tenant_id,alert.fingerprint, max(alert.timestamp) as last_received
                    from alert
                    group by fingerprint, tenant_id
                ) as a on alert.fingerprint = a.fingerprint and alert.timestamp = a.last_received and alert.tenant_id = a.tenant_id
            ) as lf on ati.alert_id = lf.id;
        """
        )

    # lastalert must be filled first: lastalerttoincident has a foreign key on
    # (tenant_id, fingerprint) into it.
    session.execute(migrate_lastalert_query)
    session.execute(migrate_lastalerttoincident_query)


def upgrade() -> None:
    """Create ``lastalert`` and ``lastalerttoincident``, then backfill both."""
    op.create_table(
        "lastalert",
        sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("alert_id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False),
        sa.Column("timestamp", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["alert_id"],
            ["alert.id"],
        ),
        sa.ForeignKeyConstraint(
            ["tenant_id"],
            ["tenant.id"],
        ),
        sa.PrimaryKeyConstraint("tenant_id", "fingerprint"),
    )
    with op.batch_alter_table("lastalert", schema=None) as batch_op:
        batch_op.create_index(
            batch_op.f("ix_lastalert_timestamp"), ["timestamp"], unique=False
        )
        # Add index for the fingerprint column that will be referenced by foreign key
        batch_op.create_index("ix_lastalert_fingerprint", ["fingerprint"], unique=False)

    op.create_table(
        "lastalerttoincident",
        sa.Column(
            "incident_id",
            sqlalchemy_utils.types.uuid.UUIDType(binary=False),
            nullable=False,
        ),
        sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("timestamp", sa.DateTime(), nullable=False),
        sa.Column("fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("is_created_by_ai", sa.Boolean(), nullable=False),
        sa.Column("deleted_at", sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(
            ["tenant_id", "fingerprint"],
            ["lastalert.tenant_id", "lastalert.fingerprint"],
        ),
        sa.ForeignKeyConstraint(["incident_id"], ["incident.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(
            ["tenant_id"],
            ["tenant.id"],
        ),
        sa.PrimaryKeyConstraint(
            "tenant_id", "incident_id", "fingerprint", "deleted_at"
        ),
    )

    # Both tables exist now, so the data backfill can run.
    populate_db()


def downgrade() -> None:
    """Drop both tables; the referencing table goes first."""
    op.drop_table("lastalerttoincident")
    with op.batch_alter_table("lastalert", schema=None) as batch_op:
        batch_op.drop_index(batch_op.f("ix_lastalert_timestamp"))

    op.drop_table("lastalert")


================================================
FILE: keep/api/models/db/migrations/versions/2024-12-02-20-42_c6e5594c99f8.py
================================================
"""add first_timestamp field to LastAlert

Revision ID: c6e5594c99f8
Revises: bdae8684d0b4
Create Date:
2024-12-02 20:42:33.311541 """ import sqlalchemy as sa from alembic import op from sqlalchemy import text from sqlalchemy.dialects import mysql from sqlalchemy.orm import Session # revision identifiers, used by Alembic. revision = "c6e5594c99f8" down_revision = "bdae8684d0b4" branch_labels = None depends_on = None def populate_db(): session = Session(op.get_bind()) session.execute( text( """ UPDATE lastalert SET first_timestamp = ( SELECT MIN(alert.timestamp) FROM alert WHERE alert.fingerprint = lastalert.fingerprint AND alert.tenant_id = lastalert.tenant_id ) """ ) ) def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.alter_column( "fingerprint", existing_type=mysql.TINYTEXT(), type_=sa.TEXT(), existing_nullable=True, ) with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.add_column(sa.Column("first_timestamp", sa.DateTime(), nullable=True)) batch_op.create_index( batch_op.f("ix_lastalert_first_timestamp"), ["first_timestamp"], unique=False, ) populate_db() def downgrade() -> None: with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_lastalert_first_timestamp")) batch_op.drop_column("first_timestamp") ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-08-16-24_55cc64020f6d.py ================================================ """Add Alert Hash to LastAlert Revision ID: 55cc64020f6d Revises: c6e5594c99f8 Create Date: 2024-12-08 16:24:01.808208 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "55cc64020f6d" down_revision = "c6e5594c99f8" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.add_column( sa.Column("alert_hash", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) batch_op.create_index( batch_op.f("ix_lastalert_alert_hash"), ["alert_hash"], unique=False ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_lastalert_alert_hash")) batch_op.drop_column("alert_hash") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-10-19-11_7297ae99cd21.py ================================================ """Add Rule.create_on Revision ID: 7297ae99cd21 Revises: 4f8c4b185d5b Create Date: 2024-12-10 19:11:28.512095 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "7297ae99cd21" down_revision = "4f8c4b185d5b" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column( sa.Column("create_on", sqlmodel.sql.sqltypes.AutoString(), nullable=False, default="any", server_default="any") ) # ### end Alembic commands ### def downgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("create_on") ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-17-12-48_3d20d954e058.py ================================================ """Add index to WorkflowExecution Revision ID: 3d20d954e058 Revises: 55cc64020f6d Create Date: 2024-12-17 12:48:04.713649 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "3d20d954e058" down_revision = "55cc64020f6d" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("workflowexecution", schema=None) as batch_op: batch_op.create_index( "idx_workflowexecution_tenant_workflow_id_timestamp", ["tenant_id", "workflow_id", sa.desc("started")], unique=False, ) if op.get_bind().dialect.name == "mysql": batch_op.create_index( "idx_workflowexecution_workflow_tenant_started_status", [ "workflow_id", "tenant_id", sa.desc("started"), sa.text("status(255)"), ], unique=False, ) else: batch_op.create_index( "idx_workflowexecution_workflow_tenant_started_status", ["workflow_id", "tenant_id", sa.desc("started"), "status"], unique=False, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("workflowexecution", schema=None) as batch_op: batch_op.drop_index("idx_workflowexecution_workflow_tenant_started_status") batch_op.drop_index("idx_workflowexecution_tenant_workflow_id_timestamp") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-23-17-22_0c5e002094a9.py ================================================ """Add provider logs Revision ID: 0c5e002094a9 Revises: 3d20d954e058 Create Date: 2024-12-23 17:22:04.119440 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "0c5e002094a9" down_revision = "3d20d954e058" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "providerexecutionlog", sa.Column("log_message", sa.TEXT(), nullable=True), sa.Column("context", sa.JSON(), nullable=True), sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("provider_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("log_level", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("execution_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.ForeignKeyConstraint( ["provider_id"], ["provider.id"], ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("id"), ) # Create indexes based on database type conn = op.get_bind() inspector = sa.inspect(conn) dialect_name = inspector.dialect.name if dialect_name == "postgresql": op.create_index( "idx_provider_logs_tenant_provider", "providerexecutionlog", ["tenant_id", "provider_id"], postgresql_using="btree", ) op.create_index( "idx_provider_logs_timestamp", "providerexecutionlog", ["timestamp"], postgresql_using="btree", ) elif dialect_name == "mysql": op.create_index( "idx_provider_logs_tenant_provider", "providerexecutionlog", ["tenant_id", "provider_id"], mysql_using="btree", ) op.create_index( "idx_provider_logs_timestamp", "providerexecutionlog", ["timestamp"], mysql_using="btree", ) else: # sqlite op.create_index( "idx_provider_logs_tenant_provider", "providerexecutionlog", ["tenant_id", "provider_id"], ) op.create_index( "idx_provider_logs_timestamp", "providerexecutionlog", ["timestamp"] ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### # # Drop indexes first op.drop_index( "idx_provider_logs_tenant_provider", table_name="providerexecutionlog" ) op.drop_index("idx_provider_logs_timestamp", table_name="providerexecutionlog") op.drop_table("providerexecutionlog") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2024-12-23-18-49_4f8c4b185d5b.py ================================================ """Add is_provisioned column for DeduplicationRule table Revision ID: 4f8c4b185d5b Revises: 0c5e002094a9 Create Date: 2024-12-23 18:49:00.882402 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "4f8c4b185d5b" down_revision = "0c5e002094a9" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### # Add the new column with a server default with op.batch_alter_table("alertdeduplicationrule", schema=None) as batch_op: batch_op.add_column(sa.Column("is_provisioned", sa.Boolean(), nullable=False, server_default=sa.text("false"))) # Update existing records to have the default value op.execute("UPDATE alertdeduplicationrule SET is_provisioned = false") # Remove the server default (optional, to match schema-only behavior) with op.batch_alter_table("alertdeduplicationrule", schema=None) as batch_op: batch_op.alter_column("is_provisioned", server_default=None) # ### end Alembic commands ### # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("alertdeduplicationrule", schema=None) as batch_op: batch_op.drop_column("is_provisioned") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-01-09-59_dcb7f88a04da.py ================================================ """Few more indexes Revision ID: dcb7f88a04da Revises: 7297ae99cd21 Create Date: 2025-01-01 09:59:13.393588 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "dcb7f88a04da" down_revision = "7297ae99cd21" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("alert", schema=None) as batch_op: batch_op.create_index( "idx_alert_tenant_timestamp_fingerprint", ["tenant_id", "timestamp", "fingerprint"], unique=False, ) batch_op.create_index( "idx_fingerprint_timestamp", ["fingerprint", "timestamp"], unique=False ) with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.alter_column( "first_timestamp", existing_type=sa.DATETIME(), nullable=False ) batch_op.create_index( "idx_lastalert_tenant_ordering", ["tenant_id", "first_timestamp", "alert_id", "fingerprint"], unique=False, ) batch_op.create_index( "idx_lastalert_tenant_timestamp", ["tenant_id", "first_timestamp"], unique=False, ) batch_op.create_index( "idx_lastalert_tenant_timestamp_new", ["tenant_id", "timestamp"], unique=False, ) with op.batch_alter_table("lastalerttoincident", schema=None) as batch_op: batch_op.create_index( "idx_lastalerttoincident_tenant_fingerprint", ["tenant_id", "fingerprint", "deleted_at"], unique=False, ) batch_op.create_index( "idx_tenant_deleted_fingerprint", ["tenant_id", "deleted_at", "fingerprint"], unique=False, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("lastalerttoincident", schema=None) as batch_op: batch_op.drop_index("idx_tenant_deleted_fingerprint") batch_op.drop_index("idx_lastalerttoincident_tenant_fingerprint") with op.batch_alter_table("lastalert", schema=None) as batch_op: batch_op.drop_index("idx_lastalert_tenant_timestamp_new") batch_op.drop_index("idx_lastalert_tenant_timestamp") batch_op.drop_index("idx_lastalert_tenant_ordering") batch_op.alter_column( "first_timestamp", existing_type=sa.DATETIME(), nullable=True ) with op.batch_alter_table("alert", schema=None) as batch_op: batch_op.drop_index("idx_fingerprint_timestamp") batch_op.drop_index("idx_alert_tenant_timestamp_fingerprint") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-01-15-14_1c117f1accff.py ================================================ """Topology Incident Revision ID: 1c117f1accff Revises: dcb7f88a04da Create Date: 2025-01-01 15:14:55.998284 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "1c117f1accff" down_revision = "dcb7f88a04da" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "incident_type", sqlmodel.sql.sqltypes.AutoString(), nullable=True ) ) batch_op.add_column(sa.Column("incident_application", sa.Uuid(), nullable=True)) batch_op.add_column( sa.Column("resolve_on", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) with op.batch_alter_table("topologyapplication", schema=None) as batch_op: batch_op.add_column( sa.Column("repository", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.add_column( sa.Column("namespace", sqlmodel.sql.sqltypes.AutoString(), nullable=True) ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_column("incident_application") batch_op.drop_column("incident_type") batch_op.drop_column("resolve_on") with op.batch_alter_table("topologyapplication", schema=None) as batch_op: batch_op.drop_column("repository") with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.drop_column("namespace") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-08-19-20_8a4ec08f2d6b.py ================================================ """add_facet_table Revision ID: 8a4ec08f2d6b Revises: dcb7f88a04da Create Date: 2025-01-08 19:20:32.154545 """ import sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "8a4ec08f2d6b" down_revision = "dcb7f88a04da" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "facet", sa.Column("id", sa.Uuid(), nullable=False), sa.Column( "entity_type", sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False ), sa.Column( "property_path", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False, ), sa.Column("type", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("name", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), sa.Column( "description", sqlmodel.sql.sqltypes.AutoString(length=2048), nullable=True ), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("user_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("facet", schema=None) as batch_op: batch_op.drop_index("ix_facet_tenant_id") batch_op.drop_index("ix_entity_type") op.drop_table("facet") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-14-18-41_416155f25854.py ================================================ """Add workflowexecution.started index Revision ID: 416155f25854 Revises: 1c117f1accff Create Date: 2025-01-14 18:41:45.817371 """ from alembic import op # revision identifiers, used by Alembic. revision = "416155f25854" down_revision = "1c117f1accff" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("workflowexecution", schema=None) as batch_op: batch_op.create_index( batch_op.f("ix_workflowexecution_started"), ["started"], unique=False ) def downgrade() -> None: with op.batch_alter_table("workflowexecution", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_workflowexecution_started")) ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-16-14-00_e3f33e571c3c.py ================================================ """is_deleted to rule Revision ID: e3f33e571c3c Revises: 416155f25854 Create Date: 2025-01-16 14:00:53.211856 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "e3f33e571c3c" down_revision = "416155f25854" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column(sa.Column("is_deleted", sa.Boolean(), nullable=False, server_default=sa.false())) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("is_deleted") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-19-10-44_d359baaf0836.py ================================================ """Merge 8a4ec08f2d6b and e3f33e571c3c heads Revision ID: d359baaf0836 Revises: 8a4ec08f2d6b, e3f33e571c3c Create Date: 2025-01-19 10:44:47.871555 """ # revision identifiers, used by Alembic. 
revision = "d359baaf0836" down_revision = ("8a4ec08f2d6b", "e3f33e571c3c") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-01-26-15-25_8176d7153747.py ================================================ """Add manual field in topology-service Revision ID: 8176d7153747 Revises: 7fde94be79e4 Create Date: 2025-01-26 15:25:23.811890 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "8176d7153747" down_revision = "7fde94be79e4" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.add_column(sa.Column("is_manual", sa.Boolean(), nullable=True)) # ### end Alembic commands ### def downgrade() -> None: with op.batch_alter_table("topologyservice", schema=None) as batch_op: batch_op.drop_column("is_manual") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-05-15-46_e343054ae740.py ================================================ """Fix wrong rule.resolve_on values Revision ID: e343054ae740 Revises: d359baaf0836 Create Date: 2025-02-05 15:46:25.933229 """ from alembic import op from sqlalchemy import text from sqlalchemy.orm import Session # revision identifiers, used by Alembic. 
revision = "e343054ae740" down_revision = "d359baaf0836" branch_labels = None depends_on = None def populate_db(): session = Session(op.get_bind()) session.execute( text(""" UPDATE rule SET resolve_on = 'all_resolved' WHERE resolve_on = 'all' """)) session.execute( text(""" UPDATE rule SET resolve_on = 'first_resolved' WHERE resolve_on = 'first' """)) session.execute( text(""" UPDATE rule SET resolve_on = 'last_resolved' WHERE resolve_on = 'last' """)) def upgrade() -> None: populate_db() def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-10-12-05_908d95386e29.py ================================================ """Add incident severity_forced flag Revision ID: 908d95386e29 Revises: e343054ae740 Create Date: 2025-02-05 12:05:19.795904 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "908d95386e29" down_revision = "e343054ae740" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column(sa.Column("forced_severity", sa.Boolean(), nullable=False, server_default=sa.false())) def downgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.drop_column("forced_severity") ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-11-12-59_21d314490e6a.py ================================================ """Enrichment Event Revision ID: 21d314490e6a Revises: 908d95386e29 Create Date: 2025-02-11 12:59:12.987863 """ import json import sqlalchemy as sa import sqlalchemy_utils import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "21d314490e6a" down_revision = "908d95386e29" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "enrichmentevent", sa.Column("id", sa.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("enriched_fields", sa.JSON(), nullable=True), sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column( "enrichment_type", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("rule_id", sa.Integer(), nullable=True), sa.Column( "alert_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False, ), sa.Column("date_hour", sa.DateTime(), nullable=True), # @tb: we might sometime save the alert_id before the alert is actually created # sa.ForeignKeyConstraint(["alert_id"], ["alert.id"], ondelete="CASCADE"), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("enrichmentevent", schema=None) as batch_op: batch_op.create_index( "ix_enrichment_event_alert_id", ["alert_id"], unique=False ) batch_op.create_index("ix_enrichment_event_rule_id", ["rule_id"], unique=False) batch_op.create_index("ix_enrichment_event_status", ["status"], unique=False) batch_op.create_index( "ix_enrichment_event_tenant_id_date_hour", ["tenant_id", "date_hour"], unique=False, ) batch_op.create_index( batch_op.f("ix_enrichmentevent_tenant_id"), ["tenant_id"], unique=False ) op.create_table( "enrichmentlog", sa.Column("id", sa.Uuid(), nullable=False), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("enrichment_event_id", sa.Uuid(), nullable=False), sa.Column("timestamp", sa.DateTime(), nullable=False), sa.Column("message", sa.TEXT(), nullable=True), sa.ForeignKeyConstraint( ["enrichment_event_id"], ["enrichmentevent.id"], ondelete="CASCADE" ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], ), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("enrichmentlog", schema=None) as batch_op: batch_op.create_index( 
"ix_enrichment_log_enrichment_event_id", ["enrichment_event_id"], unique=False, ) batch_op.create_index( "ix_enrichment_log_tenant_id_timestamp", ["tenant_id", "timestamp"], unique=False, ) batch_op.create_index( batch_op.f("ix_enrichmentlog_tenant_id"), ["tenant_id"], unique=False ) # Transform old matchers format to new format connection = op.get_bind() result = connection.execute(sa.text("SELECT id, matchers FROM mappingrule")) for row in result: old_matchers = row.matchers if isinstance(old_matchers, str): old_matchers = json.loads(old_matchers) new_matchers = [] for matcher in old_matchers: m = matcher.split("&&") if isinstance(matcher, str) else matcher m = [s.strip() for s in m] new_matchers.append(m) connection.execute( sa.text("UPDATE mappingrule SET matchers = :new_matchers WHERE id = :id"), {"new_matchers": json.dumps(new_matchers), "id": row.id}, ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("enrichmentlog", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_enrichmentlog_tenant_id")) batch_op.drop_index("ix_enrichment_log_tenant_id_timestamp") batch_op.drop_index("ix_enrichment_log_enrichment_event_id") op.drop_table("enrichmentlog") with op.batch_alter_table("enrichmentevent", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_enrichmentevent_tenant_id")) batch_op.drop_index("ix_enrichment_event_tenant_id_date_hour") batch_op.drop_index("ix_enrichment_event_status") batch_op.drop_index("ix_enrichment_event_rule_id") batch_op.drop_index("ix_enrichment_event_alert_id") op.drop_table("enrichmentevent") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-13-09-54_cfe08cc46950.py ================================================ """Incident Template Name Revision ID: 7fde94be79e4 Revises: 21d314490e6a Create Date: 2025-02-13 09:50:43.868988 """ import 
sqlalchemy as sa import sqlmodel from alembic import op # revision identifiers, used by Alembic. revision = "7fde94be79e4" down_revision = "21d314490e6a" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column( sa.Column( "incident_name_template", sqlmodel.sql.sqltypes.AutoString(), nullable=True, ) ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_column("incident_name_template") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-13-17-27_90e2d22edc6a.py ================================================ """WF index Revision ID: 90e2d22edc6a Revises: 8176d7153747 Create Date: 2025-02-13 17:27:56.350500 """ from alembic import op from sqlalchemy import text # revision identifiers, used by Alembic. revision = "90e2d22edc6a" down_revision = "8176d7153747" branch_labels = None depends_on = None def upgrade() -> None: conn = op.get_bind() dialect_name = op.get_context().dialect.name try: conn.execute( text("COMMIT") ) # Close existing transaction, otherwise it will fail on PG on the next step except Exception: pass # No transaction to commit try: if dialect_name == "mysql": # MySQL allows/requires length for string columns in indexes op.create_index( "idx_status_started", "workflowexecution", [(text("status(255)")), "started"], ) else: # PostgreSQL and SQLite don't need/support length specifications op.create_index( "idx_status_started", "workflowexecution", ["status", "started"] ) except Exception as e: print(f"Error creating index raised error: {e}") print("Index idx_status_started already exists. 
It's ok.") def downgrade() -> None: op.drop_index("idx_status_started", table_name="workflowexecution") ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-18-18-09_876a424d8f06.py ================================================ """Extend dismissed enrichments with SUPPRESSED status Revision ID: 876a424d8f06 Revises: 8176d7153747 Create Date: 2025-02-18 18:09:40.656808 """ from alembic import op from sqlalchemy import and_, null from sqlalchemy.orm.attributes import flag_modified from sqlmodel import Session from keep.api.core.db_utils import get_json_extract_field from keep.api.models.alert import AlertStatus from keep.api.models.db.alert import AlertEnrichment # revision identifiers, used by Alembic. revision = "876a424d8f06" down_revision = "8176d7153747" branch_labels = None depends_on = None def populate_db(): session = Session(op.get_bind()) dismissed_field = get_json_extract_field(session, AlertEnrichment.enrichments, "dismissed") status_field = get_json_extract_field(session, AlertEnrichment.enrichments, "status") enrichments = session.query(AlertEnrichment).filter( and_( dismissed_field.in_(['true', 'True']), status_field.is_(null()) ) ).all() for enrichment in enrichments: enrichment.enrichments['status'] = AlertStatus.SUPPRESSED.value flag_modified(enrichment, "enrichments") session.add(enrichment) session.commit() def upgrade() -> None: populate_db() def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-19-15-32_35ebba262eb0.py ================================================ """Merge heads Revision ID: 35ebba262eb0 Revises: 90e2d22edc6a, 876a424d8f06 Create Date: 2025-02-19 15:32:56.689105 """ # revision identifiers, used by Alembic. 
revision = "35ebba262eb0" down_revision = ("90e2d22edc6a", "876a424d8f06") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-20-23-15_ea25d9402518.py ================================================ """Add idx_alert_tenant_provider index Revision ID: ea25d9402518 Revises: 35ebba262eb0 Create Date: 2025-02-20 23:15:59.831382 """ from alembic import op # revision identifiers, used by Alembic. revision = "ea25d9402518" down_revision = "35ebba262eb0" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("alert", schema=None) as batch_op: if not op.get_bind().dialect.has_index( op.get_bind(), "alert", "idx_alert_tenant_provider" ): batch_op.create_index( "idx_alert_tenant_provider", ["tenant_id", "provider_id"], unique=False ) def downgrade() -> None: with op.batch_alter_table("alert", schema=None) as batch_op: batch_op.drop_index("idx_alert_tenant_provider") ================================================ FILE: keep/api/models/db/migrations/versions/2025-02-25-14-20_a82154690f35.py ================================================ """TopologyApplication repository default_value Revision ID: a82154690f35 Revises: ea25d9402518 Create Date: 2025-02-25 14:20:04.175052 """ import sqlalchemy as sa from alembic import op from sqlalchemy import text from sqlalchemy.orm import Session # revision identifiers, used by Alembic. 
revision = "a82154690f35"
down_revision = "ea25d9402518"
branch_labels = None
depends_on = None


def prepare_data():
    """Replace NULL ``description`` / ``repository`` values with empty strings.

    Runs before the columns are made NOT NULL in ``upgrade``.
    """
    session = Session(op.get_bind())
    session.execute(text("UPDATE topologyapplication set description = '' where description is null"))
    session.execute(text("UPDATE topologyapplication set repository = '' where repository is null"))


def upgrade() -> None:
    """Backfill NULLs, then make ``description`` and ``repository`` NOT NULL."""
    prepare_data()

    with op.batch_alter_table("topologyapplication", schema=None) as batch_op:
        batch_op.alter_column("description", existing_type=sa.VARCHAR(255), nullable=False)
        batch_op.alter_column("repository", existing_type=sa.VARCHAR(255), nullable=False)


def downgrade() -> None:
    """Make ``repository`` and ``description`` nullable again (data is untouched)."""
    with op.batch_alter_table("topologyapplication", schema=None) as batch_op:
        batch_op.alter_column("repository", existing_type=sa.VARCHAR(255), nullable=True)
        batch_op.alter_column("description", existing_type=sa.VARCHAR(255), nullable=True)


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-05-15-55_0b80bda47ee2.py
================================================
"""Custom Images

Revision ID: 0b80bda47ee2
Revises: a82154690f35
Create Date: 2025-03-05 15:55:27.653706

"""

import sqlalchemy as sa
import sqlmodel
from alembic import op

# revision identifiers, used by Alembic.
revision = "0b80bda47ee2"
down_revision = "a82154690f35"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Create the ``providerimage`` table (per-tenant image blob plus audit fields)."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "providerimage",
        sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("image_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("image_blob", sa.LargeBinary(), nullable=True),
        sa.Column("last_updated", sa.DateTime(), nullable=False),
        sa.Column(
            "updated_by", sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False
        ),
        sa.ForeignKeyConstraint(
            ["tenant_id"],
            ["tenant.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the ``providerimage`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("providerimage")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-11-16-54_16309df224d1.py
================================================
"""Add_unique_constraint_for_alert_fingerprint_and_tenant_id

Revision ID: 16309df224d1
Revises: 0b80bda47ee2
Create Date: 2025-03-11 16:54:14.972144

"""

import sqlalchemy as sa
import sqlmodel
from alembic import op

# revision identifiers, used by Alembic.
revision = "16309df224d1"
down_revision = "0b80bda47ee2"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Replace the unique constraint on ``alert_fingerprint`` with a per-tenant one.

    Steps, in order:
      1. Copy every ``alertenrichment`` row into the backup table
         ``alertenrichment_before_tenant_fingerprint_constraint``.
      2. Per dialect (mysql / postgresql / sqlite), remove the old unique
         constraint on ``alert_fingerprint`` (SQLite does this by dropping and
         recreating the table from the backup).
      3. Delete duplicate rows, keeping only the newest row (by ``timestamp``)
         for each ``(tenant_id, alert_fingerprint)`` pair.
      4. Create ``uc_alertenrichment_tenant_fingerprint`` on
         ``(tenant_id, alert_fingerprint)``.

    Dialects other than the three listed only get the backup table.
    """
    # ### commands auto generated by Alembic - please adjust! ###

    conn = op.get_bind()
    dialect = conn.dialect.name

    # Backup table with the same columns and the OLD single-column unique constraint.
    op.create_table(
        "alertenrichment_before_tenant_fingerprint_constraint",
        sa.Column("enrichments", sa.JSON(), nullable=True),
        sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False),
        sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("timestamp", sa.DateTime(), nullable=False),
        sa.Column(
            "alert_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False
        ),
        sa.ForeignKeyConstraint(
            ["tenant_id"],
            ["tenant.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("alert_fingerprint"),
    )

    # Copy existing data
    op.execute(
        """ INSERT INTO alertenrichment_before_tenant_fingerprint_constraint (id, tenant_id, alert_fingerprint, timestamp, enrichments) SELECT id, tenant_id, alert_fingerprint, timestamp, enrichments FROM alertenrichment; """
    )

    if dialect == "mysql":
        try:
            op.drop_constraint("alert_fingerprint", "alertenrichment", type_="unique")
        except Exception:
            # ignore because this constraint may not exist in prod
            pass

        # Keep only the most recent row per (tenant_id, alert_fingerprint).
        op.execute(
            """ WITH duplicates AS ( SELECT id, ROW_NUMBER() OVER ( PARTITION BY tenant_id, alert_fingerprint ORDER BY timestamp DESC ) AS rn FROM alertenrichment ) DELETE FROM alertenrichment WHERE id IN (SELECT id FROM duplicates WHERE rn > 1); """
        )
        with op.batch_alter_table("alertenrichment") as batch_op:
            batch_op.create_unique_constraint(
                "uc_alertenrichment_tenant_fingerprint",
                ["tenant_id", "alert_fingerprint"],
            )
    elif dialect == "postgresql":
        # Look the old constraint up in the catalog so the drop is only attempted if present.
        constraint_exists = conn.execute(
            sa.text(
                """ SELECT conname FROM pg_constraint WHERE conrelid = 'alertenrichment'::regclass AND conname = 'alert_fingerprint'; """
            )
        ).fetchone()

        with op.batch_alter_table("alertenrichment") as batch_op:
            if constraint_exists:
                batch_op.drop_constraint("alert_fingerprint", type_="unique")
            # Keep only the most recent row per (tenant_id, alert_fingerprint).
            batch_op.execute(
                """ WITH duplicates AS ( SELECT id, ROW_NUMBER() OVER ( PARTITION BY tenant_id, alert_fingerprint ORDER BY timestamp DESC ) AS rn FROM alertenrichment ) DELETE FROM alertenrichment WHERE id IN (SELECT id FROM duplicates WHERE rn > 1); """
            )
            batch_op.create_unique_constraint(
                "uc_alertenrichment_tenant_fingerprint",
                ["tenant_id", "alert_fingerprint"],
            )
    elif dialect == "sqlite":
        # SQLite path: rebuild the table without the old constraint, then
        # restore the rows from the backup created above.
        op.execute("DROP TABLE alertenrichment;")
        op.create_table(
            "alertenrichment",
            sa.Column("enrichments", sa.JSON(), nullable=True),
            sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False),
            sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
            sa.Column("timestamp", sa.DateTime(), nullable=False),
            sa.Column(
                "alert_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False
            ),
            sa.ForeignKeyConstraint(
                ["tenant_id"],
                ["tenant.id"],
            ),
            sa.PrimaryKeyConstraint("id"),
        )
        # Copy existing data
        op.execute(
            """ INSERT INTO alertenrichment (id, tenant_id, alert_fingerprint, timestamp, enrichments) SELECT id, tenant_id, alert_fingerprint, timestamp, enrichments FROM alertenrichment_before_tenant_fingerprint_constraint; """
        )
        # Keep only the most recent row per (tenant_id, alert_fingerprint).
        op.execute(
            """ WITH duplicates AS ( SELECT id, ROW_NUMBER() OVER ( PARTITION BY tenant_id, alert_fingerprint ORDER BY timestamp DESC ) AS rn FROM alertenrichment ) DELETE FROM alertenrichment WHERE id IN (SELECT id FROM duplicates WHERE rn > 1); """
        )
        with op.batch_alter_table("alertenrichment") as batch_op:
            batch_op.create_unique_constraint(
                "uc_alertenrichment_tenant_fingerprint",
                ["tenant_id", "alert_fingerprint"],
            )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Restore the single-column unique constraint on ``alert_fingerprint``.

    Rows deleted as duplicates during upgrade are not restored, and the backup
    table created by ``upgrade`` is not dropped here.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    conn = op.get_bind()
    dialect = conn.dialect.name

    if dialect == "mysql":
        # The tenant_id foreign key is dropped before the composite unique
        # constraint and re-created right after it.
        # NOTE(review): presumably because MySQL uses that index to back the FK — confirm.
        op.execute(
            "ALTER TABLE alertenrichment DROP FOREIGN KEY alertenrichment_ibfk_1;"
        )
        op.drop_constraint(
            "uc_alertenrichment_tenant_fingerprint", "alertenrichment", type_="unique"
        )
        op.execute(
            """ ALTER TABLE alertenrichment ADD CONSTRAINT alertenrichment_ibfk_1 FOREIGN KEY (tenant_id) REFERENCES tenant(id) ON DELETE CASCADE ON UPDATE CASCADE; """
        )
        op.create_unique_constraint(
            "alert_fingerprint", "alertenrichment", ["alert_fingerprint"]
        )
    elif dialect == "postgresql":
        op.drop_constraint(
            "uc_alertenrichment_tenant_fingerprint", "alertenrichment", type_="unique"
        )
        op.create_unique_constraint(
            "alert_fingerprint", "alertenrichment", ["alert_fingerprint"]
        )
    elif dialect == "sqlite":
        # SQLite path: build a replacement table carrying the old constraint,
        # copy the rows across, then swap it in under the original name.
        op.create_table(
            "alertenrichment_new",
            sa.Column("enrichments", sa.JSON(), nullable=True),
            sa.Column("id", sqlmodel.sql.sqltypes.types.Uuid(), nullable=False),
            sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
            sa.Column("timestamp", sa.DateTime(), nullable=False),
            sa.Column(
                "alert_fingerprint", sqlmodel.sql.sqltypes.AutoString(), nullable=False
            ),
            sa.ForeignKeyConstraint(
                ["tenant_id"],
                ["tenant.id"],
            ),
            sa.PrimaryKeyConstraint("id"),
            sa.UniqueConstraint("alert_fingerprint"),
        )
        op.execute(
            """ INSERT INTO alertenrichment_new (id, tenant_id, alert_fingerprint, timestamp, enrichments) SELECT id, tenant_id, alert_fingerprint, timestamp, enrichments FROM alertenrichment; """
        )
        op.execute("DROP TABLE alertenrichment;")
        op.execute("ALTER TABLE alertenrichment_new RENAME TO alertenrichment;")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-12-13-22_ab333148350e.py
================================================
"""Running incident number

Revision ID: ab333148350e
Revises: 0b80bda47ee2
Create Date: 2025-03-12 13:22:48.372003

"""

import sqlalchemy as sa
import sqlmodel
from alembic import op

# revision identifiers, used by Alembic.
revision = "ab333148350e"
down_revision = "0b80bda47ee2"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add ``incident.running_number`` (unique per tenant) and ``rule.incident_prefix``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("incident", schema=None) as batch_op:
        batch_op.add_column(sa.Column("running_number", sa.Integer(), nullable=True))
        # Unique index; on PostgreSQL and SQLite it is partial and only
        # covers rows that actually have a running number.
        batch_op.create_index(
            "ix_incident_tenant_running_number",
            ["tenant_id", "running_number"],
            unique=True,
            postgresql_where=sa.text("running_number IS NOT NULL"),
            sqlite_where=sa.text("running_number IS NOT NULL"),
        )

    with op.batch_alter_table("rule", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column(
                "incident_prefix",
                sqlmodel.sql.sqltypes.AutoString(length=10),
                nullable=True,
            )
        )

    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the running-number index and column, and ``rule.incident_prefix``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("incident", schema=None) as batch_op:
        batch_op.drop_index(
            "ix_incident_tenant_running_number",
            postgresql_where=sa.text("running_number IS NOT NULL"),
            sqlite_where=sa.text("running_number IS NOT NULL"),
        )
        batch_op.drop_column("running_number")

    with op.batch_alter_table("rule", schema=None) as batch_op:
        batch_op.drop_column("incident_prefix")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-12-14-36_9f11356d8ed9.py
================================================
"""empty message

Revision ID: 9f11356d8ed9
Revises: 16309df224d1, ab333148350e
Create Date: 2025-03-12 14:36:09.529471

"""

# revision identifiers, used by Alembic.
revision = "9f11356d8ed9" down_revision = ("16309df224d1", "ab333148350e") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-03-12-14-46_ca74b4a04371.py ================================================ """Add alertraw index and error Revision ID: ca74b4a04371 Revises: 0b80bda47ee2 Create Date: 2025-03-06 10:46:23.453102 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "ca74b4a04371" down_revision = "9f11356d8ed9" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("alertraw", schema=None) as batch_op: # Get database connection to check dialect conn = op.get_bind() dialect_name = conn.dialect.name # Handle PostgreSQL differently to avoid NOT NULL violation if dialect_name == "postgresql": # First add the columns as nullable batch_op.add_column(sa.Column("error", sa.Boolean(), nullable=True, server_default=sa.false())) batch_op.add_column( sa.Column("error_message", sa.String(length=2048), nullable=True) ) batch_op.add_column(sa.Column("dismissed", sa.Boolean(), nullable=True, server_default=sa.false())) # Set default values for the new columns batch_op.alter_column( "error", nullable=False, server_default=sa.text("false") ) batch_op.alter_column( "dismissed", nullable=False, server_default=sa.text("false") ) else: # For MySQL if dialect_name == "mysql": batch_op.add_column( sa.Column( "error", sa.Boolean(), nullable=False, server_default=sa.text("0"), ) ) batch_op.add_column( sa.Column("error_message", sa.String(length=2048), nullable=True) ) batch_op.add_column( sa.Column( "dismissed", sa.Boolean(), nullable=False, server_default=sa.text("0"), ) ) else: # SQLite and others batch_op.add_column( sa.Column( "error", sa.Boolean(), nullable=False, 
server_default=sa.text("false"), ) ) batch_op.add_column( sa.Column("error_message", sa.String(length=2048), nullable=True) ) batch_op.add_column( sa.Column( "dismissed", sa.Boolean(), nullable=False, server_default="false", ) ) # Common operations for all dialects batch_op.add_column(sa.Column("dismissed_at", sa.DateTime(), nullable=True)) batch_op.add_column( sa.Column("dismissed_by", sa.String(length=255), nullable=True) ) batch_op.create_index( "ix_alert_raw_tenant_id_error", ["tenant_id", "error"], unique=False ) batch_op.create_index( "ix_alert_raw_tenant_id_timestamp", ["tenant_id", "timestamp"], unique=False ) batch_op.create_index(batch_op.f("ix_alertraw_error"), ["error"], unique=False) batch_op.create_index( batch_op.f("ix_alertraw_tenant_id"), ["tenant_id"], unique=False ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("alertraw", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_alertraw_tenant_id")) batch_op.drop_index(batch_op.f("ix_alertraw_error")) batch_op.drop_index("ix_alert_raw_tenant_id_timestamp") batch_op.drop_index("ix_alert_raw_tenant_id_error") batch_op.drop_column("error_message") batch_op.drop_column("error") batch_op.drop_column("dismissed_by") batch_op.drop_column("dismissed_at") batch_op.drop_column("dismissed") # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-03-13-14-08_c0e70149c9ec.py ================================================ """Unique api key reference Revision ID: c0e70149c9ec Revises: ca74b4a04371 Create Date: 2025-03-13 14:08:22.939513 """ from alembic import op # revision identifiers, used by Alembic. 
revision = "c0e70149c9ec" down_revision = "ca74b4a04371" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("tenantapikey", schema=None) as batch_op: batch_op.create_unique_constraint( "unique_tenant_to_reference", ["tenant_id", "reference_id"] ) def downgrade() -> None: with op.batch_alter_table("tenantapikey", schema=None) as batch_op: batch_op.drop_constraint("unique_tenant_to_reference", type_="unique") ================================================ FILE: keep/api/models/db/migrations/versions/2025-03-14-15-52_f3ecc7411f38.py ================================================ """Add is_candidate and is_visible flags to Incident to replace is_confirmed Revision ID: f3ecc7411f38 Revises: c0e70149c9ec Create Date: 2025-03-07 15:52:10.729973 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "f3ecc7411f38" down_revision = "c0e70149c9ec" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column(sa.Column("is_candidate", sa.Boolean(), server_default=sa.false(), nullable=False)) batch_op.add_column(sa.Column("is_visible", sa.Boolean(), server_default=sa.true(), nullable=False)) with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.execute("""UPDATE incident SET is_candidate = not is_confirmed""") batch_op.drop_column("is_confirmed") def downgrade() -> None: with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.add_column( sa.Column( "is_confirmed", sa.BOOLEAN(), server_default=sa.false(), nullable=False, ) ) with op.batch_alter_table("incident", schema=None) as batch_op: batch_op.execute("""UPDATE incident SET is_confirmed = not is_candidate""") batch_op.drop_column("is_visible") batch_op.drop_column("is_candidate") ================================================ FILE: keep/api/models/db/migrations/versions/2025-03-16-11-08_aff0128aa8f1.py 
================================================
"""multi-level mapping

Revision ID: aff0128aa8f1
Revises: f3ecc7411f38
Create Date: 2025-03-16 11:08:09.846457

"""

import sqlalchemy as sa
import sqlmodel
from alembic import op

# revision identifiers, used by Alembic.
revision = "aff0128aa8f1"
down_revision = "f3ecc7411f38"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the multi-level mapping columns to ``mappingrule`` and ``rule``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("mappingrule", schema=None) as batch_op:
        batch_op.add_column(sa.Column("is_multi_level", sa.Boolean(), nullable=False, server_default=sa.false()))
        batch_op.add_column(
            sa.Column(
                "new_property_name",
                sqlmodel.sql.sqltypes.AutoString(length=255),
                nullable=True,
            )
        )
        batch_op.add_column(
            sa.Column(
                "prefix_to_remove",
                sqlmodel.sql.sqltypes.AutoString(length=255),
                nullable=True,
            )
        )

    with op.batch_alter_table("rule", schema=None) as batch_op:
        # NOTE(review): this default is spelled "(FALSE)" while
        # mappingrule.is_multi_level above uses sa.false() — confirm the
        # difference is intentional.
        batch_op.add_column(
            sa.Column(
                "multi_level",
                sa.Boolean(),
                nullable=False,
                server_default=sa.text("(FALSE)"),
            )
        )
        batch_op.add_column(
            sa.Column(
                "multi_level_property_name",
                sqlmodel.sql.sqltypes.AutoString(),
                nullable=True,
            )
        )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the multi-level mapping columns from ``mappingrule`` and ``rule``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("mappingrule", schema=None) as batch_op:
        batch_op.drop_column("prefix_to_remove")
        batch_op.drop_column("new_property_name")
        batch_op.drop_column("is_multi_level")

    with op.batch_alter_table("rule", schema=None) as batch_op:
        batch_op.drop_column("multi_level_property_name")
        batch_op.drop_column("multi_level")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-18-14-54_971abbbf0a2c.py
================================================
"""Default workflow_id=None for WorkflowExecution for test runs

Revision ID: 971abbbf0a2c
Revises: c0880e315ebe
Create Date: 2025-03-18 14:54:56.003392

"""

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import mysql

# revision identifiers, used by Alembic.
revision = "971abbbf0a2c"
down_revision = "c0880e315ebe"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Make ``workflowexecution.workflow_id`` NOT NULL with default ``'test'``.

    Steps, in order:
      1. Inspect the column's nullability and find the foreign key that points
         ``workflow_id`` at ``workflow``; drop that key if it has a name.
      2. Insert a placeholder tenant (``system-test-workflow``) and a
         placeholder workflow (``test``) unless they already exist.
      3. If the column was nullable, replace NULL ``workflow_id`` values with
         ``'test'``.
      4. PostgreSQL: issue ``COMMIT``, then run each DDL statement as raw SQL.
         Other dialects: do the same work through ``batch_alter_table``.
      5. Re-create the 4-column index and the foreign key
         (``ON DELETE SET DEFAULT``); failures in these two steps are printed
         and do not fail the migration.
    """
    # First check if the column is nullable (for those who haven't migrated yet)
    connection = op.get_bind()
    dialect = connection.dialect.name
    inspector = sa.inspect(connection)
    columns = inspector.get_columns("workflowexecution")
    workflow_id_column = next((c for c in columns if c["name"] == "workflow_id"), None)
    # A missing column entry is treated the same as a nullable column.
    is_nullable = (
        workflow_id_column.get("nullable", True) if workflow_id_column else True
    )

    # Find the actual foreign key constraint name for workflow_id
    foreign_keys = inspector.get_foreign_keys("workflowexecution")
    workflow_fk = None
    for fk in foreign_keys:
        if (
            "workflow_id" in fk.get("constrained_columns", [])
            and fk.get("referred_table") == "workflow"
        ):
            workflow_fk = fk
            break

    fk_name = workflow_fk.get("name") if workflow_fk else None

    # Drop the foreign key constraint if it exists
    if fk_name:
        with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
            batch_op.drop_constraint(fk_name, type_="foreignkey")

    # First create a "test" workflow if it doesn't exist
    # This helps maintain referential integrity
    op.execute(
        """ INSERT INTO tenant (id, name) SELECT 'system-test-workflow', 'System Test Workflow Tenant' WHERE NOT EXISTS (SELECT 1 FROM tenant WHERE id = 'system-test-workflow') """
    )

    # Then create the test workflow using this tenant
    op.execute(
        """ INSERT INTO workflow (id, tenant_id, name, description, created_by, creation_time, workflow_raw, is_deleted, is_disabled, revision, last_updated) SELECT 'test', 'system-test-workflow', 'Test Workflow', 'Auto-generated test workflow for unassociated executions', 'system', CURRENT_TIMESTAMP, '{}', FALSE, FALSE, 1, CURRENT_TIMESTAMP WHERE NOT EXISTS (SELECT 1 FROM workflow WHERE id = 'test') """
    )

    # Update NULL values to 'test' if needed
    if is_nullable:
        op.execute(
            "UPDATE workflowexecution SET workflow_id = 'test' WHERE workflow_id IS NULL"
        )

    # Handle PostgreSQL transaction error - commit the changes made so far
    # This prevents the "current transaction is aborted" error
    if dialect == "postgresql":
        op.execute("COMMIT")

    # Conditionally check if indexes exist before dropping
    indexes = inspector.get_indexes("workflowexecution")
    index_names = [idx["name"] for idx in indexes]

    # For PostgreSQL, we need to handle each operation separately
    # to avoid transaction errors cascading
    if dialect == "postgresql":
        # Drop indexes if they exist
        if "idx_status_started" in index_names:
            op.execute("DROP INDEX idx_status_started")
        if "idx_workflowexecution_workflow_tenant_started_status" in index_names:
            op.execute(
                "DROP INDEX idx_workflowexecution_workflow_tenant_started_status"
            )

        # Make column NOT NULL with default 'test'
        op.execute(
            "ALTER TABLE workflowexecution ALTER COLUMN workflow_id SET NOT NULL"
        )
        op.execute(
            "ALTER TABLE workflowexecution ALTER COLUMN workflow_id SET DEFAULT 'test'"
        )

        # Try to create the new index
        try:
            op.execute(
                "CREATE INDEX idx_workflowexecution_workflow_tenant_started_status ON "
                "workflowexecution (workflow_id, tenant_id, started, status)"
            )
        except Exception as e:
            print(f"Note: Index creation skipped - {str(e)}")

        # Add the foreign key back
        try:
            op.execute(
                "ALTER TABLE workflowexecution ADD CONSTRAINT fk_workflowexecution_workflow "
                "FOREIGN KEY (workflow_id) REFERENCES workflow(id) ON DELETE SET DEFAULT"
            )
        except Exception as e:
            print(f"Note: Foreign key creation skipped - {str(e)}")
    else:
        # For non-PostgreSQL databases, use the original approach
        with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
            # Make column NOT NULL with default 'test'
            batch_op.alter_column(
                "workflow_id",
                existing_type=(
                    mysql.VARCHAR(length=255)
                    if dialect == "mysql"
                    else sa.String(length=255)
                ),
                nullable=False,
                server_default="test",
            )

            # Only drop indexes if they exist
            if "idx_status_started" in index_names:
                batch_op.drop_index("idx_status_started")
            if "idx_workflowexecution_workflow_tenant_started_status" in index_names:
                batch_op.drop_index(
                    "idx_workflowexecution_workflow_tenant_started_status"
                )

        # Create new index (this will fail if it already exists)
        try:
            # Create the index based on dialect
            if dialect == "mysql":
                # Raw SQL so `status` can be given an explicit 255-character prefix length.
                op.execute(
                    "CREATE INDEX idx_workflowexecution_workflow_tenant_started_status ON "
                    "workflowexecution (workflow_id, tenant_id, started, status(255))"
                )
            else:
                # SQLite or other dialects
                with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
                    batch_op.create_index(
                        "idx_workflowexecution_workflow_tenant_started_status",
                        ["workflow_id", "tenant_id", "started", "status"],
                        unique=False,
                    )
        except Exception as e:
            # Log that the index already exists, but don't fail the migration
            print(f"Note: Index creation skipped - {str(e)}")

        # Add the foreign key back
        # A fresh inspector is built here before re-reading the foreign keys.
        inspector = sa.inspect(connection)
        foreign_keys = inspector.get_foreign_keys("workflowexecution")
        has_workflow_fk = any(
            fk.get("referred_table") == "workflow"
            and "workflow_id" in fk.get("constrained_columns", [])
            for fk in foreign_keys
        )

        if not has_workflow_fk:
            try:
                with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
                    batch_op.create_foreign_key(
                        None,  # Let the database generate a name
                        "workflow",
                        ["workflow_id"],
                        ["id"],
                        ondelete="SET DEFAULT",
                    )
            except Exception as e:
                print(f"Note: Foreign key creation skipped - {str(e)}")


def downgrade() -> None:
    """Make ``workflow_id`` nullable again and turn ``'test'`` values back into NULL.

    Also drops the 4-column index where present and re-creates
    ``idx_status_started`` when it is missing. The placeholder tenant and
    workflow rows inserted by ``upgrade`` are not removed.
    """
    # Similar defensive approach for downgrade
    connection = op.get_bind()
    dialect = connection.dialect.name
    inspector = sa.inspect(connection)
    indexes = inspector.get_indexes("workflowexecution")
    index_names = [idx["name"] for idx in indexes]

    # Handle PostgreSQL separately for the downgrade as well
    if dialect == "postgresql":
        # Drop index if it exists
        if "idx_workflowexecution_workflow_tenant_started_status" in index_names:
            op.execute(
                "DROP INDEX idx_workflowexecution_workflow_tenant_started_status"
            )

        # Create other indexes if needed
        if "idx_status_started" not in index_names:
            try:
                op.execute(
                    "CREATE INDEX idx_status_started ON workflowexecution (status, started)"
                )
            except Exception:
                pass

        # Make column nullable again and remove default
        op.execute(
            "ALTER TABLE workflowexecution ALTER COLUMN workflow_id DROP NOT NULL"
        )
        op.execute(
            "ALTER TABLE workflowexecution ALTER COLUMN workflow_id DROP DEFAULT"
        )

        # Convert 'test' values back to NULL
        op.execute(
            "UPDATE workflowexecution SET workflow_id = NULL WHERE workflow_id = 'test'"
        )
    else:
        # For non-PostgreSQL databases
        with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
            # Only try to drop if it exists
            if "idx_workflowexecution_workflow_tenant_started_status" in index_names:
                batch_op.drop_index(
                    "idx_workflowexecution_workflow_tenant_started_status"
                )

            # Recreate indexes if they don't exist
            # NOTE(review): this re-creates the same index name and columns
            # that were dropped just above — confirm this is intended.
            try:
                batch_op.create_index(
                    "idx_workflowexecution_workflow_tenant_started_status",
                    ["workflow_id", "tenant_id", "started", "status"],
                    unique=False,
                    mysql_length={"status": 255},
                )
            except Exception:
                pass

            # Conditionally check if indexes exist before adding
            indexes = inspector.get_indexes("workflowexecution")
            index_names = [idx["name"] for idx in indexes]
            if "idx_status_started" not in index_names:
                try:
                    batch_op.create_index(
                        "idx_status_started",
                        ["status", "started"],
                        unique=False,
                        mysql_length={"status": 255},
                    )
                except Exception:
                    pass

            # Make column nullable again
            batch_op.alter_column(
                "workflow_id",
                existing_type=(
                    mysql.VARCHAR(length=255)
                    if dialect == "mysql"
                    else sa.String(length=255)
                ),
                nullable=True,
                server_default=None,
            )

        # Convert 'test' values back to NULL
        op.execute(
            "UPDATE workflowexecution SET workflow_id = NULL WHERE workflow_id = 'test'"
        )


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-20-09-37_c0880e315ebe.py
================================================
"""Convert incident name fields to text

Revision ID: c0880e315ebe
Revises: aff0128aa8f1
Create Date: 2025-03-20 09:37:38.596306

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "c0880e315ebe"
down_revision = "aff0128aa8f1"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Change ``incident.user_generated_name`` / ``ai_generated_name`` to TEXT."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("incident", schema=None) as batch_op:
        batch_op.alter_column(
            "user_generated_name",
            existing_type=sa.VARCHAR(),
            type_=sa.TEXT(),
            existing_nullable=True,
        )
        batch_op.alter_column(
            "ai_generated_name",
            existing_type=sa.VARCHAR(),
            type_=sa.TEXT(),
            existing_nullable=True,
        )


def downgrade() -> None:
    """Change both incident name columns back to ``VARCHAR(255)``.

    NOTE(review): values longer than 255 characters may be truncated or
    rejected depending on the database — confirm before downgrading.
    """
    with op.batch_alter_table("incident", schema=None) as batch_op:
        batch_op.alter_column(
            "ai_generated_name",
            existing_type=sa.TEXT(),
            type_=sa.VARCHAR(255),
            existing_nullable=True,
        )
        batch_op.alter_column(
            "user_generated_name",
            existing_type=sa.TEXT(),
            type_=sa.VARCHAR(255),
            existing_nullable=True,
        )


================================================
FILE: keep/api/models/db/migrations/versions/2025-03-24-14-26_2a6132b443ab.py
================================================
"""remove_alert_fingerprint_constraint_for_postgresql

Revision ID: 2a6132b443ab
Revises: 971abbbf0a2c
Create Date: 2025-03-24 14:26:11.506748

"""

import sqlalchemy as sa
from alembic import op

# revision
identifiers, used by Alembic. revision = "2a6132b443ab" down_revision = "971abbbf0a2c" branch_labels = None depends_on = None def upgrade() -> None: conn = op.get_bind() dialect = conn.dialect.name if dialect == "postgresql": constraint_exists = conn.execute( sa.text( """ SELECT conname FROM pg_constraint WHERE conrelid = 'alertenrichment'::regclass AND conname = 'alertenrichment_alert_fingerprint_key'; """ ) ).fetchone() with op.batch_alter_table("alertenrichment") as batch_op: if constraint_exists: batch_op.drop_constraint( "alertenrichment_alert_fingerprint_key", type_="unique" ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### conn = op.get_bind() dialect = conn.dialect.name if dialect == "postgresql": with op.batch_alter_table("alertenrichment") as batch_op: batch_op.create_unique_constraint( "alertenrichment_alert_fingerprint_key", ["alert_fingerprint"], ) # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-03-30-10-53_e663a98b1142.py ================================================ """“add-counter_shows_firing_only-column-for-preset” Revision ID: e663a98b1142 Revises: 2a6132b443ab Create Date: 2025-03-30 10:53:31.773788 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. 
revision = "e663a98b1142" down_revision = "2a6132b443ab" branch_labels = None depends_on = None def upgrade(): op.add_column( "preset", sa.Column( "counter_shows_firing_only", sa.Boolean(), nullable=True, # make it nullable to avoid issues with old rows server_default=sa.false(), # default value for new rows ), ) def downgrade(): op.drop_column("preset", "counter_shows_firing_only") ================================================ FILE: keep/api/models/db/migrations/versions/2025-04-03-12-09_bdf252fbc1be.py ================================================ """json_for_dashboard_config Revision ID: bdf252fbc1be Revises: e663a98b1142 Create Date: 2025-04-03 12:09:19.911725 """ import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import mysql # revision identifiers, used by Alembic. revision = "bdf252fbc1be" down_revision = "e663a98b1142" branch_labels = None depends_on = None def upgrade() -> None: conn = op.get_bind() if conn.dialect.name == "postgresql": result = conn.execute( sa.text( "SELECT data_type FROM information_schema.columns WHERE table_name='dashboard' AND column_name='dashboard_config';" ) ).fetchone() if result and result[0] != "json": conn.execute( sa.text( "ALTER TABLE dashboard ALTER COLUMN dashboard_config TYPE JSON USING dashboard_config::json;" ) ) elif conn.dialect.name == "mysql": result = conn.execute( sa.text("SHOW COLUMNS FROM dashboard WHERE Field='dashboard_config';") ).fetchone() if result and "json" not in result[1].lower(): op.alter_column("dashboard", "dashboard_config", type_=mysql.JSON) def downgrade() -> None: conn = op.get_bind() if conn.dialect.name == "postgresql": result = conn.execute( sa.text( "SELECT data_type FROM information_schema.columns WHERE table_name='dashboard' AND column_name='dashboard_config';" ) ).fetchone() if result and result[0] == "json": op.alter_column("dashboard", "dashboard_config", type_=sa.Text) elif conn.dialect.name == "mysql": result = conn.execute( sa.text("SHOW COLUMNS FROM 
dashboard WHERE Field='dashboard_config';") ).fetchone() if result and "json" in result[1].lower(): op.alter_column("dashboard", "dashboard_config", type_=sa.Text) ================================================ FILE: keep/api/models/db/migrations/versions/2025-04-04-21-48_0dafe96ea97f.py ================================================ """auto delete provider logs Revision ID: 0dafe96ea97f Revises: e663a98b1142 Create Date: 2025-04-04 21:48:38.282584 """ from alembic import op from sqlalchemy import inspect from sqlalchemy.dialects import mysql # revision identifiers, used by Alembic. revision = "0dafe96ea97f" down_revision = "e663a98b1142" branch_labels = None depends_on = None def upgrade() -> None: dialect = op.get_context().dialect.name if dialect == "sqlite": # SQLite doesn't support ALTER TABLE for dropping constraints # Create a new table with the desired schema, move data, drop old table, rename new table # Get table info conn = op.get_bind() inspector = inspect(conn) columns = inspector.get_columns("providerexecutionlog") column_definitions = [] # Recreate column definitions for column in columns: # Make provider_id nullable if column["name"] == "provider_id": column["nullable"] = True column_type = column["type"] nullable = "NULL" if column["nullable"] else "NOT NULL" default = ( f"DEFAULT {column['default']}" if column.get("default") is not None else "" ) column_def = f"{column['name']} {column_type} {nullable} {default}".strip() column_definitions.append(column_def) # Create new table with foreign key constraint included primary_keys = [] for column in columns: if column.get("primary_key", False): primary_keys.append(column["name"]) # Need to include primary key and foreign key in table creation primary_key_clause = ( f", PRIMARY KEY ({', '.join(primary_keys)})" if primary_keys else "" ) op.execute( f""" CREATE TABLE providerexecutionlog_new ( {", ".join(column_definitions)}{primary_key_clause}, FOREIGN KEY (provider_id) REFERENCES provider(id) ON 
DELETE CASCADE ) """ ) # Copy data op.execute( """ INSERT INTO providerexecutionlog_new SELECT * FROM providerexecutionlog """ ) # Drop old table op.drop_table("providerexecutionlog") # Rename new table op.rename_table("providerexecutionlog_new", "providerexecutionlog") # No need to separately add foreign key as it's included in table creation else: # PostgreSQL and MySQL support with op.batch_alter_table("providerexecutionlog", schema=None) as batch_op: batch_op.alter_column( "provider_id", existing_type=mysql.VARCHAR(length=255), nullable=True ) if dialect == "postgresql": batch_op.drop_constraint( "providerexecutionlog_provider_id_fkey", type_="foreignkey" ) else: batch_op.drop_constraint( "providerexecutionlog_ibfk_1", type_="foreignkey" ) batch_op.create_foreign_key( None, "provider", ["provider_id"], ["id"], ondelete="CASCADE" ) # ### end Alembic commands ### def downgrade() -> None: dialect = op.get_context().dialect.name if dialect == "sqlite": # For SQLite, recreate the table again without CASCADE # Get table info conn = op.get_bind() inspector = inspect(conn) columns = inspector.get_columns("providerexecutionlog") column_definitions = [] # Recreate column definitions for column in columns: # Make provider_id NOT NULL if column["name"] == "provider_id": column["nullable"] = False column_type = column["type"] nullable = "NULL" if column["nullable"] else "NOT NULL" default = ( f"DEFAULT {column['default']}" if column.get("default") is not None else "" ) column_def = f"{column['name']} {column_type} {nullable} {default}".strip() column_definitions.append(column_def) # Create new table with foreign key constraint included primary_keys = [] for column in columns: if column.get("primary_key", False): primary_keys.append(column["name"]) # Need to include primary key and foreign key in table creation primary_key_clause = ( f", PRIMARY KEY ({', '.join(primary_keys)})" if primary_keys else "" ) op.execute( f""" CREATE TABLE providerexecutionlog_new ( {", 
".join(column_definitions)}{primary_key_clause}, FOREIGN KEY (provider_id) REFERENCES provider(id) ) """ ) # Copy data op.execute( """ INSERT INTO providerexecutionlog_new SELECT * FROM providerexecutionlog """ ) # Drop old table op.drop_table("providerexecutionlog") # Rename new table op.rename_table("providerexecutionlog_new", "providerexecutionlog") # No need to separately add foreign key as it's included in table creation else: # PostgreSQL and MySQL downgrade with op.batch_alter_table("providerexecutionlog", schema=None) as batch_op: batch_op.drop_constraint(None, type_="foreignkey") if dialect == "postgresql": batch_op.create_foreign_key( "providerexecutionlog_provider_id_fkey", "provider", ["provider_id"], ["id"], ) else: batch_op.create_foreign_key( "providerexecutionlog_ibfk_1", "provider", ["provider_id"], ["id"] ) batch_op.alter_column( "provider_id", existing_type=mysql.VARCHAR(length=255), nullable=False ) # ### end Alembic commands ### ================================================ FILE: keep/api/models/db/migrations/versions/2025-04-06-12-18_78777e6b12d3.py ================================================ """empty message Revision ID: 78777e6b12d3 Revises: bdf252fbc1be, 0dafe96ea97f Create Date: 2025-04-06 12:18:21.809822 """ # revision identifiers, used by Alembic. revision = "78777e6b12d3" down_revision = ("bdf252fbc1be", "0dafe96ea97f") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-04-08-10-43_59991b568c7d.py ================================================ """restore_idx_status_started_index Revision ID: 59991b568c7d Revises: 78777e6b12d3 Create Date: 2025-04-08 10:43:53.361024 """ from alembic import op # revision identifiers, used by Alembic. 
revision = "59991b568c7d"
down_revision = "78777e6b12d3"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Ensure index ``idx_status_started`` exists on ``workflowexecution``.

    Each supported dialect uses its own "create if missing" strategy; any
    other dialect is left untouched.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Check if the index exists before creating it
    if op.get_bind().dialect.name == "sqlite":
        # SQLite does not support querying for index existence directly, so we attempt to create it
        op.execute(
            """ CREATE INDEX IF NOT EXISTS idx_status_started ON workflowexecution (status, started) """
        )
    elif op.get_bind().dialect.name == "mysql":
        try:
            # status(255) indexes only a 255-character prefix of the status column.
            op.execute(
                """ CREATE INDEX idx_status_started ON workflowexecution (status(255), started) """
            )
        except Exception:
            # if it fails, it means the index already exists
            pass
    elif op.get_bind().dialect.name == "postgresql":
        # The existence check looks for a relation of that name in the
        # 'public' schema only.
        op.execute(
            """ DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = 'idx_status_started' AND n.nspname = 'public' ) THEN CREATE INDEX idx_status_started ON workflowexecution (status, started); END IF; END$$; """
        )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Intentionally a no-op (see the comment below)."""
    # Nothing to do because the index idx_status_started must have been created long before this migration
    pass


================================================
FILE: keep/api/models/db/migrations/versions/2025-04-15-15-30_885ff6b12fed.py
================================================
"""Workflow Versions

Revision ID: 885ff6b12fed
Revises: 59991b568c7d
Create Date: 2025-04-15 15:30:48.099088

"""

import sqlalchemy as sa
import sqlmodel
from alembic import op

# revision identifiers, used by Alembic.
revision = "885ff6b12fed"
down_revision = "59991b568c7d"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Introduce workflow versioning.

    Steps, in order:
      1. Create the ``workflowversion`` table keyed by (workflow_id, revision).
      2. Make ``workflow.last_updated`` NOT NULL with a current-timestamp
         server default.
      3. Add ``workflowexecution.workflow_revision`` (NOT NULL, server default
         "1") and three indexes that include it.
      4. Delete executions whose workflow no longer exists, backfill
         ``workflow_revision`` from ``workflow.revision``, and seed one
         ``workflowversion`` row per existing workflow.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "workflowversion",
        sa.Column("workflow_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("revision", sa.Integer(), nullable=False),
        sa.Column("workflow_raw", sa.TEXT(), nullable=True),
        sa.Column("updated_by", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column(
            "updated_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.current_timestamp(),
            nullable=False,
        ),
        sa.Column("is_valid", sa.Boolean(), nullable=False),
        sa.Column("is_current", sa.Boolean(), nullable=False),
        sa.Column("comment", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.ForeignKeyConstraint(
            ["workflow_id"],
            ["workflow.id"],
        ),
        sa.PrimaryKeyConstraint("workflow_id", "revision"),
    )

    with op.batch_alter_table("workflow", schema=None) as batch_op:
        batch_op.alter_column(
            "last_updated",
            existing_type=sa.DateTime(timezone=True),
            server_default=sa.func.current_timestamp(),
            nullable=False,
        )

    # Then handle column and index changes
    with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column(
                "workflow_revision", sa.Integer(), nullable=False, server_default="1"
            )
        )
        batch_op.create_index(
            "idx_workflowexecution_tenant_workflow_id_revision_timestamp",
            ["tenant_id", "workflow_id", "workflow_revision", "started"],
            unique=False,
        )
        batch_op.create_index(
            "idx_workflowexecution_workflow_revision_tenant_started_status",
            [
                "workflow_id",
                "workflow_revision",
                "tenant_id",
                "started",
                "status",
            ],
            mysql_length={"status": 255},
            unique=False,
        )
        batch_op.create_index(
            "idx_workflowexecution_workflow_revision",
            ["workflow_id", "workflow_revision"],
            unique=False,
        )

    # Update existing records with their corresponding workflow revision
    connection = op.get_bind()

    # Remove orphaned workflow executions
    connection.execute(
        sa.text(
            """ DELETE FROM workflowexecution WHERE workflow_id NOT IN (SELECT id FROM workflow) """
        )
    )

    # Update workflow executions with their corresponding workflow revision, skipping null revisions
    # Rows skipped by the WHERE EXISTS clause keep the column's server default.
    connection.execute(
        sa.text(
            """ UPDATE workflowexecution SET workflow_revision = ( SELECT revision FROM workflow WHERE workflow.id = workflowexecution.workflow_id AND workflow.revision IS NOT NULL ) WHERE EXISTS ( SELECT 1 FROM workflow WHERE workflow.id = workflowexecution.workflow_id AND workflow.revision IS NOT NULL ) """
        )
    )

    # Create initial workflow versions for existing workflows
    # COALESCE fills missing revision / updated_by / last_updated values so the
    # NOT NULL columns of workflowversion can be populated.
    connection.execute(
        sa.text(
            """ INSERT INTO workflowversion ( workflow_id, revision, workflow_raw, updated_by, updated_at, is_valid, is_current, comment ) SELECT id as workflow_id, COALESCE(revision, 1) as revision, workflow_raw, COALESCE(updated_by, created_by) as updated_by, COALESCE(last_updated, CURRENT_DATE) as updated_at, true as is_valid, true as is_current, 'Initial version migration' as comment FROM workflow """
        )
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Remove workflow versioning added by ``upgrade``.

    Drops every *named* foreign key on ``workflowexecution`` (unnamed ones are
    reported and skipped), removes the three revision indexes and the
    ``workflow_revision`` column, recreates the tenant and workflow foreign
    keys under fixed names, and finally drops ``workflowversion``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # First drop foreign key constraints because they prevent dropping indexes (at least in mysql)
    inspector = sa.inspect(op.get_bind())
    foreign_keys = inspector.get_foreign_keys("workflowexecution")
    for foreign_key in foreign_keys:
        if foreign_key["name"]:
            op.drop_constraint(
                foreign_key["name"], "workflowexecution", type_="foreignkey"
            )
        else:
            print(f"foreign_key {foreign_key} has no name, skipping")

    # Then handle column and index changes
    with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
        batch_op.drop_index("idx_workflowexecution_workflow_revision")
        batch_op.drop_index(
            "idx_workflowexecution_tenant_workflow_id_revision_timestamp"
        )
        batch_op.drop_index(
            "idx_workflowexecution_workflow_revision_tenant_started_status"
        )
        batch_op.drop_column("workflow_revision")

    # Finally recreate foreign key constraints
    # NOTE(review): the "_ibfk_N" names follow MySQL's auto-naming pattern but
    # are applied on every dialect here - confirm this is intended.
    with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
        batch_op.create_foreign_key(
            "workflowexecution_ibfk_1",
            "tenant",
            ["tenant_id"],
            ["id"],
        )
        batch_op.create_foreign_key(
            "workflowexecution_ibfk_2",
            "workflow",
            ["workflow_id"],
            ["id"],
            ondelete="SET DEFAULT",
        )

    op.drop_table("workflowversion")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-04-21-10-18_819927b7ccfa.py
================================================
"""workflow is_test and workflowexecution is_test_run columns

Revision ID: 819927b7ccfa
Revises: 885ff6b12fed
Create Date: 2025-04-21 10:18:49.074198

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "819927b7ccfa"
down_revision = "885ff6b12fed"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add ``workflow.is_test`` / ``workflowexecution.is_test_run`` and purge the 'test' workflow.

    Both columns are NOT NULL booleans with a false server default. After the
    schema change, every row tied to workflow id 'test' is deleted child-first
    (logs and alert/incident relations, then executions, versions, the
    workflow itself), followed by the 'system-test-workflow' tenant.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("workflow", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column(
                "is_test",
                sa.Boolean(),
                nullable=False,
                server_default=sa.false(),
            )
        )

    with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column(
                "is_test_run",
                sa.Boolean(),
                nullable=False,
                server_default=sa.false(),
            )
        )

    # Delete all related data for deprecated system test workflow tenant, workflow and related workflowexecutions, workflowexecutionlog, etc
    # First delete workflow execution logs
    op.execute(
        """ DELETE FROM workflowexecutionlog WHERE workflow_execution_id IN ( SELECT id FROM workflowexecution WHERE workflow_id = 'test' ) """
    )

    # Delete workflow-to-alert relations
    op.execute(
        """ DELETE FROM workflowtoalertexecution WHERE workflow_execution_id IN ( SELECT id FROM workflowexecution WHERE workflow_id = 'test' ) """
    )

    # Delete workflow-to-incident relations
    op.execute(
        """ DELETE FROM workflowtoincidentexecution WHERE workflow_execution_id IN ( SELECT id FROM workflowexecution WHERE workflow_id = 'test' ) """
    )

    # Delete workflow executions
    op.execute("DELETE FROM workflowexecution WHERE workflow_id = 'test'")

    # Delete workflow version
    op.execute("DELETE FROM workflowversion WHERE workflow_id = 'test'")

    # Delete the test workflow
    op.execute("DELETE FROM workflow WHERE id = 'test'")

    # Finally delete the system test tenant
    op.execute("DELETE FROM tenant WHERE id = 'system-test-workflow'")
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the two columns added by ``upgrade``; the deleted 'test' rows are not restored."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("workflowexecution", schema=None) as batch_op:
        batch_op.drop_column("is_test_run")

    with op.batch_alter_table("workflow", schema=None) as batch_op:
        batch_op.drop_column("is_test")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-05-04-15-02_eddcb77eb6f3.py
================================================
"""Providers metadata

Revision ID: eddcb77eb6f3
Revises: 819927b7ccfa
Create Date: 2025-05-04 15:02:12.314043

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "eddcb77eb6f3"
down_revision = "819927b7ccfa"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the nullable JSON column ``provider.provider_metadata``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("provider", schema=None) as batch_op:
        batch_op.add_column(sa.Column("provider_metadata", sa.JSON(), nullable=True))
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop ``provider.provider_metadata``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("provider", schema=None) as batch_op:
        batch_op.drop_column("provider_metadata")
    # ### end Alembic commands ###


================================================
FILE: keep/api/models/db/migrations/versions/2025-05-06-13-09_7b687c555318.py
================================================
"""Recalculate alerts_count for incidents

Revision ID: 7b687c555318
Revises: eddcb77eb6f3
Create Date: 2025-05-06 13:09:27.462927

"""

# revision identifiers, used by Alembic.
revision = "7b687c555318" down_revision = "eddcb77eb6f3" branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-12-17-49_c2f78c69e9cf.py ================================================ """Recalculate alerts_count for incidents Revision ID: c2f78c69e9cf Revises: 7b687c555318 Create Date: 2025-05-12 17:49:09.779088 """ from collections import defaultdict from alembic import op from sqlalchemy import select, update from sqlalchemy.orm import Session from sqlalchemy.sql.functions import count from keep.api.models.db.alert import LastAlertToIncident from keep.api.models.db.helpers import NULL_FOR_DELETED_AT from keep.api.models.db.incident import Incident # revision identifiers, used by Alembic. revision = "c2f78c69e9cf" down_revision = "7b687c555318" branch_labels = None depends_on = None def upgrade() -> None: session = Session(op.get_bind()) counts = session.execute( select( count(LastAlertToIncident.fingerprint), LastAlertToIncident.incident_id ) .where(LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT) .group_by(LastAlertToIncident.incident_id) ).all() counts_per_incident = defaultdict(int) for count_, incident_id in counts: counts_per_incident[incident_id] = count_ incident_ids = session.execute(select(Incident.id)).scalars().all() for incident_id in incident_ids: session.execute( update(Incident) .where(Incident.id == incident_id) .values(alerts_count=counts_per_incident.get(incident_id, 0)) ) session.commit() def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-15-00-34_fcef2c58b21c.py ================================================ """Add threshold field to Rule Revision ID: fcef2c58b21c Revises: 7b687c555318 Create Date: 2025-05-15 00:34:31.753003 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by 
Alembic. revision = "fcef2c58b21c" down_revision = "7b687c555318" branch_labels = None depends_on = None def upgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.add_column(sa.Column("threshold", sa.Integer(), nullable=False, server_default="1")) batch_op.create_check_constraint("rule_threshold_positive_int_constraint", "threshold>0") def downgrade() -> None: with op.batch_alter_table("rule", schema=None) as batch_op: batch_op.drop_constraint("rule_threshold_positive_int_constraint", type_="check") batch_op.drop_column("threshold") ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-15-14-18_bedb5f07417b.py ================================================ """merge heads "c2f78c69e9cf" and "fcef2c58b21c": Add threshold field to Rule + Recalculate alerts_count for incidents Revision ID: bedb5f07417b Revises: c2f78c69e9cf, fcef2c58b21c Create Date: 2025-05-15 14:18:13.356729 """ # revision identifiers, used by Alembic. revision = "bedb5f07417b" down_revision = ("c2f78c69e9cf", "fcef2c58b21c") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-16-14-33_aa167915c4d6.py ================================================ """Add ignore_statuses to MaintenanceWindowRule Revision ID: aa167915c4d6 Revises: bedb5f07417b Create Date: 2025-05-16 14:33:29.828572 """ import sqlalchemy as sa from alembic import op from sqlmodel import Session from keep.api.models.db.maintenance_window import DEFAULT_ALERT_STATUSES_TO_IGNORE # revision identifiers, used by Alembic. 
revision = "aa167915c4d6" down_revision = "bedb5f07417b" branch_labels = None depends_on = None migration_metadata = sa.MetaData() mwr_table = sa.Table( 'maintenancewindowrule', migration_metadata, sa.Column('id', sa.Integer, primary_key=True), sa.Column('ignore_statuses', sa.JSON) ) def populate_db(): session = Session(op.get_bind()) session.execute(sa.update(mwr_table).values(ignore_statuses=DEFAULT_ALERT_STATUSES_TO_IGNORE)) def upgrade() -> None: with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.add_column(sa.Column("ignore_statuses", sa.JSON(), nullable=True)) populate_db() def downgrade() -> None: with op.batch_alter_table("maintenancewindowrule", schema=None) as batch_op: batch_op.drop_column("ignore_statuses") ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-19-18-48_90e3eababbf0.py ================================================ """merge migration between combined_commentmention and aa167915c4d6 Revision ID: 90e3eababbf0 Revises: combined_commentmention, aa167915c4d6 Create Date: 2025-05-19 18:48:20.899302 """ # revision identifiers, used by Alembic. revision = "90e3eababbf0" down_revision = ("combined_commentmention", "aa167915c4d6") branch_labels = None depends_on = None def upgrade() -> None: pass def downgrade() -> None: pass ================================================ FILE: keep/api/models/db/migrations/versions/2025-05-19-20-54_combined_commentmention.py ================================================ """Add CommentMention table with proper cascade delete Revision ID: combined_commentmention Revises: aa167915c4d6 Create Date: 2025-05-19 20:54:00.000000 """ import sqlalchemy as sa import sqlmodel from alembic import op from sqlalchemy import inspect # revision identifiers, used by Alembic. 
revision = "combined_commentmention" down_revision = "aa167915c4d6" # Same as the original parent branch_labels = None depends_on = None def upgrade() -> None: # Check if the commentmention table already exists conn = op.get_bind() inspector = inspect(conn) if "commentmention" not in inspector.get_table_names(): # Create the CommentMention table for storing user mentions in comments with CASCADE delete op.create_table( "commentmention", sa.Column("id", sa.Uuid(), nullable=False), sa.Column("comment_id", sa.Uuid(), nullable=False), sa.Column( "mentioned_user_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), sa.Column("tenant_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("created_at", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint( ["comment_id"], ["alertaudit.id"], name="fk_commentmention_alertaudit_cascade", ondelete="CASCADE", ), sa.ForeignKeyConstraint( ["tenant_id"], ["tenant.id"], name="fk_commentmention_tenant", ondelete="CASCADE", ), sa.PrimaryKeyConstraint("id", name="pk_commentmention"), sa.UniqueConstraint( "comment_id", "mentioned_user_id", name="uq_comment_mention" ), ) # Create indexes op.create_index( "ix_comment_mention_comment_id", "commentmention", ["comment_id"], unique=False, ) op.create_index( "ix_comment_mention_mentioned_user_id", "commentmention", ["mentioned_user_id"], unique=False, ) op.create_index( "ix_comment_mention_tenant_id", "commentmention", ["tenant_id"], unique=False, ) def downgrade() -> None: # Drop the table if it exists conn = op.get_bind() inspector = inspect(conn) if "commentmention" in inspector.get_table_names(): op.drop_table("commentmention") ================================================ FILE: keep/api/models/db/migrations/versions/2025-06-04-10-43_7c14f776ef6b.py ================================================ """add rule assignee Revision ID: 7c14f776ef6b Revises: 90e3eababbf0 Create Date: 2025-06-04 10:43:04.805408 """ import sqlalchemy as sa import sqlmodel from alembic 
# revision identifiers, used by Alembic.
revision = "7c14f776ef6b"
down_revision = "90e3eababbf0"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add a nullable string column ``assignee`` to the ``rule`` table."""
    assignee_column = sa.Column(
        "assignee", sqlmodel.sql.sqltypes.AutoString(), nullable=True
    )
    with op.batch_alter_table("rule", schema=None) as rule_batch:
        rule_batch.add_column(assignee_column)


def downgrade() -> None:
    """Remove the ``assignee`` column from the ``rule`` table."""
    with op.batch_alter_table("rule", schema=None) as rule_batch:
        rule_batch.drop_column("assignee")
class StaticPresetsId(enum.Enum):
    """Fixed UUID strings used as the IDs of the static presets."""

    # ID of the default preset
    FEED_PRESET_ID = "11111111-1111-1111-1111-111111111111"
    DISMISSED_PRESET_ID = "11111111-1111-1111-1111-111111111113"
    GROUPS_PRESET_ID = "11111111-1111-1111-1111-111111111114"
    WITHOUT_INCIDENT_PRESET_ID = "11111111-1111-1111-1111-111111111115"


def generate_uuid():
    """Return a newly generated random (version 4) UUID in string form."""
    fresh_id = uuid4()
    return str(fresh_id)
class PresetDto(BaseModel, extra="ignore"):
    """Preset as exchanged with API clients.

    ``options`` is a list of ``{"label": ..., "value": ...}`` dicts. The
    properties below look options up by label (case-insensitively) and
    expose the CEL/SQL queries and the column display settings.
    """

    id: UUID
    name: str
    options: list = []
    created_by: Optional[str] = None
    is_private: Optional[bool] = Field(default=False)
    is_noisy: Optional[bool] = Field(default=False)
    """Whether the preset is noisy or not"""
    counter_shows_firing_only: Optional[bool] = Field(default=False)
    """Indicates whether counter in navbar displays only firing alerts"""
    # if true, the preset should do noise now
    should_do_noise_now: Optional[bool] = Field(default=False)
    """Meaning is_noisy + at least one alert is doing noise"""
    # static presets
    static: Optional[bool] = Field(default=False)
    tags: List[TagDto] = []

    def _options_labelled(self, label: str) -> list:
        """Return every option whose label equals ``label``, ignoring case.

        A missing or ``None`` label is treated as an empty label instead of
        raising ``AttributeError``.
        """
        return [
            option
            for option in self.options
            if (option.get("label") or "").lower() == label
        ]

    def _single_option_value(self, label: str) -> Any:
        """Return the value of the one option with this label, else ``""``.

        ``""`` is returned both when no option matches (should not happen,
        maybe on old presets) and when several match (should not happen).
        """
        matches = self._options_labelled(label)
        if len(matches) != 1:
            return ""
        return matches[0].get("value", "")

    def _first_option_value(self, label: str, default: Any) -> Any:
        """Return the value of the first option with this label, else ``default``."""
        matches = self._options_labelled(label)
        if not matches:
            return default
        return matches[0].get("value", default)

    @property
    def cel_query(self) -> str:
        """Value of the single ``cel`` option, or ``""`` when absent or ambiguous."""
        return self._single_option_value("cel")

    @property
    def sql_query(self) -> str:
        """Value of the single ``sql`` option, or ``""`` when absent or ambiguous.

        NOTE(review): annotated ``str``, but ``PresetSearchQuery.sql_query``
        is declared as a dict — confirm the intended type.
        """
        return self._single_option_value("sql")

    @property
    def column_visibility(self) -> Dict[str, bool]:
        """Get column visibility configuration from preset options"""
        return self._first_option_value("column_visibility", {})

    @property
    def column_order(self) -> List[str]:
        """Get column order configuration from preset options"""
        return self._first_option_value("column_order", [])

    @property
    def column_rename_mapping(self) -> Dict[str, str]:
        """Get column rename mapping from preset options"""
        return self._first_option_value("column_rename_mapping", {})

    @property
    def column_time_formats(self) -> Dict[str, str]:
        """Get column time formats from preset options"""
        return self._first_option_value("column_time_formats", {})

    @property
    def column_list_formats(self) -> Dict[str, str]:
        """Get column list formats from preset options"""
        return self._first_option_value("column_list_formats", {})

    @property
    def query(self) -> PresetSearchQuery:
        """Bundle the CEL and SQL queries into a ``PresetSearchQuery``."""
        return PresetSearchQuery(
            cel_query=self.cel_query,
            sql_query=self.sql_query,
        )
class ProviderExecutionLog(SQLModel, table=True):
    """One log record emitted while running a provider."""

    # ``id`` is the primary key and is additionally listed in a unique
    # constraint; the two indexes cover lookups by (tenant, provider) and
    # by timestamp.
    __table_args__ = (
        UniqueConstraint("id"),
        Index("idx_provider_logs_tenant_provider", "tenant_id", "provider_id"),
        Index("idx_provider_logs_timestamp", "timestamp"),
    )

    id: str = Field(default=None, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id")
    # Log rows are removed by the database when the referenced provider row
    # is deleted (ON DELETE CASCADE).
    provider_id: str = Field(
        sa_column=Column(ForeignKey("provider.id", ondelete="CASCADE"))
    )
    # Defaults to the current time from datetime.utcnow (naive, UTC).
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    log_message: str = Field(sa_column=Column(TEXT))
    log_level: str = Field(default="INFO")  # INFO, WARNING, ERROR, DEBUG
    # Free-form JSON payload stored with the message; empty dict by default.
    context: dict = Field(sa_column=Column(JSON), default={})
    execution_id: Optional[str] = None  # To group related logs together

    class Config:
        orm_mode = True
# Currently a rule_definition is a list of SQL expressions
# We use querybuilder for that


class ResolveOn(Enum):
    """String values used for ``Rule.resolve_on`` (default: ``NEVER``)."""

    # NOTE(review): the meaning of each member is not defined in this file;
    # see the code that consumes ``Rule.resolve_on``.
    FIRST = "first_resolved"
    LAST = "last_resolved"
    ALL = "all_resolved"
    NEVER = "never"


class CreateIncidentOn(Enum):
    """String values used for ``Rule.create_on`` (default: ``ANY``)."""

    # NOTE(review): the meaning of each member is not defined in this file;
    # see the code that consumes ``Rule.create_on``.
    ANY = "any"
    ALL = "all"


# TODOs/Pitfalls down the road which we hopefully need to address in the future:
# 1. nested attributes (event.foo.bar = 1)
# 2. scale - when event arrives, we need to check if the rule is applicable to the event
#            the naive approach is to iterate over all rules and check if the rule is applicable
#            which won't scale.
# 3. action - currently support create alert, down the road should support workflows
# 4. timeframe - should be per definition group
class Rule(SQLModel, table=True):
    """Rule table: a per-tenant rule definition plus its incident settings."""

    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id")
    name: str
    definition: dict = Field(sa_column=Column(JSON))  # sql / params
    definition_cel: str  # cel
    timeframe: int  # time in seconds
    timeunit: str = Field(default="seconds")
    created_by: str
    creation_time: datetime
    # Annotated as non-optional but defaulting to None until the first update.
    updated_by: str = None
    update_time: datetime = None
    # list of "group_by" attributes - when to break the rule into groups
    grouping_criteria: list = Field(sa_column=Column(JSON), default=[])
    # e.g. The {{ labels.queue }} is more than third full on {{ num_of_alerts }} queue managers | {{ start_time }} || {{ last_update_time }}
    group_description: str = None
    # e.g. The {{ labels.queue }} is more than third full on {{ num_of_alerts }} queue managers
    item_description: str = None
    require_approve: bool = False
    # Stored as the plain string value of ResolveOn / CreateIncidentOn.
    resolve_on: str = ResolveOn.NEVER.value
    create_on: str = CreateIncidentOn.ANY.value
    # Boolean deletion flag stored on the row.
    is_deleted: bool = False
    incident_name_template: str = None
    incident_prefix: str | None = None
    multi_level: bool = False
    multi_level_property_name: str | None = None
    # The check constraint requires threshold > 0 at the database level.
    threshold: int = Field(sa_column_args=(CheckConstraint("threshold>0"),), default=1)
    assignee: str | None = None
class TenantInstallation(SQLModel, table=True):
    """Installation record tying a bot (``bot_id``) to a tenant."""

    # ``default_factory`` (not ``default=uuid4``): the factory is called for
    # each new instance, so ``id`` holds a generated UUID. With
    # ``default=uuid4`` the default value was the ``uuid4`` function object
    # itself whenever no id was passed explicitly.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    tenant_id: str = Field(foreign_key="tenant.id")
    bot_id: str
    installed: bool = False

    # Reverse side of ``Tenant.installations``.
    tenant: Optional[Tenant] = Relationship(back_populates="installations")
class TopologyService(SQLModel, table=True):
    """Topology service table: one row per service known for a tenant."""

    # Integer primary key; None until the database assigns it.
    id: Optional[int] = Field(primary_key=True, default=None)
    tenant_id: str = Field(sa_column=Column(ForeignKey("tenant.id")))
    source_provider_id: str = "unknown"
    repository: Optional[str]
    tags: Optional[List[str]] = Field(sa_column=Column(JSON))  # stored as JSON
    service: str
    environment: str = Field(default="unknown")
    display_name: str
    description: Optional[str]
    team: Optional[str]
    email: Optional[str]
    slack: Optional[str]
    ip_address: Optional[str] = None
    mac_address: Optional[str] = None
    category: Optional[str] = None
    manufacturer: Optional[str] = None
    namespace: Optional[str] = None
    is_manual: Optional[bool] = False

    # Timezone-aware timestamp maintained by the database: set to now() on
    # insert (server default) and again on every update (onupdate).
    updated_at: Optional[datetime] = Field(
        sa_column=Column(
            DateTime(timezone=True),
            name="updated_at",
            onupdate=func.now(),
            server_default=func.now(),
        )
    )

    # Dependency rows whose ``service_id`` points at this service. The
    # cascade removes them with the service and deletes orphaned rows.
    dependencies: List["TopologyServiceDependency"] = Relationship(
        back_populates="service",
        sa_relationship_kwargs={
            "foreign_keys": "[TopologyServiceDependency.service_id]",
            "cascade": "all, delete-orphan",
        },
    )

    # Many-to-many link to applications through TopologyServiceApplication.
    applications: List[TopologyApplication] = Relationship(
        back_populates="services", link_model=TopologyServiceApplication
    )

    class Config:
        orm_mode = True
        # NOTE(review): no matching UniqueConstraint is declared via
        # __table_args__ in this class — confirm whether anything reads
        # ``unique_together`` or enforces this uniqueness elsewhere.
        unique_together = ["tenant_id", "service", "environment", "source_provider_id"]
class TopologyServiceDependencyDto(BaseModel, extra="ignore"):
    """Outgoing representation of one dependency edge of a service.

    ``serviceId`` and ``serviceName`` describe the service being depended on.
    """

    id: Optional[str] = None
    serviceId: str
    serviceName: str
    protocol: Optional[str] = "unknown"

    @classmethod
    def from_orm(cls, db_dependency: TopologyServiceDependency):
        """Build the DTO from a ``TopologyServiceDependency`` row.

        Args:
            db_dependency: dependency row; its ``dependent_service``
                relationship is read to obtain the service name.

        Returns:
            An instance of ``cls`` (so subclasses get their own type rather
            than a base ``TopologyServiceDependencyDto``).
        """
        return cls(
            id=db_dependency.id,
            serviceId=str(db_dependency.depends_on_service_id),
            protocol=db_dependency.protocol,
            serviceName=db_dependency.dependent_service.service,
        )
class TopologyServiceDtoOut(TopologyServiceDtoBase):
    """Outgoing service DTO: the base fields plus the service ID, its
    dependencies, the IDs of its applications and the last update time."""

    id: str
    dependencies: List[TopologyServiceDependencyDto]
    application_ids: List[UUID]
    updated_at: Optional[datetime]

    @classmethod
    def from_orm(
        cls, service: "TopologyService", application_ids: List[UUID]
    ) -> "TopologyServiceDtoOut":
        """Convert a ``TopologyService`` row into its outgoing DTO.

        Args:
            service: the service row; its ``dependencies`` are converted too.
            application_ids: IDs of the applications the service belongs to.
        """
        dependency_dtos = []
        for dependency in service.dependencies:
            dependency_dtos.append(
                TopologyServiceDependencyDto(
                    id=dependency.id,
                    serviceId=str(dependency.depends_on_service_id),
                    protocol=dependency.protocol,
                    serviceName=dependency.dependent_service.service,
                )
            )

        # A NULL ``is_manual`` is reported as False.
        manual_flag = False if service.is_manual is None else service.is_manual

        field_values = {
            "id": str(service.id),
            "source_provider_id": service.source_provider_id,
            "repository": service.repository,
            "tags": service.tags,
            "service": service.service,
            "display_name": service.display_name,
            "environment": service.environment,
            "description": service.description,
            "team": service.team,
            "email": service.email,
            "slack": service.slack,
            "ip_address": service.ip_address,
            "mac_address": service.mac_address,
            "manufacturer": service.manufacturer,
            "category": service.category,
            "dependencies": dependency_dtos,
            "application_ids": application_ids,
            "updated_at": service.updated_at,
            "namespace": service.namespace,
            "is_manual": manual_flag,
        }
        return cls(**field_values)
def get_dummy_workflow_id(tenant_id: str) -> str:
    """Return the ID of the system dummy workflow for the given tenant.

    The ID is the fixed prefix ``system-dummy-workflow-`` followed by
    ``tenant_id``.
    """
    prefix = "system-dummy-workflow-"
    return "{}{}".format(prefix, tenant_id)
class WorkflowExecution(SQLModel, table=True):
    """Workflow execution table: one row per run of a workflow revision."""

    # One unique constraint plus several composite indexes over the
    # tenant / workflow / revision / started / status columns. ``status`` is
    # a TEXT column, hence the explicit MySQL index prefix length.
    __table_args__ = (
        UniqueConstraint("workflow_id", "execution_number", "is_running", "timeslot"),
        Index(
            "idx_workflowexecution_tenant_workflow_id_timestamp",
            "tenant_id",
            "workflow_id",
            "started",
        ),
        Index(
            "idx_workflowexecution_tenant_workflow_id_revision_timestamp",
            "tenant_id",
            "workflow_id",
            "workflow_revision",
            "started",
        ),
        Index(
            "idx_workflowexecution_workflow_tenant_started_status",
            "workflow_id",
            "tenant_id",
            "started",
            "status",
            mysql_length={"status": 255},
        ),
        Index(
            "idx_workflowexecution_workflow_revision_tenant_started_status",
            "workflow_id",
            "workflow_revision",
            "tenant_id",
            "started",
            "status",
            mysql_length={"status": 255},
        ),
        Index(
            "idx_status_started",
            "status",
            "started",
            mysql_length={"status": 255},
        ),
        Index(
            "idx_workflowexecution_workflow_revision",
            "workflow_id",
            "workflow_revision",
        ),
    )

    id: str = Field(default=None, primary_key=True)
    workflow_id: str = Field(
        foreign_key="workflow.id", default="test"
    )  # default=test for test runs, which are not associated with a workflow
    workflow_revision: int = Field(
        default=1
    )  # Revision of the workflow that was executed
    tenant_id: str = Field(foreign_key="tenant.id")
    # Start time; defaults to datetime.utcnow() (naive) and is indexed.
    started: datetime = Field(default_factory=datetime.utcnow, index=True)
    triggered_by: str = Field(sa_column=Column(TEXT))
    status: str = Field(sa_column=Column(TEXT))
    is_running: int = Field(default=1)
    # Index of the 120-second bucket in which the row was created; it is
    # part of the unique constraint above.
    timeslot: int = Field(
        default_factory=lambda: int(datetime.utcnow().timestamp() / 120)
    )
    execution_number: int
    error: Optional[str] = Field(max_length=10240)
    execution_time: Optional[int]
    results: dict = Field(sa_column=Column(JSON), default={})
    is_test_run: bool = Field(default=False)

    # Parent workflow, joined through ``workflow_id`` only.
    workflow: "Workflow" = Relationship(
        back_populates="executions",
        sa_relationship_kwargs={"foreign_keys": "[WorkflowExecution.workflow_id]"},
    )
    # Read-only link to the exact WorkflowVersion that ran, matched on
    # (workflow_id, workflow_revision).
    version: "WorkflowVersion" = Relationship(
        back_populates="executions",
        sa_relationship_kwargs={
            "primaryjoin": "and_(WorkflowVersion.workflow_id == WorkflowExecution.workflow_id, WorkflowVersion.revision == WorkflowExecution.workflow_revision)",
            "foreign_keys": "[WorkflowExecution.workflow_id, WorkflowExecution.workflow_revision]",
            "viewonly": True,
        },
    )
    logs: List["WorkflowExecutionLog"] = Relationship(
        back_populates="workflowexecution"
    )
    # Link rows tying this execution to an alert or an incident.
    workflow_to_alert_execution: "WorkflowToAlertExecution" = Relationship(
        back_populates="workflow_execution"
    )
    workflow_to_incident_execution: "WorkflowToIncidentExecution" = Relationship(
        back_populates="workflow_execution"
    )

    class Config:
        orm_mode = True
class CreateFacetDto(BaseModel):
    """Request payload for creating a facet.

    `property_path` and `name` are required and must contain at least one
    non-whitespace character; `description` is optional.
    """

    property_path: str
    name: str
    description: Optional[str]

    # The validator method names now match the field each one checks
    # (they were previously swapped, which made stack traces misleading).
    @pydantic.validator('property_path')
    def property_path_validator(cls, v: str):
        """Reject a property_path that is empty or whitespace-only."""
        if not v.strip():
            raise ValueError('property_path must not be empty')
        return v

    @pydantic.validator('name')
    def name_validator(cls, v: str):
        """Reject a name that is empty or whitespace-only."""
        if not v.strip():
            raise ValueError('name must not be empty')
        return v
class IncidentDto(IncidentDtoIn):
    """API representation of an incident.

    Extends the user-editable fields of ``IncidentDtoIn`` with the computed
    and bookkeeping fields of a stored incident. Alerts are not a model
    field: they are either supplied at construction time (``alerts=...``) or
    fetched on first access of the ``alerts`` property using the privately
    stored tenant id.
    """

    id: UUID

    start_time: datetime.datetime | None
    last_seen_time: datetime.datetime | None
    end_time: datetime.datetime | None
    creation_time: datetime.datetime | None

    alerts_count: int
    alert_sources: list[str]
    status: IncidentStatus = IncidentStatus.FIRING
    assignee: str | None
    services: list[str]

    is_predicted: bool
    is_candidate: bool

    generated_summary: str | None
    ai_generated_name: str | None

    rule_fingerprint: str | None
    fingerprint: (
        str | None
    )  # This is the fingerprint of the incident generated by the underlying tool

    same_incident_in_the_past_id: UUID | None

    merged_into_incident_id: UUID | None
    merged_by: str | None
    merged_at: datetime.datetime | None

    enrichments: dict | None = {}
    incident_type: str | None
    incident_application: str | None

    resolve_on: str = Field(
        default=ResolveOn.ALL.value,
        description="Resolution strategy for the incident",
    )

    rule_id: UUID | None
    rule_name: str | None
    rule_is_deleted: bool | None

    # Declared without a default: reading it before it has been assigned raises,
    # which the `alerts` property handles explicitly.
    _tenant_id: str = PrivateAttr()
    # AlertDto, not explicitly typed because of circular dependency
    _alerts: Optional[List] = PrivateAttr(default=None)

    def __init__(self, **data):
        """Build the DTO and capture the optional ``alerts`` / ``tenant_id`` extras.

        Both keys are still forwarded to the pydantic constructor, exactly as
        before; they are additionally stored in the private attributes used
        by the ``alerts`` property.
        """
        super().__init__(**data)
        if "alerts" in data:
            self._alerts = data["alerts"]
        if "tenant_id" in data:
            # `data` is this call's own kwargs dict, so reading the key is
            # equivalent to the former `pop` and has no effect on the caller.
            self._tenant_id = data["tenant_id"]

    def __str__(self) -> str:
        """Return the model as indented JSON; non-JSON values are stringified."""
        # Convert the model instance to a dictionary
        model_dict = self.dict()
        return json.dumps(model_dict, indent=4, default=str)

    class Config:
        extra = Extra.allow
        schema_extra = IncidentDtoIn.Config.schema_extra
        underscore_attrs_are_private = True

        json_encoders = {
            # Converts UUID to their values for JSON serialization
            UUID: lambda v: str(v),
        }

    @property
    def name(self):
        """User-provided name if set, otherwise the AI-generated one."""
        return self.user_generated_name or self.ai_generated_name

    @property
    def alerts(self) -> List:
        """Alerts attached to this incident.

        Returns the alerts supplied at construction time when present.
        Otherwise the alerts are loaded from the database with the stored
        tenant id; an empty list is returned when no tenant id is available.
        """
        if self._alerts is not None:
            return self._alerts

        # Imported lazily, matching the original code (the private-attribute
        # comment above mentions a circular dependency with the alert models).
        from keep.api.core.db import get_incident_alerts_by_incident_id
        from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts

        try:
            if not self._tenant_id:
                return []
        except Exception:
            # `_tenant_id` has no default, so accessing it before assignment raises.
            logging.getLogger(__name__).error(
                "Tenant ID is not set in incident",
                extra={"incident_id": self.id},
            )
            return []
        alerts, _ = get_incident_alerts_by_incident_id(self._tenant_id, str(self.id))
        return convert_db_alerts_to_dto_alerts(alerts)

    @root_validator(pre=True)
    def set_default_values(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Coerce ``status`` to ``IncidentStatus``, falling back to FIRING.

        A missing or unrecognised status is logged as a warning and replaced
        by ``IncidentStatus.FIRING``.
        """
        # Check and set default status
        status = values.get("status")
        try:
            values["status"] = IncidentStatus(status)
        except ValueError:
            logging.getLogger(__name__).warning(
                f"Invalid status value: {status}, setting default.",
                extra={"event": values},
            )
            values["status"] = IncidentStatus.FIRING
        return values

    @classmethod
    def from_db_incident(cls, db_incident: "Incident", rule: Optional["Rule"] = None):
        """Create a DTO from a database incident (and optionally its rule).

        Args:
            db_incident: the stored incident to convert. It is not modified.
            rule: the correlation rule that produced the incident, if any;
                used to fill ``rule_id``, ``rule_name`` and ``rule_is_deleted``.

        Returns:
            The DTO, with the incident's enrichments (when non-empty) applied
            on top of the regular fields.
        """
        severity = (
            IncidentSeverity.from_number(db_incident.severity)
            if isinstance(db_incident.severity, int)
            else db_incident.severity
        )

        # Default the resolution strategy locally instead of writing it back
        # onto the DB object: a read-only conversion should not dirty the row.
        resolve_on = db_incident.resolve_on or ResolveOn.ALL.value

        # Keep a missing application as None; str(None) would otherwise leak
        # the literal string "None" into the API response.
        incident_application = (
            str(db_incident.incident_application)
            if db_incident.incident_application is not None
            else None
        )

        dto = cls(
            id=db_incident.id,
            user_generated_name=db_incident.user_generated_name,
            ai_generated_name=db_incident.ai_generated_name,
            user_summary=db_incident.user_summary,
            generated_summary=db_incident.generated_summary,
            is_predicted=db_incident.is_predicted,
            is_candidate=db_incident.is_candidate,
            creation_time=db_incident.creation_time,
            start_time=db_incident.start_time,
            last_seen_time=db_incident.last_seen_time,
            end_time=db_incident.end_time,
            alerts_count=db_incident.alerts_count,
            alert_sources=db_incident.sources or [],
            severity=severity,
            status=db_incident.status,
            assignee=db_incident.assignee,
            services=db_incident.affected_services or [],
            rule_fingerprint=db_incident.rule_fingerprint,
            fingerprint=db_incident.fingerprint,
            same_incident_in_the_past_id=db_incident.same_incident_in_the_past_id,
            merged_into_incident_id=db_incident.merged_into_incident_id,
            merged_by=db_incident.merged_by,
            merged_at=db_incident.merged_at,
            incident_type=db_incident.incident_type,
            incident_application=incident_application,
            enrichments=db_incident.enrichments,
            resolve_on=resolve_on,
            rule_id=rule.id if rule else None,
            rule_name=rule.name if rule else None,
            rule_is_deleted=rule.is_deleted if rule else None,
        )

        # This field is required for getting alerts when required
        dto._tenant_id = db_incident.tenant_id

        if db_incident.enrichments:
            dto = dto.copy(update=db_incident.enrichments)

        return dto

    def to_db_incident(self) -> "Incident":
        """Converts an IncidentDto instance to an Incident database model."""
        from keep.api.models.db.alert import Incident

        # NOTE(review): `self.severity` is declared optional on IncidentDtoIn;
        # `.order` below raises AttributeError when it is None - confirm callers
        # always provide a severity before converting.
        db_incident = Incident(
            id=self.id,
            user_generated_name=self.user_generated_name,
            ai_generated_name=self.ai_generated_name,
            user_summary=self.user_summary,
            generated_summary=self.generated_summary,
            assignee=self.assignee,
            severity=self.severity.order,
            status=self.status.value,
            creation_time=self.creation_time or datetime.datetime.utcnow(),
            start_time=self.start_time,
            end_time=self.end_time,
            last_seen_time=self.last_seen_time,
            alerts_count=self.alerts_count,
            affected_services=self.services,
            sources=self.alert_sources,
            is_predicted=self.is_predicted,
            is_candidate=self.is_candidate,
            rule_fingerprint=self.rule_fingerprint,
            fingerprint=self.fingerprint,
            same_incident_in_the_past_id=self.same_incident_in_the_past_id,
            merged_into_incident_id=self.merged_into_incident_id,
            merged_by=self.merged_by,
            merged_at=self.merged_at,
        )

        return db_incident
class Provider(BaseModel):
    """API model describing a provider and its capabilities/installation state.

    Only `display_name`, `type`, `can_notify` and `can_query` are required;
    every other field has a default.
    """

    id: str | None = None
    display_name: str
    type: str
    config: dict[str, dict] = Field(default_factory=dict)
    details: dict[str, Any] | None = None
    can_notify: bool
    # TODO: consider making it strongly typed for UI validations
    notify_params: list[str] | None = None
    can_query: bool
    query_params: list[str] | None = None
    installed: bool = False
    # Whether we got an alert from this provider without installation
    linked: bool = False
    last_alert_received: str | None = None
    # Whether we support webhooks without install
    supports_webhook: bool = False
    # Whether we also support auto install for webhooks
    can_setup_webhook: bool = False
    # If the setup webhook checkbox in the UI is checked and disabled.
    webhook_required: bool = False
    provider_description: str | None = None
    oauth2_url: str | None = None
    scopes: list[ProviderScope] = Field(default_factory=list)
    validatedScopes: dict[str, bool | str] | None = Field(default_factory=dict)
    methods: list[ProviderMethodDTO] = Field(default_factory=list)
    installed_by: str | None = None
    installation_time: datetime | None = None
    pulling_available: bool = False
    pulling_enabled: bool = True
    last_pull_time: datetime | None = None
    docs: str | None = None
    # Tags are restricted to this fixed set of literal values.
    tags: list[
        Literal[
            "alert", "ticketing", "messaging", "data", "queue", "topology", "incident"
        ]
    ] = Field(default_factory=list)
    # A provider without explicit categories is listed under "Others".
    categories: list[str] = Field(default_factory=lambda: ["Others"])
    coming_soon: bool = False
    alertsDistribution: dict[str, int] | None = None
    alertExample: dict | None = None
    default_fingerprint_fields: list[str] | None = None
    provisioned: bool = False
    health: bool = False
    provider_metadata: dict[str, Any] | None = Field(default_factory=dict)
class SeverityBaseInterface(Enum):
    """Base class for severity enums whose members carry a numeric order.

    Members are declared as ``NAME = (severity_name, severity_order)``. The
    enum *value* is the severity name, while the order is kept separately and
    drives the rich comparisons, so members can be sorted and compared by
    importance.
    """

    def __new__(cls, severity_name, severity_order):
        # Use the name as the enum value and attach the order as an attribute.
        obj = object.__new__(cls)
        obj._value_ = severity_name
        obj.severity_order = severity_order
        return obj

    @property
    def order(self):
        """Numeric order of this severity."""
        return self.severity_order

    def __str__(self):
        return self._value_

    @classmethod
    def from_number(cls, n):
        """Return the member whose order equals ``n``.

        Raises:
            ValueError: if no member of this enum has that order. The message
                names the concrete enum class, since this base is shared by
                several severity enums.
        """
        for severity in cls:
            if severity.order == n:
                return severity
        raise ValueError(f"No {cls.__name__} with order {n}")

    # Comparisons are defined only against other severity members; anything
    # else returns NotImplemented so Python can fall back or raise TypeError.
    def __lt__(self, other):
        if isinstance(other, SeverityBaseInterface):
            return self.order < other.order
        return NotImplemented

    def __le__(self, other):
        if isinstance(other, SeverityBaseInterface):
            return self.order <= other.order
        return NotImplemented

    def __gt__(self, other):
        if isinstance(other, SeverityBaseInterface):
            return self.order > other.order
        return NotImplemented

    def __ge__(self, other):
        if isinstance(other, SeverityBaseInterface):
            return self.order >= other.order
        return NotImplemented
# Dependency that parses the optional `time_stamp` query parameter
def _get_time_stamp_filter(time_stamp: Optional[str] = Query(None)) -> TimeStampFilter:
    """Parse the `time_stamp` query parameter into a TimeStampFilter.

    Args:
        time_stamp: JSON-encoded object. The keys `start` / `end` (the field
            aliases) map to `lower_timestamp` / `upper_timestamp`; the field
            names themselves are accepted as well.

    Returns:
        The parsed filter, or an empty filter when the parameter is absent
        or empty.

    Raises:
        HTTPException: 400 when the value is not valid JSON, is not a JSON
            object, or contains values that fail model validation.
    """
    if not time_stamp:
        return TimeStampFilter()

    try:
        # Parse the JSON string
        time_stamp_dict = json.loads(time_stamp)
        # Pydantic maps 'start' -> lower_timestamp and 'end' -> upper_timestamp
        return TimeStampFilter(**time_stamp_dict)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and pydantic's ValidationError
        # (both subclass it), so an unparseable timestamp is reported as a
        # client error instead of surfacing as a 500. TypeError covers JSON
        # payloads that are not objects and therefore cannot be unpacked.
        raise HTTPException(status_code=400, detail="Invalid time_stamp format")
class WorkflowDTO(BaseModel):
    """Workflow as returned to API clients.

    `workflow_raw` holds the workflow YAML; on validation it is re-serialised
    with a fixed key order (see `manipulate_raw`).
    """

    id: str
    name: Optional[str] = "Workflow file doesn't contain name"
    description: Optional[str] = "Workflow file doesn't contain description"
    created_by: str
    creation_time: datetime
    triggers: List[dict] = None
    interval: int | None = None
    disabled: bool = False
    last_execution_time: datetime = None
    last_execution_status: str = None
    providers: List[ProviderDTO]
    workflow_raw: str
    revision: int = 1
    last_updated: datetime = None
    last_updated_by: str = None
    invalid: bool = False  # whether the workflow is invalid or not (for UI purposes)
    last_executions: List[dict] = None
    last_execution_started: datetime = None
    provisioned: bool = False
    provisioned_file: str = None
    alertRule: bool = False
    canRun: bool = True

    @property
    def workflow_raw_id(self):
        """The `id` key read from the workflow YAML."""
        return cyaml.safe_load(self.workflow_raw).get("id")

    @validator("workflow_raw", pre=False, always=True)
    def manipulate_raw(cls, raw, values):
        """Re-serialise the workflow YAML with a predictable key order.

        The resulting order is: id, description, disabled, triggers, then
        every other key in its original order, then steps, then actions.
        Keys from the fixed head/tail groups that are missing in the input
        are set to None before dumping.

        Args:
            raw: the workflow YAML text.
            values: the already-validated fields; `workflow_raw_id` is
                written into this mapping.

        Returns:
            The re-ordered YAML text.
        """
        head_keys = ("id", "description", "disabled", "triggers")
        tail_keys = ("steps", "actions")

        parsed = cyaml.safe_load(raw)
        values["workflow_raw_id"] = parsed.get("id")

        # Fixed head first ...
        reordered = OrderedDict((key, parsed.get(key)) for key in head_keys)
        # ... then everything that is neither head nor tail, in source order ...
        reordered.update(
            (key, val)
            for key, val in parsed.items()
            if key not in head_keys and key not in tail_keys
        )
        # ... and finally steps followed by actions.
        for key in tail_keys:
            reordered[key] = parsed.get(key)

        return cyaml.dump(reordered, width=99999)
def get_protocol_from_endpoint(endpoint):
    """Pick the OTLP span exporter class that matches an endpoint's URL scheme.

    Args:
        endpoint: the OTLP traces endpoint URL, e.g. ``http://host:4318/v1/traces``
            or ``grpc://host:4317``.

    Returns:
        ``HTTPOTLPSpanExporter`` for ``http`` and ``https`` endpoints,
        ``GRPCOTLPSpanExporter`` for ``grpc`` endpoints.

    Raises:
        ValueError: if the scheme is none of the above.
    """
    parsed_url = urlparse(endpoint)
    scheme = parsed_url.scheme
    # TLS-protected collectors use the same HTTP exporter as plain http ones;
    # previously an https endpoint was rejected as an unsupported protocol.
    if scheme in ("http", "https"):
        return HTTPOTLPSpanExporter
    if scheme == "grpc":
        return GRPCOTLPSpanExporter
    raise ValueError(f"Unsupported protocol: {scheme}")
def get_redis_settings() -> RedisSettings:
    """
    Build the ARQ Redis settings from configuration.

    Two modes are supported:

    Redis Sentinel, selected with REDIS_SENTINEL_ENABLED=true:
    - REDIS_SENTINEL_HOSTS=host1:port1,host2:port2 (comma-separated,
      default: localhost:26379; an entry without a port uses 26379)
    - REDIS_SENTINEL_SERVICE_NAME=mymaster (default: mymaster)

    Direct Redis (default):
    - REDIS_HOST=localhost (default: localhost)
    - REDIS_PORT=6379 (default: 6379)

    REDIS_USERNAME, REDIS_PASSWORD and REDIS_SSL apply to both modes.

    Returns:
        RedisSettings: Configured Redis settings for ARQ
    """
    use_sentinel = config("REDIS_SENTINEL_ENABLED", cast=bool, default=False)
    use_ssl = config("REDIS_SSL", cast=bool, default=False)

    if not use_sentinel:
        # Plain single-host connection.
        return RedisSettings(
            host=config("REDIS_HOST", default="localhost"),
            port=config("REDIS_PORT", cast=int, default=6379),
            username=config("REDIS_USERNAME", default=None),
            password=config("REDIS_PASSWORD", default=None),
            ssl=use_ssl,
            conn_timeout=60,
            conn_retries=10,
            conn_retry_delay=10,
        )

    # Sentinel mode: turn "host:port,host:port" into a list of (host, port) tuples.
    raw_hosts = config("REDIS_SENTINEL_HOSTS", default="localhost:26379")
    sentinel_nodes = []
    for entry in (piece.strip() for piece in raw_hosts.split(",")):
        node_host, colon, node_port = entry.partition(":")
        if colon:
            sentinel_nodes.append((node_host.strip(), int(node_port.strip())))
        else:
            # No explicit port given: use the default sentinel port.
            sentinel_nodes.append((entry, 26379))

    master_name = config("REDIS_SENTINEL_SERVICE_NAME", default="mymaster")

    return RedisSettings(
        host=sentinel_nodes,
        sentinel=True,
        sentinel_master=master_name,
        username=config("REDIS_USERNAME", default=None),
        password=config("REDIS_PASSWORD", default=None),
        ssl=use_ssl,
        conn_timeout=60,
        conn_retries=10,
        conn_retry_delay=10,
    )
# GET all actions
@router.get("", description="Get all actions")
def get_actions(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:actions"])
    ),
):
    """Return every action installed for the caller's tenant.

    Raises:
        HTTPException: 400 when fetching the actions fails unexpectedly.
            An HTTPException raised by the lookup itself is passed through
            unchanged.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info("Getting installed actions", extra={"tenant_id": tenant_id})
    try:
        # The lookup is the only statement that can fail, so it has to live
        # inside the try block; previously only the `return` was guarded and
        # the handler below could never run.
        return ActionsCRUD.get_all_actions(tenant_id)
    except HTTPException:
        # Keep deliberate HTTP errors (status code and detail) intact.
        raise
    except Exception:
        logger.exception("Failed to get actions")
        raise HTTPException(
            status_code=400, detail="Unknown exception when getting actions"
        )
# UPDATE an action
@router.put("/{action_id}", description="Update an action")
async def put_action(
    action_id: str,
    request: Request,
    file: UploadFile,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:actions"])
    ),
):
    """
    Update an existing action from an uploaded YAML definition.

    Args:
        action_id: Identifier of the action to update.
        request: Incoming request; its raw body is parsed when no file is given.
        file: Uploaded YAML file describing the action.
        authenticated_entity: Caller identity (requires ``write:actions``).

    Returns:
        The updated action when ``ActionsCRUD.update_action`` returns one,
        otherwise an empty ``204 No Content`` response.
    """
    # Local import: the module only imports JSONResponse at file level.
    from fastapi import Response

    tenant_id = authenticated_entity.tenant_id
    action_dict: dict = await _get_action_info(request, file)
    updated_action = ActionsCRUD.update_action(tenant_id, action_id, action_dict)
    if updated_action:
        return updated_action
    # A 204 response must not carry a payload; sending JSON with it produces a
    # Content-Length that contradicts the status code.
    return Response(status_code=204)
external AI", include_in_schema=False, ) def update_settings( algorithm_id: str, body: ExternalAIConfigAndMetadataDto, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:alert"]) ), ): tenant_id = authenticated_entity.tenant_id return update_extrnal_ai_settings(tenant_id, body) ================================================ FILE: keep/api/routes/alerts.py ================================================ import base64 import concurrent.futures import hashlib import hmac import json import logging import os import time from concurrent.futures import Future, ThreadPoolExecutor from copy import deepcopy from typing import List, Optional import celpy from arq import ArqRedis from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request from fastapi.responses import JSONResponse from pusher import Pusher from sqlalchemy_utils import UUIDType from sqlmodel import Session from keep.api.arq_pool import get_pool from keep.api.bl.enrichments_bl import EnrichmentsBl from keep.api.consts import KEEP_ARQ_QUEUE_BASIC from keep.api.core.alerts import ( get_alert_facets, get_alert_facets_data, get_alert_potential_facet_fields, query_last_alerts, ) from keep.api.core.cel_to_sql.sql_providers.base import CelToSqlException from keep.api.core.config import config from keep.api.core.db import dismiss_error_alerts as dismiss_error_alerts_db from keep.api.core.db import enrich_alerts_with_incidents from keep.api.core.db import get_alert_audit as get_alert_audit_db from keep.api.core.db import ( get_alerts_by_fingerprint, get_alerts_by_ids, get_alerts_metrics_by_provider, get_enrichment, ) from keep.api.core.db import get_error_alerts as get_error_alerts_db from keep.api.core.db import ( get_last_alerts, get_last_alerts_by_fingerprints, get_provider_by_name, get_session, is_all_alerts_resolved, ) from keep.api.core.dependencies import extract_generic_body, get_pusher_client from keep.api.core.elastic import 
@router.post(
    "/facets/options",
    description="Query alert facet options. Accepts dictionary where key is facet id and value is cel to query facet",
)
def fetch_alert_facet_options(
    facet_options_query: FacetOptionsQueryDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> dict:
    """
    Resolve the option values of the requested alert facets.

    Args:
        facet_options_query: Facet query forwarded to ``get_alert_facets_data``.
        authenticated_entity: Caller identity (requires ``read:alert``).

    Returns:
        The facet options produced by ``get_alert_facets_data``.

    Raises:
        HTTPException: 400 when the CEL expression cannot be translated.
    """
    tenant_id = authenticated_entity.tenant_id
    log_context = {"tenant_id": tenant_id}
    logger.info("Fetching alert facets from DB", extra=log_context)

    try:
        options = get_alert_facets_data(
            tenant_id=tenant_id, facet_options_query=facet_options_query
        )
    except CelToSqlException as cel_error:
        logger.exception(
            f'Error parsing CEL expression "{facet_options_query.cel}". {str(cel_error)}'
        )
        raise HTTPException(
            status_code=400,
            detail=f"Error parsing CEL expression: {facet_options_query.cel}",
        ) from cel_error
    else:
        logger.info("Fetched alert facets from DB", extra=log_context)
        return options
@router.get(
    "",
    description="Get last alerts occurrence",
)
def get_all_alerts(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
    limit: int = 1000,
) -> list[AlertDto]:
    """
    Return the most recent alerts of the caller's tenant as DTOs.

    Args:
        authenticated_entity: Caller identity (requires ``read:alert``).
        limit: Maximum number of alerts requested from ``get_last_alerts``.

    Returns:
        The alerts converted by ``convert_db_alerts_to_dto_alerts``.
    """
    tenant_id = authenticated_entity.tenant_id
    log_context = {"tenant_id": tenant_id}

    logger.info("Fetching alerts from DB", extra=log_context)
    latest_rows = get_last_alerts(tenant_id=tenant_id, limit=limit)
    alert_dtos = convert_db_alerts_to_dto_alerts(latest_rows)
    logger.info("Fetched alerts from DB", extra=log_context)

    return alert_dtos
@router.delete("", description="Delete alert by finerprint and last received time")
def delete_alert(
    delete_alert: DeleteRequestBody,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["delete:alert"])
    ),
) -> dict[str, str]:
    """
    Mark one occurrence of an alert as deleted, or undo that mark.

    Nothing is removed from the alert itself: the occurrence's
    ``lastReceived`` value is added to (or removed from) the ``deletedAt``
    list kept in the alert's enrichment, and the enrichment is written back.

    Args:
        delete_alert: Request body carrying ``fingerprint``, ``lastReceived``
            and the ``restore`` flag.
        authenticated_entity: Caller identity (requires ``delete:alert``).

    Returns:
        ``{"status": "ok"}`` once the enrichment has been written.
    """
    tenant_id = authenticated_entity.tenant_id
    user_email = authenticated_entity.email

    logger.info(
        "Deleting alert",
        extra={
            "fingerprint": delete_alert.fingerprint,
            "restore": delete_alert.restore,
            "lastReceived": delete_alert.lastReceived,
            "tenant_id": tenant_id,
        },
    )

    deleted_last_received = []  # the last received(s) that are deleted
    assignees_last_receievd = {}  # the last received(s) that are assigned to someone

    # If we enriched before, get the enrichment
    enrichment = get_enrichment(tenant_id, delete_alert.fingerprint)
    if enrichment:
        deleted_last_received = enrichment.enrichments.get("deletedAt", [])
        assignees_last_receievd = enrichment.enrichments.get("assignees", {})

    if (
        delete_alert.restore is True
        and delete_alert.lastReceived in deleted_last_received
    ):
        # Restore deleted alert
        deleted_last_received.remove(delete_alert.lastReceived)
    elif (
        delete_alert.restore is False
        and delete_alert.lastReceived not in deleted_last_received
    ):
        # Delete the alert if it's not already deleted (wtf basically, shouldn't happen)
        deleted_last_received.append(delete_alert.lastReceived)

    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # copy of this file; confirm this check is meant to run for restores too.
    if delete_alert.lastReceived not in assignees_last_receievd:
        # auto-assign the deleting user to the alert
        assignees_last_receievd[delete_alert.lastReceived] = user_email

    # overwrite the enrichment
    # The same action type and description are recorded for deletes and restores.
    enrichment_bl = EnrichmentsBl(tenant_id)
    enrichment_bl.enrich_entity(
        fingerprint=delete_alert.fingerprint,
        enrichments={
            "deletedAt": deleted_last_received,
            "assignees": assignees_last_receievd,
        },
        action_type=ActionType.DELETE_ALERT,
        action_description=f"Alert deleted by {user_email}",
        action_callee=user_email,
    )

    logger.info(
        "Deleted alert successfully",
        extra={
            "tenant_id": tenant_id,
            "restore": delete_alert.restore,
            "fingerprint": delete_alert.fingerprint,
        },
    )
    return {"status": "ok"}
def discard_future(
    trace_id: str,
    future: Future,
    running_tasks: set,
    started_time: float,
):
    """
    Done-callback for an event-processing future.

    Removes the future from ``running_tasks``, logs how the task ended
    (success, failure or cancellation) together with its processing time, and
    decrements the running-task gauges.

    The gauges are decremented in a ``finally`` block so that each finished
    task is counted exactly once, even when the bookkeeping or logging above
    raises. Decrementing both in the happy path *and* in the error handler
    would subtract twice for a single task.

    Args:
        trace_id: Trace id attached to every log record.
        future: The finished future.
        running_tasks: Set of in-flight futures the future is removed from.
        started_time: ``time.time()`` value captured when the task was created.
    """
    try:
        running_tasks.discard(future)

        # Log any exception that occurred in the future
        try:
            exception = future.exception()
            if exception:
                logger.error(
                    "Task failed with exception",
                    extra={
                        "trace_id": trace_id,
                        "error": str(exception),
                        "processing_time": time.time() - started_time,
                    },
                )
            else:
                logger.info(
                    "Task completed",
                    extra={
                        "processing_time": time.time() - started_time,
                        "trace_id": trace_id,
                    },
                )
        except concurrent.futures.CancelledError:
            # Future.exception() raises instead of returning for cancelled futures.
            logger.error(
                "Task was cancelled",
                extra={
                    "trace_id": trace_id,
                    "processing_time": time.time() - started_time,
                },
            )
    except Exception:
        logger.exception(
            "Error in discard_future callback",
            extra={
                "trace_id": trace_id,
            },
        )
    finally:
        # Single place where the counters go down: once per finished task.
        try:
            running_tasks_gauge.dec()
            running_tasks_by_process_gauge.labels(pid=os.getpid()).dec()
        except Exception:
            logger.exception(
                "Failed to decrement running task gauges",
                extra={
                    "trace_id": trace_id,
                },
            )
create_process_event_task( tenant_id: str, provider_type: str | None, provider_id: str | None, fingerprint: str, api_key_name: str | None, trace_id: str, event: AlertDto | list[AlertDto] | dict, running_tasks: set, ) -> str: logger.info("Adding task", extra={"trace_id": trace_id}) started_time = time.time() running_tasks_gauge.inc() # Increase total counter running_tasks_by_process_gauge.labels( pid=os.getpid() ).inc() # Increase process counter future = process_event_executor.submit( process_event, {}, # ctx tenant_id, provider_type, provider_id, fingerprint, api_key_name, trace_id, event, ) running_tasks.add(future) future.add_done_callback( lambda task: discard_future(trace_id, task, running_tasks, started_time) ) logger.info("Task added", extra={"trace_id": trace_id}) return str(id(future)) @router.post( "/event", description="Receive a generic alert event", response_model=AlertDto | list[AlertDto], status_code=202, ) async def receive_generic_event( event: AlertDto | list[AlertDto] | dict, request: Request, provider_id: str | None = None, fingerprint: str | None = None, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:alert"]) ), ): """ A generic webhook endpoint that can be used by any provider to send alerts to Keep. Args: alert (AlertDto | list[AlertDto]): The alert(s) to be sent to Keep. bg_tasks (BackgroundTasks): Background tasks handler. tenant_id (str, optional): Defaults to Depends(verify_api_key). 
""" running_tasks: set = request.state.background_tasks if REDIS: redis: ArqRedis = await get_pool() job = await redis.enqueue_job( "process_event_in_worker", authenticated_entity.tenant_id, None, provider_id, fingerprint, authenticated_entity.api_key_name, request.state.trace_id, event, _queue_name=KEEP_ARQ_QUEUE_BASIC, ) logger.info( "Enqueued job", extra={ "job_id": job.job_id, "tenant_id": authenticated_entity.tenant_id, "queue": KEEP_ARQ_QUEUE_BASIC, }, ) task_name = job.job_id else: task_name = create_process_event_task( authenticated_entity.tenant_id, None, provider_id, fingerprint, authenticated_entity.api_key_name, request.state.trace_id, event, running_tasks, ) return JSONResponse(content={"task_name": task_name}, status_code=202) # https://learn.netdata.cloud/docs/alerts-&-notifications/notifications/centralized-cloud-notifications/webhook#challenge-secret @router.get( "/event/netdata", description="Helper function to complete Netdata webhook challenge", ) async def webhook_challenge(): try: token = Request.query_params.get("token").encode("ascii") except Exception as e: logger.exception("Failed to get token", extra={"error": str(e)}) raise HTTPException(status_code=400, detail="Bad request: failed to get token") KEY = "keep-netdata-webhook-integration" # creates HMAC SHA-256 hash from incomming token and your consumer secret sha256_hash_digest = hmac.new( KEY.encode(), msg=token, digestmod=hashlib.sha256 ).digest() # construct response data with base64 encoded hash response = { "response_token": "sha256=" + base64.b64encode(sha256_hash_digest).decode("ascii") } return json.dumps(response) @router.post( "/event/{provider_type}", description="Receive an alert event from a provider", status_code=202, ) async def receive_event( provider_type: str, request: Request, provider_id: str | None = None, provider_name: str | None = None, fingerprint: str | None = None, event=Depends(extract_generic_body), authenticated_entity: AuthenticatedEntity = Depends( 
IdentityManagerFactory.get_auth_verifier(["write:alert"]) ), ) -> dict[str, str]: trace_id = request.state.trace_id running_tasks: set = request.state.background_tasks provider_class = None try: t = time.time() logger.debug(f"Getting provider class for {provider_type}") provider_class = ProvidersFactory.get_provider_class(provider_type) logger.debug( "Got provider class", extra={ "provider_type": provider_type, "time": time.time() - t, }, ) except ModuleNotFoundError: raise HTTPException( status_code=400, detail=f"Provider {provider_type} not found" ) if not provider_class: raise HTTPException( status_code=400, detail=f"Provider {provider_type} not found" ) # Parse the raw body t = time.time() logger.debug("Parsing event raw body") try: event = provider_class.parse_event_raw_body(event) except Exception: logger.exception( "Failed to parse event raw body", extra={"tenant_id": authenticated_entity.tenant_id, "event": event}, ) raise HTTPException(status_code=400, detail="Malformed event") logger.debug("Parsed event raw body", extra={"time": time.time() - t}) # If provider_name is provided, try to get provider_id from it if provider_name and not provider_id: provider = get_provider_by_name(authenticated_entity.tenant_id, provider_name) if not provider or provider.type != provider_type: raise HTTPException( status_code=404, detail=f"Provider with name '{provider_name}' not found", ) provider_id = provider.id if REDIS: redis: ArqRedis = await get_pool() job = await redis.enqueue_job( "process_event_in_worker", authenticated_entity.tenant_id, provider_type, provider_id, fingerprint, authenticated_entity.api_key_name, trace_id, event, _queue_name=KEEP_ARQ_QUEUE_BASIC, ) logger.info( "Enqueued job", extra={ "job_id": job.job_id, "tenant_id": authenticated_entity.tenant_id, "queue": KEEP_ARQ_QUEUE_BASIC, }, ) task_name = job.job_id else: task_name = create_process_event_task( authenticated_entity.tenant_id, provider_type, provider_id, fingerprint, 
@router.get(
    "/{fingerprint}",
    description="Get alert by fingerprint",
)
def get_alert(
    fingerprint: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> AlertDto:
    """
    Return the latest occurrence of the alert with the given fingerprint.

    The alert is looked up directly by fingerprint instead of loading and
    converting the tenant's most recent alerts and scanning them, which both
    did far more work than needed and reported 404 for any alert that was not
    among the most recent ones.

    Args:
        fingerprint: Fingerprint of the alert to fetch.
        authenticated_entity: Caller identity (requires ``read:alert``).

    Returns:
        The alert as an ``AlertDto``.

    Raises:
        HTTPException: 404 when no alert with this fingerprint exists.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Fetching alert",
        extra={
            "fingerprint": fingerprint,
            "tenant_id": tenant_id,
        },
    )
    # Same lookup + conversion the enrichment handlers in this module use to
    # obtain the current state of a single alert.
    db_alerts = get_alerts_by_fingerprint(
        tenant_id=tenant_id, fingerprint=fingerprint, limit=1
    )
    alerts_dto = convert_db_alerts_to_dto_alerts(db_alerts) if db_alerts else []
    if not alerts_dto:
        raise HTTPException(status_code=404, detail="Alert not found")
    return alerts_dto[0]
Examples for CEL: \"name.contains('CPU')\", \"labels.severity == 'critical'\", \"name.contains('Memory') && labels.region == 'us-east-1'\"", ) def batch_enrich_alerts( enrich_data: BatchEnrichAlertRequestBody, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:alert"]) ), dispose_on_new_alert: Optional[bool] = Query( False, description="Dispose on new alert" ), session: Session = Depends(get_session), ): tenant_id = authenticated_entity.tenant_id logger.info( "Enriching alerts in batch", extra={ "tenant_id": tenant_id, }, ) if ( "dismissed" in enrich_data.enrichments and enrich_data.enrichments["dismissed"].lower() == "true" ): enrich_data.enrichments["status"] = AlertStatus.SUPPRESSED.value if not enrich_data.fingerprints and not enrich_data.cel: raise HTTPException( status_code=400, detail="Either fingerprints or cel must be provided" ) if enrich_data.fingerprints and enrich_data.cel: raise HTTPException( status_code=400, detail="Either fingerprints or cel can be provided at once" ) # If CEL is provided, use it to find matching alerts if enrich_data.cel: logger.info( "Enriching alerts by CEL query", extra={ "cel": enrich_data.cel, "tenant_id": tenant_id, }, ) try: db_alerts, total_count = query_last_alerts( tenant_id=tenant_id, query=QueryDto(cel=enrich_data.cel), ) if not db_alerts: logger.info( "No alerts found matching the CEL query", extra={"cel": enrich_data.cel, "tenant_id": tenant_id}, ) return { "status": "ok", "message": "No alerts matched the query", } fingerprints = [alert.fingerprint for alert in db_alerts] logger.info( "Found alerts matching CEL query", extra={ "cel": enrich_data.cel, "tenant_id": tenant_id, "alert_count": total_count, }, ) except CelToSqlException as e: logger.exception( f'Error parsing CEL expression "{enrich_data.cel}". 
{str(e)}' ) raise HTTPException( status_code=400, detail=f"Error parsing CEL expression: {enrich_data.cel}", ) from e except Exception as e: logger.exception("Failed to process CEL query", extra={"error": str(e)}) return {"status": "failed", "message": str(e)} else: # Use the provided fingerprints fingerprints = enrich_data.fingerprints logger.info( "Enriching alerts batch", extra={ "fingerprints": fingerprints, "tenant_id": tenant_id, }, ) # Common enrichment processing try: enrichment_bl = EnrichmentsBl(tenant_id, db=session) ( action_type, action_description, should_run_workflow, should_check_incidents_resolution, ) = enrichment_bl.get_enrichment_metadata( enrich_data.enrichments, authenticated_entity ) enrichments = deepcopy(enrich_data.enrichments) enrichment_bl.batch_enrich( fingerprints=fingerprints, enrichments=enrichments, action_type=action_type, action_callee=authenticated_entity.email, action_description=action_description, dispose_on_new_alert=dispose_on_new_alert, ) last_alerts = get_last_alerts_by_fingerprints( tenant_id, fingerprints, session=session ) alert_ids = [last_alert.alert_id for last_alert in last_alerts] if dispose_on_new_alert: # Create instance-wide enrichment for history # For better database-native UUID support formatted_alert_ids = [ UUIDType(binary=False).process_bind_param( alert_id, session.bind.dialect ) for alert_id in alert_ids ] enrichment_bl.batch_enrich( fingerprints=formatted_alert_ids, enrichments=enrichments, action_type=action_type, action_callee=authenticated_entity.email, action_description=action_description, audit_enabled=False, ) alerts = get_alerts_by_ids(tenant_id, alert_ids, session=session) enriched_alerts_dto = convert_db_alerts_to_dto_alerts(alerts, session=session) # push the enriched alert to the elasticsearch try: logger.info("Pushing enriched alerts to elasticsearch") elastic_client = ElasticClient(tenant_id) elastic_client.index_alerts( alerts=enriched_alerts_dto, ) logger.info("Pushed enriched alerts to 
@router.post(
    "/enrich",
    description="Enrich an alert",
)
def enrich_alert(
    enrich_data: EnrichAlertRequestBody,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:alert"])
    ),
    dispose_on_new_alert: Optional[bool] = Query(
        False, description="Dispose on new alert"
    ),
    session: Session = Depends(get_session),
) -> dict[str, str]:
    """
    Enrich a single alert and hand the work over to ``_enrich_alert``.

    Args:
        enrich_data: Fingerprint of the alert plus the enrichments to apply.
        authenticated_entity: Caller identity (requires ``write:alert``).
        dispose_on_new_alert: Forwarded to ``_enrich_alert`` unchanged.
        session: Database session forwarded to ``_enrich_alert``.

    Returns:
        The status dictionary produced by ``_enrich_alert``.
    """
    requested = enrich_data.enrichments
    # A "dismissed" enrichment whose value is "true" (any casing) also sets
    # the status enrichment to suppressed.
    if "dismissed" in requested:
        if requested["dismissed"].lower() == "true":
            requested["status"] = AlertStatus.SUPPRESSED.value

    tenant_id = authenticated_entity.tenant_id
    log_context = {
        "fingerprint": enrich_data.fingerprint,
        "tenant_id": tenant_id,
    }
    logger.info("Enriching alert", extra=log_context)

    result = _enrich_alert(
        enrich_data,
        authenticated_entity=authenticated_entity,
        dispose_on_new_alert=dispose_on_new_alert,
        session=session,
    )
    return result
@router.post(
    "/unenrich",
    description="Un-Enrich an alert",
)
def unenrich_alert(
    enrich_data: UnEnrichAlertRequestBody,
    pusher_client: Pusher = Depends(get_pusher_client),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:alert"])
    ),
) -> dict[str, str]:
    """
    Remove the requested enrichment keys from an alert.

    The alert's stored enrichments are re-written without the keys listed in
    ``enrich_data.enrichments``; the refreshed alert is then indexed in
    Elasticsearch and connected clients are told to poll (both best-effort).

    Args:
        enrich_data: Fingerprint of the alert and the enrichment keys to drop.
        pusher_client: Optional Pusher client used to notify clients.
        authenticated_entity: Caller identity (requires ``write:alert``).

    Returns:
        ``{"status": "ok"}`` on success, ``{"status": "failed"}`` when
        ``assignees`` is among the keys, the alert or its enrichments cannot
        be found, or an unexpected error occurs.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Un-Enriching alert",
        extra={
            "fingerprint": enrich_data.fingerprint,
            "tenant_id": tenant_id,
        },
    )

    # Requests that try to remove "assignees" are rejected outright.
    if "assignees" in enrich_data.enrichments:
        return {"status": "failed"}

    alert = get_alerts_by_fingerprint(
        authenticated_entity.tenant_id, enrich_data.fingerprint, limit=1
    )
    if not alert:
        logger.warning(
            "Alert not found", extra={"fingerprint": enrich_data.fingerprint}
        )
        return {"status": "failed"}

    try:
        enrichement_bl = EnrichmentsBl(tenant_id)
        # Pick the audit action from the first recognised key being removed.
        if "status" in enrich_data.enrichments:
            action_type = ActionType.STATUS_UNENRICH
            action_description = (
                f"Alert status was un-enriched by {authenticated_entity.email}"
            )
        elif "note" in enrich_data.enrichments:
            action_type = ActionType.UNCOMMENT
            action_description = f"Comment removed by {authenticated_entity.email}"
        elif "ticket_url" in enrich_data.enrichments:
            action_type = ActionType.TICKET_UNASSIGNED
            action_description = f"Ticket unassigned by {authenticated_entity.email}"
        else:
            action_type = ActionType.GENERIC_UNENRICH
            action_description = f"Alert un-enriched by {authenticated_entity.email}"

        enrichments_object = get_enrichment(tenant_id, enrich_data.fingerprint)
        if not enrichments_object:
            # Nothing stored for this fingerprint: report the failure
            # explicitly instead of tripping over a missing object below.
            logger.warning(
                "Enrichment not found",
                extra={"fingerprint": enrich_data.fingerprint, "tenant_id": tenant_id},
            )
            return {"status": "failed"}
        enrichments = enrichments_object.enrichments

        new_enrichments = {
            key: value
            for key, value in enrichments.items()
            if key not in enrich_data.enrichments
        }

        # force=True is passed so the reduced dict is written as given;
        # NOTE(review): presumably this replaces rather than merges — confirm
        # against EnrichmentsBl.enrich_entity.
        enrichement_bl.enrich_entity(
            fingerprint=enrich_data.fingerprint,
            enrichments=new_enrichments,
            action_type=action_type,
            action_callee=authenticated_entity.email,
            action_description=action_description,
            force=True,
        )

        alert = get_alerts_by_fingerprint(
            authenticated_entity.tenant_id, enrich_data.fingerprint, limit=1
        )

        enriched_alerts_dto = convert_db_alerts_to_dto_alerts(alert)
        # push the enriched alert to the elasticsearch
        try:
            logger.info("Pushing enriched alert to elasticsearch")
            elastic_client = ElasticClient(tenant_id)
            elastic_client.index_alert(
                alert=enriched_alerts_dto[0],
            )
            logger.info("Pushed un-enriched alert to elasticsearch")
        except Exception:
            # Best-effort: indexing failures must not fail the request.
            logger.exception("Failed to push alert to elasticsearch")

        # use pusher to push the enriched alert to the client
        if pusher_client:
            logger.info("Telling client to poll alerts")
            try:
                pusher_client.trigger(
                    f"private-{tenant_id}",
                    "poll-alerts",
                    "{}",
                )
                logger.info("Told client to poll alerts")
            except Exception:
                # Best-effort: notification failures must not fail the request.
                logger.exception("Failed to tell client to poll alerts")

        logger.info(
            "Alert un-enriched successfully",
            extra={"fingerprint": enrich_data.fingerprint, "tenant_id": tenant_id},
        )
        return {"status": "ok"}

    except Exception as e:
        logger.exception("Failed to un-enrich alert", extra={"error": str(e)})
        return {"status": "failed"}
@router.post(
    "/audit",
    description="Get alert timeline audit trail for multiple fingerprints",
)
def get_multiple_fingerprint_alert_audit(
    fingerprints: list[str],
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> list[AlertAuditDto]:
    """
    Return the audit trail for several alert fingerprints in one call.

    Audit rows are bucketed per fingerprint (in first-seen order) and each
    bucket is converted separately with ``AlertAuditDto.from_orm_list``; the
    converted buckets are concatenated into one flat list.

    Raises:
        HTTPException: 404 when no audit rows exist for the fingerprints.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Fetching alert audit",
        extra={"fingerprints": fingerprints, "tenant_id": tenant_id},
    )

    audit_rows = get_alert_audit_db(tenant_id, fingerprints)
    if not audit_rows:
        raise HTTPException(status_code=404, detail="Alert not found")

    # Bucket rows by fingerprint so each alert's timeline is converted on its
    # own (the original comment ties this to the "2x, 3x" deduplication view).
    rows_by_fingerprint = {}
    for row in audit_rows:
        rows_by_fingerprint.setdefault(row.fingerprint, []).append(row)

    return [
        event
        for rows in rows_by_fingerprint.values()
        for event in AlertAuditDto.from_orm_list(rows)
    ]
@router.post(
    "/event/error/dismiss",
    description="Dismiss error alerts. If alert_id is provided, dismisses that specific alert. If no alert_id is provided, dismisses all alerts.",
)
def dismiss_error_alerts(
    request: DismissAlertRequest = None,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:alert"])
    ),
) -> dict:
    """
    Mark error alerts as dismissed for the caller's tenant.

    With a truthy ``request.alert_id`` only that alert is dismissed;
    otherwise (no body, or no alert id) the DB helper is called without an
    ``alert_id``, i.e. for the whole tenant.

    Returns:
        A dict with ``success`` and a human-readable ``message``.
    """
    tenant_id = authenticated_entity.tenant_id
    alert_id = request.alert_id if request else None

    # Bulk path: nothing specific was requested.
    if not alert_id:
        tenant_extra = {"tenant_id": tenant_id}
        logger.info("Dismissing all error alerts for tenant", extra=tenant_extra)
        dismiss_error_alerts_db(
            tenant_id=tenant_id, dismissed_by=authenticated_entity.email
        )
        logger.info("Successfully dismissed all error alerts", extra=tenant_extra)
        return {"success": True, "message": "Successfully dismissed all alerts"}

    # Single-alert path.
    alert_extra = {"tenant_id": tenant_id, "alert_id": alert_id}
    logger.info("Dismissing specific error alert", extra=alert_extra)
    dismiss_error_alerts_db(
        tenant_id=tenant_id,
        alert_id=alert_id,
        dismissed_by=authenticated_entity.email,
    )
    logger.info("Successfully dismissed an error alert", extra=alert_extra)
    return {"success": True, "message": "Alert dismissed successfully"}
@router.put("/{group_name}", description="Update a group")
def update_group(
    group_name: str,
    group: CreateOrUpdateGroupRequest,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:settings"])
    ),
):
    """
    Update a group's members and roles through the tenant's identity manager.

    NOTE(review): the ``group_name`` path parameter is accepted but not used;
    the group handed to the identity manager is identified by ``group.name``
    from the request body. Confirm whether the path value should take
    precedence before changing this.

    Returns:
        Whatever the identity manager's ``update_group`` returns.
    """
    tenant_id = authenticated_entity.tenant_id
    manager = IdentityManagerFactory.get_identity_manager(tenant_id)
    result = manager.update_group(group.name, group.members, group.roles)
    return result
@router.get("", description="Get resources permissions")
def get_permissions(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:settings"])
    ),
) -> List[ResourcePermission]:
    """
    List resource permissions for the caller's tenant.

    Permissions whose ``resource_type`` contains ``keep_alert`` or
    ``keep_route`` are left out of the response.

    Returns:
        The filtered permissions, or an empty list when the identity manager
        fails (the failure is logged with its traceback, not raised).
    """
    identity_manager = IdentityManagerFactory.get_identity_manager(
        authenticated_entity.tenant_id
    )
    try:
        permissions = identity_manager.get_permissions()
    except Exception as e:
        # Deliberately best-effort: respond with "no permissions" instead of
        # an error, but keep the traceback so the failure can be diagnosed.
        logger.exception(f"Failed to get permissions: {e}")
        return []

    # Resource types that are not exposed through this endpoint.
    hidden_resource_types = ("keep_alert", "keep_route")
    return [
        permission
        for permission in permissions
        if not any(
            hidden in permission.resource_type for hidden in hidden_resource_types
        )
    ]
@router.put("/{role_id}", description="Update role")
def update_role(
    role_id: str,
    role: CreateOrUpdateRole = Body(..., description="Role"),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:settings"])
    ),
):
    """
    Update the role identified by ``role_id`` with the submitted definition.

    Returns:
        The value returned by the identity manager's ``update_role``.
    """
    tenant_id = authenticated_entity.tenant_id
    manager = IdentityManagerFactory.get_identity_manager(tenant_id)
    return manager.update_role(role_id, role)
class UpdateUserRequest(BaseModel):
    """
    Request body for updating a user.

    Every field is optional. ``email`` is exposed under the alias
    ``username`` and, per ``Config``, may also be populated by field name.
    """

    email: Optional[str] = Field(alias="username")
    password: Optional[str] = None
    role: Optional[str] = Field(default=None)
    groups: Optional[list[str]] = None

    class Config:
        allow_population_by_field_name = True

    @validator("role", allow_reuse=True)
    def validate_role(cls, v):
        # An empty string is normalised to None; any other value is kept.
        return None if v == "" else v
@router.post("", description="Create a user")
async def create_user(
    request_data: CreateUserRequest,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:settings"])
    ),
):
    """
    Create a user in the caller's tenant via the identity manager.

    Raises:
        HTTPException: 400 when the request carries no e-mail.

    Returns:
        The value returned by the identity manager's ``create_user``.
    """
    tenant_id = authenticated_entity.tenant_id

    # An e-mail is the only mandatory piece of data.
    if not request_data.email:
        raise HTTPException(status_code=400, detail="Email is required")

    manager = IdentityManagerFactory.get_identity_manager(tenant_id)
    return manager.create_user(
        user_email=request_data.email,
        user_name=request_data.name,
        password=request_data.password,
        role=request_data.role,
        groups=request_data.groups,
    )
def provision_dashboards(tenant_id: str):
    """
    Create the dashboards declared in the ``KEEP_DASHBOARDS`` environment variable.

    The variable is expected to hold a JSON list of objects matching
    ``DashboardCreateDTO``. Provisioning is best-effort: an unparsable
    variable, a non-list value, an invalid entry or a failing insert is
    logged and skipped, so one bad dashboard no longer prevents the
    remaining ones from being created.

    Args:
        tenant_id: tenant that will own the provisioned dashboards.
    """
    try:
        dashboards_raw = json.loads(os.environ.get("KEEP_DASHBOARDS", "[]"))
    except Exception:
        logger.exception("Failed to load dashboards from environment variable")
        return

    if not dashboards_raw:
        logger.debug("No dashboards to provision")
        return

    # Valid JSON that is not a list (e.g. a single object or a number) cannot
    # be iterated as dashboard definitions.
    if not isinstance(dashboards_raw, list):
        logger.error(
            "KEEP_DASHBOARDS must be a JSON list of dashboards",
            extra={"value_type": type(dashboards_raw).__name__},
        )
        return

    logger.info(
        "Provisioning Dashboards", extra={"num_of_dashboards": len(dashboards_raw)}
    )

    for dashboard_raw in dashboards_raw:
        # Validate each entry on its own so a malformed definition is skipped
        # rather than raised out of the whole provisioning run.
        try:
            dashboard = DashboardCreateDTO.parse_obj(dashboard_raw)
        except Exception:
            logger.exception("Failed to parse dashboard definition, skipping it")
            continue

        logger.info(
            "Provisioning Dashboard",
            extra={"dashboard_name": dashboard.dashboard_name},
        )
        try:
            # Provisioned dashboards are recorded as created by "system".
            create_dashboard_db(
                tenant_id,
                dashboard.dashboard_name,
                "system",
                dashboard.dashboard_config,
            )
            logger.info(
                "Provisioned Dashboard",
                extra={"dashboard_name": dashboard.dashboard_name},
            )
        except Exception:
            logger.exception(
                "Failed to provision dashboard",
                extra={"dashboard_name": dashboard.dashboard_name},
            )

    logger.info(
        "Provisioned Dashboards", extra={"num_of_dashboards": len(dashboards_raw)}
    )
@router.get("/metric-widgets")
def get_metric_widgets(
    time_stamp: TimeStampFilter = Depends(_get_time_stamp_filter),
    mttr: bool = True,
    apd: bool = True,
    ipd: bool = True,
    wpd: bool = True,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:dashboards"])
    ),
):
    """
    Collect the data for the dashboard metric widgets.

    Each boolean flag toggles one entry of the response:
    ``apd`` -> ``get_provider_distribution``,
    ``ipd`` -> ``get_incidents_created_distribution``,
    ``wpd`` -> ``get_combined_workflow_execution_distribution``,
    ``mttr`` -> ``calc_incidents_mttr``.

    When either bound of ``time_stamp`` is missing, the window defaults to
    the last 24 hours (naive UTC).

    Returns:
        A dict keyed by the enabled widget names.
    """
    data = {}
    tenant_id = authenticated_entity.tenant_id

    if not time_stamp.lower_timestamp or not time_stamp.upper_timestamp:
        # Read the clock once so the default window is exactly 24 hours wide.
        now = datetime.utcnow()
        time_stamp = TimeStampFilter(
            upper_timestamp=now,
            lower_timestamp=now - timedelta(hours=24),
        )

    if apd:
        data["apd"] = get_provider_distribution(
            tenant_id=tenant_id, aggregate_all=True, timestamp_filter=time_stamp
        )
    if ipd:
        data["ipd"] = get_incidents_created_distribution(
            tenant_id=tenant_id, timestamp_filter=time_stamp
        )
    if wpd:
        data["wpd"] = get_combined_workflow_execution_distribution(
            tenant_id=tenant_id, timestamp_filter=time_stamp
        )
    if mttr:
        data["mttr"] = calc_incidents_mttr(
            tenant_id=tenant_id, timestamp_filter=time_stamp
        )
    return data
@router.get(
    "",
    description="Get Deduplications",
)
def get_deduplications(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:deduplications"])
    ),
):
    """
    Return the deduplication rules of the caller's tenant.

    The result of ``AlertDeduplicator.get_deduplications`` is logged as-is
    at info level and then returned unchanged.
    """
    logger.info("Getting deduplications")
    deduplicator = AlertDeduplicator(authenticated_entity.tenant_id)
    rules = deduplicator.get_deduplications()
    # NOTE(review): this logs the full result object as the message.
    logger.info(rules)
    return rules
@router.put(
    "/{rule_id}",
    description="Update Deduplication Rule",
)
def update_deduplication_rule(
    rule_id: str,
    rule: DeduplicationRule,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:deduplications"])
    ),
):
    """
    Update an existing deduplication rule.

    Args:
        rule_id: identifier of the rule to update.
        rule: the new rule definition.
        authenticated_entity: caller identity; its e-mail is passed to the
            deduplicator together with the rule.

    Returns:
        The updated rule as returned by ``AlertDeduplicator``.

    Raises:
        HTTPException: re-raised unchanged when the deduplicator raises one
            (same as the create and delete handlers in this module); any
            other failure is reported as 400 with the error text as detail.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info("Updating deduplication rule", extra={"rule_id": rule_id})
    alert_deduplicator = AlertDeduplicator(tenant_id)
    try:
        updated_rule = alert_deduplicator.update_deduplication_rule(
            rule_id, rule, authenticated_entity.email
        )
        logger.info("Updated deduplication rule")
        return updated_rule
    except HTTPException:
        # Keep the status code chosen by the business layer instead of
        # collapsing it into a generic 400.
        raise
    except Exception as e:
        logger.exception("Error updating deduplication rule")
        raise HTTPException(status_code=400, detail=str(e))
@router.post("", description="Create a new extraction rule")
def create_extraction_rule(
    rule_dto: ExtractionRuleDtoBase,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:extraction"])
    ),
    session: Session = Depends(get_session),
) -> ExtractionRuleDtoOut:
    """
    Persist a new extraction rule for the caller's tenant.

    The rule is built from the submitted DTO plus the caller's e-mail
    (``created_by``) and tenant id, committed, refreshed from the session
    and returned as an ``ExtractionRuleDtoOut``.
    """
    logger.info("Creating a new extraction rule")

    rule_fields = rule_dto.dict()
    rule = ExtractionRule(
        **rule_fields,
        created_by=authenticated_entity.email,
        tenant_id=authenticated_entity.tenant_id,
    )

    session.add(rule)
    session.commit()
    # Refresh so the output DTO is built from the committed row.
    session.refresh(rule)

    persisted_fields = rule.dict()
    return ExtractionRuleDtoOut(**persisted_fields)
@router.delete("/{rule_id}", description="Delete an extraction rule")
def delete_extraction_rule(
    rule_id: int,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:extraction"])
    ),
    session: Session = Depends(get_session),
):
    """
    Delete one extraction rule belonging to the caller's tenant.

    Raises:
        HTTPException: 404 when no rule with ``rule_id`` exists for the tenant.

    Returns:
        A confirmation message dict.
    """
    logger.info("Deleting an extraction rule")

    # Scope the lookup to the tenant so other tenants' rules are never matched.
    tenant_rules = session.query(ExtractionRule).filter(
        ExtractionRule.id == rule_id,
        ExtractionRule.tenant_id == authenticated_entity.tenant_id,
    )
    target = tenant_rules.first()

    if target is not None:
        session.delete(target)
        session.commit()
        return {"message": "Extraction rule deleted successfully"}

    raise HTTPException(status_code=404, detail="Extraction rule not found")
@router.get("/{rule_id}/executions", description="Get all executions for a rule")
def get_enrichment_events(
    rule_id: int,
    limit: int = Query(20),
    offset: int = Query(0),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:extraction"])
    ),
) -> EnrichmentEventPaginatedResultsDto:
    """
    Return one page of extraction enrichment events for a rule.

    Args:
        rule_id: extraction rule whose executions are listed.
        limit: page size (defaults to 20).
        offset: number of events to skip (defaults to 0).

    Returns:
        The requested page together with the total event count for the rule.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Getting enrichment events",
        extra={
            "rule_id": rule_id,
            "limit": limit,
            "offset": offset,
            "tenant_id": tenant_id,
        },
    )

    bl = EnrichmentsBl(tenant_id=tenant_id)
    kind = EnrichmentType.EXTRACTION
    page_items = bl.get_enrichment_events(rule_id, limit, offset, kind)
    total = bl.get_total_enrichment_events(rule_id, kind)

    logger.info(
        "Got enrichment events",
        extra={"events_count": len(page_items)},
    )

    page = {
        "count": total,
        "items": page_items,
        "limit": limit,
        "offset": offset,
    }
    return EnrichmentEventPaginatedResultsDto(**page)
rule_id, "enrichment_event_id": enrichment_event_id, "tenant_id": authenticated_entity.tenant_id, }, ) enrichment_bl = EnrichmentsBl(tenant_id=authenticated_entity.tenant_id) enrichment_event = enrichment_bl.get_enrichment_event(enrichment_event_id) logs = enrichment_bl.get_enrichment_event_logs(enrichment_event_id) if not logs: raise HTTPException(status_code=404, detail="Logs not found") logger.info( "Got enrichment event logs", extra={"logs_count": len(logs)}, ) return EnrichmentEventWithLogs( enrichment_event=enrichment_event, logs=logs, ) ================================================ FILE: keep/api/routes/facets.py ================================================ import logging from fastapi import ( APIRouter, Depends, HTTPException, ) import keep.api.core.facets as facets from keep.api.models.facet import CreateFacetDto, FacetDto from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory router = APIRouter() logger = logging.getLogger(__name__) # Mapping of entity name to entity type # TODO: Maybe we need to migrate current facets to match endpoint entity names entity_name_to_entity_type = { "incidents": "incident", "alerts": "alert", "workflows": "workflow", } @router.post( "", description="Add facet for {entity_name}", ) async def add_facet( entity_name: str, create_facet_dto: CreateFacetDto, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ) ) -> FacetDto: if entity_name not in entity_name_to_entity_type: raise HTTPException(status_code=409, detail="Entity not found") entity_type = entity_name_to_entity_type[entity_name] tenant_id = authenticated_entity.tenant_id logger.info( "Creating facet for incident", extra={ "tenant_id": tenant_id, }, ) created_facet = facets.create_facet( tenant_id=tenant_id, entity_type=entity_type, facet=create_facet_dto ) return created_facet @router.delete( 
"/{facet_id}", description="Delete facet for {enity_name}", ) async def delete_facet( facet_id: str, entity_name: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ) ): if entity_name not in entity_name_to_entity_type: raise HTTPException(status_code=409, detail="Entity not found") entity_type = entity_name_to_entity_type[entity_name] tenant_id = authenticated_entity.tenant_id logger.info( "Deleting facet for incident", extra={ "tenant_id": tenant_id, "facet_id": facet_id, }, ) is_deleted = facets.delete_facet( tenant_id=tenant_id, entity_type=entity_type, facet_id=facet_id ) if not is_deleted: raise HTTPException(status_code=404, detail="Facet not found") ================================================ FILE: keep/api/routes/healthcheck.py ================================================ from fastapi import APIRouter router = APIRouter() @router.get("", description="simple healthcheck endpoint") def healthcheck() -> dict: """ Does nothing but return 200 response code Returns: dict: empty JSON object """ return {} ================================================ FILE: keep/api/routes/incidents.py ================================================ import logging from typing import List, Optional from uuid import UUID from arq import ArqRedis from fastapi import ( APIRouter, BackgroundTasks, Body, Depends, HTTPException, Query, Request, Response, ) from pusher import Pusher from sqlmodel import Session from keep.api.arq_pool import get_pool from keep.api.bl.ai_suggestion_bl import AISuggestionBl from keep.api.bl.enrichments_bl import EnrichmentsBl from keep.api.bl.incident_reports import IncidentReportsBl from keep.api.bl.incidents_bl import IncidentBl from keep.api.consts import KEEP_ARQ_QUEUE_BASIC, REDIS from keep.api.core.cel_to_sql.sql_providers.base import CelToSqlException from keep.api.core.db import ( DestinationIncidentNotFound, add_audit, confirm_predicted_incident_by_id, 
get_future_incidents_by_incident_id,
    get_incident_alerts_and_links_by_incident_id,
    get_incident_by_id,
    get_incidents_meta_for_tenant,
    get_last_alerts,
    get_rule,
    get_session,
    get_workflow_executions_for_incident_or_alert,
    merge_incidents_to_id,
    get_enrichment,
)
from keep.api.core.dependencies import extract_generic_body, get_pusher_client
from keep.api.core.incidents import (
    get_incident_facets,
    get_incident_facets_data,
    get_incident_potential_facet_fields,
)
from keep.api.models.action_type import ActionType
from keep.api.models.alert import (
    AlertDto,
    EnrichIncidentRequestBody,
    UnEnrichIncidentRequestBody,
)
from keep.api.models.db.alert import (
    AlertAudit,
    CommentMention,
)
from keep.api.models.db.incident import IncidentSeverity, IncidentStatus
from keep.api.models.facet import FacetOptionsQueryDto
from keep.api.models.incident import (
    IncidentCommit,
    IncidentDto,
    IncidentDtoIn,
    IncidentListFilterParamsDto,
    IncidentsClusteringSuggestion,
    IncidentSeverityChangeDto,
    IncidentSorting,
    IncidentStatusChangeDto,
    MergeIncidentsRequestDto,
    MergeIncidentsResponseDto,
    SplitIncidentRequestDto,
    SplitIncidentResponseDto,
)
from keep.api.models.workflow import WorkflowExecutionDTO
from keep.api.tasks.process_incident_task import process_incident
from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts
from keep.api.utils.pagination import (
    AlertWithIncidentLinkMetadataPaginatedResultsDto,
    IncidentsPaginatedResultsDto,
    WorkflowExecutionsPaginatedResultsDto,
)
from keep.api.utils.pluralize import pluralize
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory
from keep.providers.providers_factory import ProvidersFactory
from keep.topologies.topologies_service import TopologiesService  # noqa

router = APIRouter()
logger = logging.getLogger(__name__)


@router.post(
    "",
    description="Create new incident",
    status_code=202,
    response_model=IncidentDto,
)
def create_incident(
    incident_dto: IncidentDtoIn,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    pusher_client: Pusher | None = Depends(get_pusher_client),
    session: Session = Depends(get_session),
) -> IncidentDto:
    """Create an incident for the caller's tenant via ``IncidentBl.create_incident``."""
    tenant_id = authenticated_entity.tenant_id
    incident_bl = IncidentBl(tenant_id, session, pusher_client)
    return incident_bl.create_incident(incident_dto)


@router.get(
    "/meta",
    description="Get incidents' metadata for filtering",
    response_model=IncidentListFilterParamsDto,
)
def get_incidents_meta(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> IncidentListFilterParamsDto:
    """Return the tenant's incident metadata wrapped in an ``IncidentListFilterParamsDto``."""
    tenant_id = authenticated_entity.tenant_id
    meta = get_incidents_meta_for_tenant(tenant_id=tenant_id)
    return IncidentListFilterParamsDto(**meta)


@router.get(
    "",
    description="Get last incidents",
)
def get_all_incidents(
    candidate: bool = False,
    predicted: Optional[bool] = None,
    limit: int = 25,
    offset: int = 0,
    sorting: IncidentSorting = IncidentSorting.creation_time,
    status: List[IncidentStatus] = Query(None),
    severity: List[IncidentSeverity] = Query(None),
    assignees: List[str] = Query(None),
    sources: List[str] = Query(None),
    affected_services: List[str] = Query(None),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
    cel: str = Query(None),
) -> IncidentsPaginatedResultsDto:
    """
    Query the tenant's incidents with pagination, sorting and an optional CEL filter.

    Args:
        candidate: forwarded to the query as ``is_candidate``.
        predicted: forwarded to the query as ``is_predicted``.
        limit: page size.
        offset: number of incidents to skip.
        sorting: sort order for the result.
        status, severity, assignees, sources, affected_services: collected
            into ``filters`` (see the review note below).
        authenticated_entity: caller identity; scopes tenant and permissions.
        cel: optional CEL expression forwarded to the query.

    Raises:
        HTTPException: 400 when the CEL expression cannot be converted
            (``CelToSqlException``).

    Returns:
        IncidentsPaginatedResultsDto: the result of ``IncidentBl.query_incidents``.
    """
    tenant_id = authenticated_entity.tenant_id

    # NOTE(review): `filters` is only written to the log records below; it is
    # not passed to query_incidents — confirm whether that is intentional.
    filters = {}
    if status:
        filters["status"] = [s.value for s in status]
    if severity:
        filters["severity"] = [s.order for s in severity]
    if assignees:
        filters["assignee"] = assignees
    if sources:
        filters["sources"] = sources
    if affected_services:
        filters["affected_services"] = affected_services

    logger.info(
        "Fetching incidents from DB",
        extra={
            "tenant_id": tenant_id,
            "limit": limit,
            "offset": offset,
            "sorting": sorting,
            "filters": filters,
        },
    )

    # get all preset ids that the user has access to
    # NOTE(review): the comments here say "preset", but the lookup below is
    # for resource_type="incident".
    identity_manager = IdentityManagerFactory.get_identity_manager(
        authenticated_entity.tenant_id
    )
    # Note: if no limitations (allowed_preset_ids is []), then all presets are allowed
    allowed_incident_ids = identity_manager.get_user_permission_on_resource_type(
        resource_type="incident",
        authenticated_entity=authenticated_entity,
    )

    incident_bl = IncidentBl(tenant_id, session=None, pusher_client=None)

    try:
        result = incident_bl.query_incidents(
            tenant_id=tenant_id,
            is_candidate=candidate,
            is_predicted=predicted,
            limit=limit,
            offset=offset,
            sorting=sorting,
            cel=cel,
            allowed_incident_ids=allowed_incident_ids,
        )
        logger.info(
            "Fetched incidents from DB",
            extra={
                "tenant_id": tenant_id,
                "limit": limit,
                "offset": offset,
                "sorting": sorting,
                "filters": filters,
            },
        )
        return result
    except CelToSqlException as e:
        logger.exception(f'Error parsing CEL expression "{cel}". {str(e)}')
        raise HTTPException(
            status_code=400, detail=f"Error parsing CEL expression: {cel}"
        ) from e


@router.post(
    "/facets/options",
    description="Query incident facet options. Accepts dictionary where key is facet id and value is cel to query facet",
)
def fetch_inicident_facet_options(
    facet_options_query: FacetOptionsQueryDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> dict:
    """
    Return facet options for incidents, restricted to what the caller may see.

    Returns:
        dict: the result of ``get_incident_facets_data`` for the given query.
    """
    tenant_id = authenticated_entity.tenant_id

    logger.info(
        "Fetching incident facets from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )

    # get all preset ids that the user has access to
    # NOTE(review): the comments here say "preset", but the lookup below is
    # for resource_type="incident".
    identity_manager = IdentityManagerFactory.get_identity_manager(
        authenticated_entity.tenant_id
    )
    # Note: if no limitations (allowed_preset_ids is []), then all presets are allowed
    allowed_incident_ids = identity_manager.get_user_permission_on_resource_type(
        resource_type="incident",
        authenticated_entity=authenticated_entity,
    )

    facet_options = get_incident_facets_data(
        tenant_id=tenant_id,
        allowed_incident_ids=allowed_incident_ids,
        facet_options_query=facet_options_query,
    )

    logger.info(
        "Fetched incident facets from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )

    return facet_options


@router.get(
    "/facets",
    description="Get incident facets",
)
def fetch_inicident_facets(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> list:
    """Return the incident facets of the caller's tenant."""
    tenant_id = authenticated_entity.tenant_id

    logger.info(
        "Fetching incident facets from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )

    facets = get_incident_facets(tenant_id=tenant_id)

    logger.info(
        "Fetched incident facets from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )

    return facets


@router.get(
    "/facets/fields",
    description="Get potential fields for incident facets",
)
def fetch_alert_facet_fields(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
) -> list:
    """
    Return the fields that can be used to build incident facets.

    Despite the "alert" in the function name, this calls
    ``get_incident_potential_facet_fields``.
    """
    tenant_id = authenticated_entity.tenant_id

    logger.info(
        "Fetching incident facet fields from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )

    fields = get_incident_potential_facet_fields(tenant_id=tenant_id)

    logger.info(
        "Fetched incident facet fields from DB",
        extra={
            "tenant_id": tenant_id,
        },
    )
    return fields


@router.get(
    "/report",
    description="Get incidents report",
)
def get_incidents_report(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
    cel: str = Query(None),
):
    """
    Build the incident report for the caller's tenant, optionally filtered by CEL.

    Raises:
        HTTPException: 400 when the CEL expression cannot be converted
            (``CelToSqlException``).
    """
    tenant_id = authenticated_entity.tenant_id
    reports_bl = IncidentReportsBl(tenant_id)
    # get all preset ids that the user has access to
    # NOTE(review): the comments here say "preset", but the lookup below is
    # for resource_type="incident".
    identity_manager = IdentityManagerFactory.get_identity_manager(
        authenticated_entity.tenant_id
    )
    # Note: if no limitations (allowed_preset_ids is []), then all presets are allowed
    allowed_incident_ids = identity_manager.get_user_permission_on_resource_type(
        resource_type="incident",
        authenticated_entity=authenticated_entity,
    )
    try:
        return reports_bl.get_incident_reports(
            incidents_query_cel=cel, allowed_incident_ids=allowed_incident_ids
        )
    except CelToSqlException as e:
        logger.exception(f'Error parsing CEL expression "{cel}". {str(e)}')
        raise HTTPException(
            status_code=400, detail=f"Error parsing CEL expression: {cel}"
        )


@router.get(
    "/{incident_id}",
    description="Get incident by id",
)
def get_incident(
    incident_id: UUID,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:incident"])
    ),
) -> IncidentDto:
    """
    Return one incident of the caller's tenant as a DTO.

    Raises:
        HTTPException: 404 when the incident does not exist for the tenant.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Fetching incident",
        extra={
            "incident_id": incident_id,
            "tenant_id": tenant_id,
        },
    )
    incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id)
    if not incident:
        raise HTTPException(status_code=404, detail="Incident not found")

    # The rule is loaded only when the incident references one; it is passed
    # to the DTO builder either way (None when absent).
    rule = None
    if incident.rule_id:
        rule = get_rule(tenant_id, incident.rule_id)

    incident_dto = IncidentDto.from_db_incident(incident, rule)

    return incident_dto


@router.put(
    "/{incident_id}",
    description="Update incident by id",
)
def update_incident(
    incident_id: UUID,
    updated_incident_dto: IncidentDtoIn,
    generated_by_ai: bool = Query(
        default=False,
        alias="generatedByAi",
description="Whether the incident update request was generated by AI", ), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), session: Session = Depends(get_session), ) -> IncidentDto: tenant_id = authenticated_entity.tenant_id incident_bl = IncidentBl(tenant_id, session=session, pusher_client=pusher_client) current_incident = get_incident_by_id(tenant_id, incident_id) if not current_incident: raise HTTPException(status_code=404, detail="Incident not found") if ( updated_incident_dto.assignee and current_incident.assignee != updated_incident_dto.assignee ): add_audit( tenant_id, str(incident_id), authenticated_entity.email, ActionType.INCIDENT_ASSIGN, f"Incident assigned to {updated_incident_dto.assignee}", ) new_incident_dto = incident_bl.update_incident( incident_id, updated_incident_dto, generated_by_ai ) return new_incident_dto @router.delete( "/bulk", description="Delete incidents in bulk", ) def bulk_delete_incidents( incident_ids: List[UUID] = Body(..., embed=True), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), session: Session = Depends(get_session), ): tenant_id = authenticated_entity.tenant_id incident_bl = IncidentBl(tenant_id, session, pusher_client) incident_bl.bulk_delete_incidents(incident_ids) return Response(status_code=202) @router.delete( "/{incident_id}", description="Delete incident by incident id", ) def delete_incident( incident_id: UUID, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), session: Session = Depends(get_session), ): tenant_id = authenticated_entity.tenant_id incident_bl = IncidentBl(tenant_id, session, pusher_client) 
incident_bl.delete_incident(incident_id) return Response(status_code=202) @router.post( "/{incident_id}/split", description="Split incident by incident id", response_model=SplitIncidentResponseDto, ) async def split_incident( incident_id: UUID, command: SplitIncidentRequestDto, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), session: Session = Depends(get_session), ) -> SplitIncidentResponseDto: tenant_id = authenticated_entity.tenant_id logger.info( "Splitting incident", extra={ "incident_id": incident_id, "tenant_id": tenant_id, "alert_fingerprints": command.alert_fingerprints, }, ) incident_bl = IncidentBl(tenant_id, session, pusher_client) await incident_bl.add_alerts_to_incident( incident_id=command.destination_incident_id, alert_fingerprints=command.alert_fingerprints, ) incident_bl.delete_alerts_from_incident( incident_id=incident_id, alert_fingerprints=command.alert_fingerprints ) return SplitIncidentResponseDto( destination_incident_id=command.destination_incident_id, moved_alert_fingerprints=command.alert_fingerprints, ) @router.post( "/merge", description="Merge incidents", response_model=MergeIncidentsResponseDto ) def merge_incidents( command: MergeIncidentsRequestDto, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), ) -> MergeIncidentsResponseDto: tenant_id = authenticated_entity.tenant_id logger.info( "Merging incidents", extra={ "source_incident_ids": command.source_incident_ids, "destination_incident_id": command.destination_incident_id, "tenant_id": tenant_id, }, ) try: merged_ids, failed_ids = merge_incidents_to_id( tenant_id, command.source_incident_ids, command.destination_incident_id, authenticated_entity.email, ) if not merged_ids: message = "No incidents merged" else: message = f"{pluralize(len(merged_ids), 'incident')} merged into 
{command.destination_incident_id} successfully" if failed_ids: message += f", {pluralize(len(failed_ids), 'incident')} failed to merge" raise HTTPException(f"Some incidents failed to merge. {message}") return MergeIncidentsResponseDto( merged_incident_ids=merged_ids, failed_incident_ids=failed_ids, destination_incident_id=command.destination_incident_id, message=message, ) except DestinationIncidentNotFound as e: raise HTTPException(status_code=400, detail=str(e)) @router.get( "/{incident_id}/alerts", description="Get incident alerts by incident incident id", ) def get_incident_alerts( incident_id: UUID, limit: int = 25, offset: int = 0, include_unlinked: bool = False, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:incidents"]) ), ) -> AlertWithIncidentLinkMetadataPaginatedResultsDto: tenant_id = authenticated_entity.tenant_id logger.info( "Fetching incident", extra={ "incident_id": incident_id, "tenant_id": tenant_id, }, ) incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident not found") logger.info( "Fetching incident's alert", extra={ "incident_id": incident_id, "tenant_id": tenant_id, }, ) db_alerts_and_links, total_count = get_incident_alerts_and_links_by_incident_id( tenant_id=tenant_id, incident_id=incident_id, limit=limit, offset=offset, include_unlinked=include_unlinked, ) enriched_alerts_dto = convert_db_alerts_to_dto_alerts(db_alerts_and_links) logger.info( "Fetched alerts from DB", extra={ "tenant_id": tenant_id, }, ) return AlertWithIncidentLinkMetadataPaginatedResultsDto( limit=limit, offset=offset, count=total_count, items=enriched_alerts_dto ) @router.get( "/{incident_id}/future_incidents", description="Get same incidents linked to this one", ) def get_future_incidents_for_an_incident( incident_id: str, limit: int = 25, offset: int = 0, authenticated_entity: AuthenticatedEntity = Depends( 
IdentityManagerFactory.get_auth_verifier(["read:incidents"])
    ),
) -> IncidentsPaginatedResultsDto:
    """
    Return one page of the "future incidents" linked to the given incident.

    Raises:
        HTTPException: 404 when the incident does not exist for the tenant.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Fetching incident",
        extra={
            "incident_id": incident_id,
            "tenant_id": tenant_id,
        },
    )
    incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id)
    if not incident:
        raise HTTPException(status_code=404, detail="Incident not found")

    logger.info(
        "Fetching future incidents from",
        extra={
            "incident_id": incident_id,
            "tenant_id": tenant_id,
        },
    )
    # NOTE(review): this lookup is not passed tenant_id; tenancy is only
    # checked by the get_incident_by_id call above.
    db_incidents, total_count = get_future_incidents_by_incident_id(
        limit=limit,
        offset=offset,
        incident_id=incident_id,
    )
    future_incidents = [
        IncidentDto.from_db_incident(incident) for incident in db_incidents
    ]
    logger.info(
        "Fetched future incidents from DB",
        extra={
            "incident_id": incident_id,
            "tenant_id": tenant_id,
        },
    )

    return IncidentsPaginatedResultsDto(
        limit=limit, offset=offset, count=total_count, items=future_incidents
    )


@router.get(
    "/{incident_id}/workflows",
    description="Get incident workflows by incident id",
)
def get_incident_workflows(
    incident_id: UUID,
    limit: int = 25,
    offset: int = 0,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:incidents"])
    ),
) -> WorkflowExecutionsPaginatedResultsDto:
    """
    Get all workflows associated with an incident.
    This covers workflows associated with the incident itself as well as
    workflows associated with the incident's alerts.

    Returns:
        WorkflowExecutionsPaginatedResultsDto: one page of workflow executions
        together with the total count.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Fetching incident's workflows",
        extra={"incident_id": incident_id, "tenant_id": tenant_id},
    )
    workflow_executions, total_count = get_workflow_executions_for_incident_or_alert(
        tenant_id=tenant_id,
        incident_id=str(incident_id),
        limit=limit,
        offset=offset,
    )

    # Each returned row is expanded into the DTO through its `_mapping`.
    workflow_execution_dtos = [
        WorkflowExecutionDTO(**we._mapping) for we in workflow_executions
    ]

    paginated_workflow_execution_dtos = WorkflowExecutionsPaginatedResultsDto(
        limit=limit, offset=offset, count=total_count, items=workflow_execution_dtos
    )

    return paginated_workflow_execution_dtos


@router.post(
    "/{incident_id}/alerts",
    description="Add alerts to incident",
    status_code=202,
    response_model=List[AlertDto],
)
async def add_alerts_to_incident(
    incident_id: UUID,
    alert_fingerprints: List[str],
    is_created_by_ai: bool = False,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    pusher_client: Pusher | None = Depends(get_pusher_client),
    session: Session = Depends(get_session),
):
    """
    Attach the alerts with the given fingerprints to an incident.

    Returns:
        Response: an empty 202 response.
    """
    # NOTE(review): the route declares response_model=List[AlertDto], but the
    # handler returns a bare Response with no body.
    tenant_id = authenticated_entity.tenant_id
    incident_bl = IncidentBl(tenant_id, session, pusher_client)
    await incident_bl.add_alerts_to_incident(
        incident_id, alert_fingerprints, is_created_by_ai
    )
    return Response(status_code=202)


@router.delete(
    "/{incident_id}/alerts",
    description="Delete alerts from incident",
    status_code=202,
    response_model=List[AlertDto],
)
def delete_alerts_from_incident(
    incident_id: UUID,
    fingerprints: List[str],
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    session=Depends(get_session),
    pusher_client: Pusher | None = Depends(get_pusher_client),
):
    """
    Detach the alerts with the given fingerprints from an incident.

    Returns:
        Response: an empty 202 response.
    """
    # NOTE(review): the route declares response_model=List[AlertDto], but the
    # handler returns a bare Response with no body.
    tenant_id = authenticated_entity.tenant_id
    incident_bl = IncidentBl(tenant_id, session, pusher_client)
    incident_bl.delete_alerts_from_incident(
        incident_id=incident_id, alert_fingerprints=fingerprints
    )
    return Response(status_code=202)


@router.post(
    "/event/{provider_type}",
description="Receive an alert event from a provider",
    status_code=202,
)
async def receive_event(
    provider_type: str,
    bg_tasks: BackgroundTasks,
    request: Request,
    provider_id: str | None = None,
    event=Depends(extract_generic_body),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
) -> dict[str, str]:
    """
    Accept a provider event, format it as an incident and schedule processing.

    The formatted event is enqueued on the ARQ queue when ``REDIS`` is truthy;
    otherwise it is handed to FastAPI background tasks.

    Raises:
        HTTPException: 400 when no provider class is found for ``provider_type``.

    Returns:
        An empty 202 ``Response``.
    """
    # NOTE(review): the annotation says dict[str, str], but the handler returns
    # a Response object.
    trace_id = request.state.trace_id
    logger.info(
        "Received event",
        extra={
            "trace_id": trace_id,
            "tenant_id": authenticated_entity.tenant_id,
            "provider_type": provider_type,
            "provider_id": provider_id,
        },
    )

    provider_class = None
    try:
        provider_class = ProvidersFactory.get_provider_class(provider_type)
    except ModuleNotFoundError:
        raise HTTPException(
            status_code=400, detail=f"Provider {provider_type} not found"
        )
    if not provider_class:
        raise HTTPException(
            status_code=400, detail=f"Provider {provider_type} not found"
        )

    # Parse the raw body
    event = provider_class.format_incident(
        event, authenticated_entity.tenant_id, provider_type, provider_id
    )

    if REDIS:
        redis: ArqRedis = await get_pool()
        job = await redis.enqueue_job(
            "async_process_incident",
            authenticated_entity.tenant_id,
            provider_id,
            provider_type,
            event,
            trace_id,
            _queue_name=KEEP_ARQ_QUEUE_BASIC,
        )
        logger.info(
            "Enqueued job",
            extra={
                "job_id": job.job_id,
                "tenant_id": authenticated_entity.tenant_id,
                "queue": KEEP_ARQ_QUEUE_BASIC,
            },
        )
    else:
        logger.info("Processing incident in the background")
        # The empty dict is passed as process_incident's first argument; the
        # ARQ path above has no explicit counterpart for it.
        bg_tasks.add_task(
            process_incident,
            {},
            authenticated_entity.tenant_id,
            provider_id,
            provider_type,
            event,
            trace_id,
        )
    return Response(status_code=202)


@router.post("/{incident_id}/assign", description="Assign incident to user")
def assign_incident(
    incident_id: UUID,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    session: Session = Depends(get_session),
):
    """
    Assign the incident to the calling user and record an audit entry.

    Raises:
        HTTPException: 404 when the incident does not exist for the tenant.
    """
    logger.info(
        "Assigning incident to user",
        extra={"incident_id": incident_id, "assignee": authenticated_entity.email},
    )
    # The incident is loaded through the request session so the assignee
    # change below is persisted by session.commit().
    incident = get_incident_by_id(
        authenticated_entity.tenant_id, incident_id, session=session
    )
    if not incident:
        raise HTTPException(status_code=404, detail="Incident not found")
    incident.assignee = authenticated_entity.email
    add_audit(
        authenticated_entity.tenant_id,
        str(incident_id),
        authenticated_entity.email,
        ActionType.INCIDENT_ASSIGN,
        f"Incident self-assigned to {authenticated_entity.email}",
    )
    session.commit()
    return Response(status_code=202)


@router.post(
    "/{incident_id}/status",
    description="Change incident status",
    response_model=IncidentDto,
)
def change_incident_status(
    incident_id: UUID,
    change: IncidentStatusChangeDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    session: Session = Depends(get_session),
) -> IncidentDto:
    """Change an incident's status via ``IncidentBl.change_status`` and return the updated DTO."""
    tenant_id = authenticated_entity.tenant_id
    incident_bl = IncidentBl(tenant_id, session)

    new_incident_dto = incident_bl.change_status(
        incident_id, change.status, authenticated_entity
    )

    return new_incident_dto


@router.post(
    "/{incident_id}/severity",
    description="Change incident severity",
    response_model=IncidentDto,
)
def change_incident_severity(
    incident_id: UUID,
    change: IncidentSeverityChangeDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:incident"])
    ),
    session: Session = Depends(get_session),
    pusher_client: Pusher | None = Depends(get_pusher_client),
) -> IncidentDto:
    """Change an incident's severity via ``IncidentBl.update_severity`` and return the updated DTO."""
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Changing the severity of an incident",
        extra={
            "incident_id": incident_id,
            "tenant_id": tenant_id,
            "severity": change.severity.value,
        },
    )
    incident_bl = IncidentBl(
        tenant_id, session, pusher_client, user=authenticated_entity.email
    )
    incident_dto = incident_bl.update_severity(
        incident_id, change.severity, change.comment
    )
    return incident_dto


@router.post("/{incident_id}/comment", description="Add incident audit activity")
def add_comment(
    incident_id: UUID,
    change: IncidentStatusChangeDto,
authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher = Depends(get_pusher_client), session: Session = Depends(get_session), ) -> AlertAudit: extra = { "tenant_id": authenticated_entity.tenant_id, "commenter": authenticated_entity.email, "comment": change.comment, "incident_id": str(incident_id), "tagged_users": change.tagged_users, } logger.info("Adding comment to incident", extra=extra) comment = add_audit( authenticated_entity.tenant_id, str(incident_id), authenticated_entity.email, ActionType.INCIDENT_COMMENT, change.comment, session=session, commit=False, ) if change.tagged_users: for user_email in change.tagged_users: mention = CommentMention( comment_id=comment.id, mentioned_user_id=user_email, tenant_id=authenticated_entity.tenant_id, ) session.add(mention) session.commit() session.refresh(comment) if pusher_client: pusher_client.trigger( f"private-{authenticated_entity.tenant_id}", "incident-comment", {} ) logger.info("Added comment to incident", extra=extra) return comment @router.post( "/ai/suggest", description="Create incident with AI", response_model=IncidentsClusteringSuggestion, status_code=202, ) async def create_with_ai( alerts_fingerprints: List[str], authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), session: Session = Depends(get_session), ) -> IncidentsClusteringSuggestion: tenant_id = authenticated_entity.tenant_id # Get alerts data alerts = get_last_alerts(tenant_id, fingerprints=alerts_fingerprints) alerts_dto = convert_db_alerts_to_dto_alerts(alerts) # Get topology data topology_data = TopologiesService.get_all_topology_data(tenant_id, session) # Create suggestions using AI suggestion_bl = AISuggestionBl(tenant_id, session) return suggestion_bl.suggest_incidents( alerts_dto=alerts_dto, topology_data=topology_data, user_id=authenticated_entity.email, ) @router.post( 
"/ai/{suggestion_id}/commit", description="Commit incidents with AI and user feedback", response_model=List[IncidentDto], status_code=202, ) async def commit_with_ai( suggestion_id: UUID, incidents_with_feedback: List[IncidentCommit], authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), session: Session = Depends(get_session), pusher_client: Pusher | None = Depends(get_pusher_client), ) -> List[IncidentDto]: tenant_id = authenticated_entity.tenant_id # Create business logic instances ai_feedback_bl = AISuggestionBl(tenant_id, session) incident_bl = IncidentBl(tenant_id, session, pusher_client) # Commit incidents with feedback committed_incidents = await ai_feedback_bl.commit_incidents( suggestion_id=suggestion_id, incidents_with_feedback=[ incident.dict() for incident in incidents_with_feedback ], user_id=authenticated_entity.email, incident_bl=incident_bl, ) # Notify about changes if pusher client is available if pusher_client: try: pusher_client.trigger( f"private-{tenant_id}", "incident-change", {}, ) except Exception as e: logger.error(f"Failed to notify client: {str(e)}") return committed_incidents @router.post( "/{incident_id}/confirm", description="Confirm predicted incident by id", response_model=IncidentDto, ) def confirm_incident( incident_id: UUID, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), ) -> IncidentDto: tenant_id = authenticated_entity.tenant_id logger.info( "Fetching incident", extra={ "incident_id": incident_id, "tenant_id": tenant_id, }, ) incident = confirm_predicted_incident_by_id(tenant_id, incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident candidate not found") new_incident_dto = IncidentDto.from_db_incident(incident) return new_incident_dto @router.post( "/{incident_id}/enrich", description="Enrich incident with additional data", status_code=202, ) async def 
enrich_incident( incident_id: UUID, enrichment: EnrichIncidentRequestBody, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), db_session: Session = Depends(get_session), ) -> Response: """Enrich incident with additional data.""" tenant_id = authenticated_entity.tenant_id # Get incident to verify it exists incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident not found") # Use the existing enrichment infrastructure enrichment_bl = EnrichmentsBl(tenant_id, db_session) enrichment_bl.enrich_entity( fingerprint=incident_id, enrichments=enrichment.enrichments, action_type=ActionType.INCIDENT_ENRICH, action_callee=authenticated_entity.email, action_description=f"Incident enriched by {authenticated_entity.email}", force=enrichment.force, ) # Notify clients if pusher is available if pusher_client: try: pusher_client.trigger( f"private-{tenant_id}", "incident-change", {}, ) except Exception as e: logger.exception( "Failed to notify clients about incident change", extra={"error": str(e)}, ) return Response(status_code=202) @router.post( "/{incident_id}/unenrich", description="Unenrich incident additional data", status_code=202, ) async def unenrich_incident( incident_id: UUID, enrichment: UnEnrichIncidentRequestBody, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:incident"]) ), pusher_client: Pusher | None = Depends(get_pusher_client), ) -> Response: """Unenrich incident additional data.""" tenant_id = authenticated_entity.tenant_id # Get incident to verify it exists incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident not found") enrichments_object = get_enrichment(tenant_id, enrichment.fingerprint) if not 
enrichments_object:
        raise HTTPException(status_code=404, detail="Enrichment not found")

    enrichments = enrichments_object.enrichments

    # Keep every stored key that the caller did not ask to remove.
    new_enrichments = {
        key: value
        for key, value in enrichments.items()
        if key not in enrichment.enrichments
    }

    # Use the existing enrichment infrastructure
    # NOTE(review): the path's incident_id is only used for the existence
    # check; the fingerprint written here comes from the request body.
    # Confirm callers always send the incident's own fingerprint.
    enrichment_bl = EnrichmentsBl(tenant_id)
    enrichment_bl.enrich_entity(
        fingerprint=enrichment.fingerprint,
        enrichments=new_enrichments,
        action_type=ActionType.INCIDENT_UNENRICH,
        action_callee=authenticated_entity.email,
        action_description=f"Incident un-enriched by {authenticated_entity.email}",
        force=True,
    )

    # Notify clients if pusher is available
    if pusher_client:
        try:
            pusher_client.trigger(
                f"private-{tenant_id}",
                "incident-change",
                {},
            )
        except Exception as e:
            # Best effort: a failed notification must not fail the request.
            logger.exception(
                "Failed to notify clients about incident change",
                extra={"error": str(e)},
            )

    return Response(status_code=202)


================================================
FILE: keep/api/routes/maintenance.py
================================================
from datetime import timedelta

from fastapi import APIRouter, Depends, HTTPException
from sqlmodel import Session

from keep.api.core.db import get_session
from keep.api.models.db.maintenance_window import (
    MaintenanceRuleCreate,
    MaintenanceRuleRead,
    MaintenanceWindowRule,
)
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory

router = APIRouter()


@router.get(
    "",
    response_model=list[MaintenanceRuleRead],
    description="Get all maintenance rules",
)
def get_maintenance_rules(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:maintenance"])
    ),
    session: Session = Depends(get_session),
) -> list[MaintenanceRuleRead]:
    """Return every maintenance window rule that belongs to the caller's tenant."""
    rules = (
        session.query(MaintenanceWindowRule)
        .filter(MaintenanceWindowRule.tenant_id == authenticated_entity.tenant_id)
        .all()
    )
    return [MaintenanceRuleRead(**rule.dict()) for rule in rules]


@router.post(
    "",
response_model=MaintenanceRuleRead, description="Create a new maintenance rule"
)
def create_maintenance_rule(
    rule_dto: MaintenanceRuleCreate,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:maintenance"])
    ),
    session: Session = Depends(get_session),
) -> MaintenanceRuleRead:
    """Create a maintenance window rule for the caller's tenant.

    ``end_time`` is derived as ``start_time + duration_seconds``; the creator
    is recorded from the authenticated entity's email.
    """
    end_time = rule_dto.start_time + timedelta(seconds=rule_dto.duration_seconds)
    new_rule = MaintenanceWindowRule(
        **rule_dto.dict(),
        end_time=end_time,
        created_by=authenticated_entity.email,
        tenant_id=authenticated_entity.tenant_id,
    )
    session.add(new_rule)
    session.commit()
    # Reload so database-populated columns are present in the response.
    session.refresh(new_rule)
    return MaintenanceRuleRead(**new_rule.dict())


@router.put(
    "/{rule_id}",
    response_model=MaintenanceRuleRead,
    description="Update an existing maintenance rule",
)
def update_maintenance_rule(
    rule_id: int,
    rule_dto: MaintenanceRuleCreate,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:maintenance"])
    ),
    session: Session = Depends(get_session),
) -> MaintenanceRuleRead:
    """Overwrite an existing maintenance rule with the values of ``rule_dto``.

    Raises:
        HTTPException: 404 when no rule with ``rule_id`` exists for the
            caller's tenant.
    """
    rule: MaintenanceWindowRule = (
        session.query(MaintenanceWindowRule)
        .filter(
            MaintenanceWindowRule.tenant_id == authenticated_entity.tenant_id,
            MaintenanceWindowRule.id == rule_id,
        )
        .first()
    )
    if not rule:
        raise HTTPException(
            status_code=404, detail="Maintenance rule not found or access denied"
        )

    # Copy every DTO field onto the stored rule.
    for key, value in rule_dto.dict().items():
        setattr(rule, key, value)

    # end_time is not part of the DTO, so recompute it from the new values.
    end_time = rule_dto.start_time + timedelta(seconds=rule_dto.duration_seconds)
    rule.end_time = end_time
    session.commit()
    session.refresh(rule)
    return MaintenanceRuleRead(**rule.dict())


@router.delete("/{rule_id}", description="Delete a maintenance rule")
def delete_maintenance_rule(
    rule_id: int,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:maintenance"])
    ),
    session: Session = Depends(get_session),
):
    """Delete a maintenance rule of the caller's tenant; 404 when it is not found."""
    rule = (
        session.query(MaintenanceWindowRule)
        .filter(
            MaintenanceWindowRule.tenant_id ==
authenticated_entity.tenant_id, MaintenanceWindowRule.id == rule_id, ) .first() ) if not rule: raise HTTPException( status_code=404, detail="Maintenance rule not found or access denied" ) session.delete(rule) session.commit() return {"detail": "Maintenance rule deleted successfully"} ================================================ FILE: keep/api/routes/mapping.py ================================================ import datetime import logging from uuid import UUID from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import JSONResponse from sqlmodel import Session from keep.api.bl.enrichments_bl import EnrichmentsBl from keep.api.core.db import get_session from keep.api.models.db.enrichment_event import EnrichmentEventWithLogs from keep.api.models.db.mapping import ( MappingRule, MappingRuleDtoIn, MappingRuleDtoOut, MappingRuleUpdateDtoIn, ) from keep.api.models.db.topology import TopologyService from keep.api.utils.pagination import EnrichmentEventPaginatedResultsDto from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory router = APIRouter() logger = logging.getLogger(__name__) @router.get("", description="Get all mapping rules", response_model_exclude=["rows"]) def get_rules( authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:rules"]) ), session: Session = Depends(get_session), ) -> list[MappingRuleDtoOut]: logger.info("Getting mapping rules") # @tb: get the model without all the rows becuase it might be heavy rules: list[MappingRule] = ( session.query(MappingRule) .filter(MappingRule.tenant_id == authenticated_entity.tenant_id) .all() ) logger.info("Got mapping rules", extra={"rules_count": len(rules) if rules else 0}) rules_dtos = [] if rules: for rule in rules: rule_dto = MappingRuleDtoOut(**rule.model_dump()) attributes = [] if rule_dto.type == "csv": # @tb: when we get the model 
without the rows, we have to save the attributes when creating the rule. attributes = [ key for key in rule.rows[0].keys() if not any(key in matcher for matcher in rule.matchers) ] elif rule_dto.type == "topology": attributes = [ field for field in TopologyService.__fields__ if field not in rule.matchers and field != "tenant_id" and field != "id" ] rule_dto.attributes = attributes rules_dtos.append(rule_dto) return rules_dtos @router.get("/{rule_id}", description="Get a mapping rule by id") def get_rule( rule_id: int, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:rules"]) ), session: Session = Depends(get_session), ) -> MappingRuleDtoOut: logger.info("Getting mapping rule by id", extra={"rule_id": rule_id}) rule = ( session.query(MappingRule) .filter( MappingRule.tenant_id == authenticated_entity.tenant_id, MappingRule.id == rule_id, ) .first() ) if rule is None: raise HTTPException(status_code=404, detail="Rule not found") return MappingRuleDtoOut(**rule.model_dump()) @router.post( "", description="Create a new mapping rule", response_model_exclude={"rows", "tenant_id"}, ) def create_rule( rule: MappingRuleDtoIn, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:rules"]) ), session: Session = Depends(get_session), ) -> MappingRule: logger.info("Creating a new mapping rule") new_rule = MappingRule( **rule.dict(), tenant_id=authenticated_entity.tenant_id, created_by=authenticated_entity.email, ) if not new_rule.name or not new_rule.matchers: raise HTTPException( status_code=400, detail="Rule name and matchers are required" ) session.add(new_rule) session.commit() session.refresh(new_rule) logger.info("Created a new mapping rule", extra={"rule_id": new_rule.id}) return new_rule @router.delete("/{rule_id}", description="Delete a mapping rule") def delete_rule( rule_id: int, authenticated_entity: AuthenticatedEntity = Depends( 
IdentityManagerFactory.get_auth_verifier(["write:rules"]) ), session: Session = Depends(get_session), ): logger.info("Deleting a mapping rule", extra={"rule_id": rule_id}) rule = ( session.query(MappingRule) .filter(MappingRule.id == rule_id) .filter(MappingRule.tenant_id == authenticated_entity.tenant_id) .first() ) if rule is None: raise HTTPException(status_code=404, detail="Rule not found") session.delete(rule) session.commit() logger.info("Deleted a mapping rule", extra={"rule_id": rule_id}) return {"message": "Rule deleted successfully"} @router.put("/{rule_id}", description="Update an existing rule") def update_rule( rule_id: int, rule: MappingRuleUpdateDtoIn, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:rules"]) ), session: Session = Depends(get_session), ) -> MappingRuleDtoOut: logger.info("Updating a mapping rule") existing_rule: MappingRule = ( session.query(MappingRule) .filter( MappingRule.tenant_id == authenticated_entity.tenant_id, MappingRule.id == rule_id, ) .first() ) if existing_rule is None: raise HTTPException(status_code=404, detail="Rule not found") existing_rule.name = rule.name existing_rule.description = rule.description existing_rule.matchers = rule.matchers existing_rule.file_name = rule.file_name existing_rule.priority = rule.priority existing_rule.updated_by = authenticated_entity.email existing_rule.last_updated_at = datetime.datetime.now(tz=datetime.timezone.utc) if rule.rows is not None: existing_rule.rows = rule.rows session.commit() session.refresh(existing_rule) response = MappingRuleDtoOut(**existing_rule.dict()) if rule.rows is not None: response.attributes = [ key for key in existing_rule.rows[0].keys() if key not in rule.matchers ] return response # todo: we can make it generic for all enrichment events, not only mapping @router.get("/{rule_id}/executions", description="Get all executions for a rule") def get_enrichment_events( rule_id: int, limit: int = Query(20), 
offset: int = Query(0),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:rules"])
    ),
) -> EnrichmentEventPaginatedResultsDto:
    """Return one page of enrichment events for a rule plus the total count."""
    logger.info(
        "Getting enrichment events",
        extra={
            "rule_id": rule_id,
            "limit": limit,
            "offset": offset,
            "tenant_id": authenticated_entity.tenant_id,
        },
    )
    enrichment_bl = EnrichmentsBl(tenant_id=authenticated_entity.tenant_id)
    events = enrichment_bl.get_enrichment_events(rule_id, limit, offset)
    # The total is queried separately so the client can paginate.
    total_count = enrichment_bl.get_total_enrichment_events(rule_id)
    logger.info(
        "Got enrichment events",
        extra={"events_count": len(events)},
    )
    return EnrichmentEventPaginatedResultsDto(
        count=total_count,
        items=events,
        limit=limit,
        offset=offset,
    )


@router.get(
    "/{rule_id}/executions/{enrichment_event_id}",
    description="Get an execution for a rule",
)
def get_enrichment_event_logs(
    rule_id: int,
    enrichment_event_id: UUID,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:rules"])
    ),
) -> EnrichmentEventWithLogs:
    """Return an enrichment event together with its logs.

    Raises:
        HTTPException: 404 when no logs exist for the event. The event itself
            is not checked before it is put into the response.
    """
    logger.info(
        "Getting enrichment event logs",
        extra={
            "rule_id": rule_id,
            "enrichment_event_id": enrichment_event_id,
            "tenant_id": authenticated_entity.tenant_id,
        },
    )
    enrichment_bl = EnrichmentsBl(tenant_id=authenticated_entity.tenant_id)
    enrichment_event = enrichment_bl.get_enrichment_event(enrichment_event_id)
    logs = enrichment_bl.get_enrichment_event_logs(enrichment_event_id)
    if not logs:
        raise HTTPException(status_code=404, detail="Logs not found")
    logger.info(
        "Got enrichment event logs",
        extra={"logs_count": len(logs)},
    )
    return EnrichmentEventWithLogs(
        enrichment_event=enrichment_event,
        logs=logs,
    )


@router.post(
    "/{rule_id}/execute/{alert_id}",
    description="Execute a mapping rule against an alert",
    responses={
        200: {"description": "Mapping rule executed successfully"},
        400: {"description": "Mapping rule failed to execute"},
        404: {"description": "Mapping rule or alert not found"},
        403: {"description": "User does not have permission to
execute mapping rule"},
    },
)
def execute_rule(
    rule_id: int,
    alert_id: UUID,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:rules"])
    ),
):
    """Run one mapping rule against one alert and return the enrichment event id.

    The outcome of the run is only logged: this function itself answers with
    status 200 and the ``enrichment_event_id`` whether or not the rule
    enriched the alert.
    """
    logger.info(
        "Executing a mapping rule against an alert",
        extra={
            "rule_id": rule_id,
            "alert_id": alert_id,
            "tenant_id": authenticated_entity.tenant_id,
        },
    )
    enrichment_bl = EnrichmentsBl(tenant_id=authenticated_entity.tenant_id)
    enriched = enrichment_bl.run_mapping_rule_by_id(rule_id, alert_id)
    if enriched:
        logger.info(
            "Mapping rule executed successfully",
            extra={"rule_id": rule_id, "alert_id": alert_id},
        )
    else:
        logger.error(
            "Mapping rule failed to execute",
            extra={"rule_id": rule_id, "alert_id": alert_id},
        )
    return JSONResponse(
        status_code=200,
        content={"enrichment_event_id": str(enrichment_bl.enrichment_event_id)},
    )


================================================
FILE: keep/api/routes/metrics.py
================================================
from typing import List

import chevron
from fastapi import APIRouter, Depends, Query, Request, Response
from fastapi.responses import JSONResponse
from prometheus_client import (
    CONTENT_TYPE_LATEST,
    CollectorRegistry,
    generate_latest,
    multiprocess,
)

from keep.api.core.config import config
from keep.api.core.db import (
    get_last_alerts_for_incidents,
    get_last_incidents,
    get_workflow_executions_count,
)
from keep.api.core.limiter import limiter
from keep.api.models.alert import AlertDto
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory

router = APIRouter()

# Rebinds the CONTENT_TYPE_LATEST name imported from prometheus_client above.
CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
# When set, the /processing endpoint is registered without an auth dependency.
NO_AUTH_METRICS = config("KEEP_NO_AUTH_METRICS", default=False, cast=bool)

if NO_AUTH_METRICS:

    @router.get("/processing", include_in_schema=False)
    async def get_processing_metrics(
        request: Request,
    ):
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        metrics =
generate_latest(registry) return Response(content=metrics, media_type=CONTENT_TYPE_LATEST) else: @router.get("/processing", include_in_schema=False) async def get_processing_metrics( request: Request, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:metrics"]) ), ): registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) metrics = generate_latest(registry) return Response(content=metrics, media_type=CONTENT_TYPE_LATEST) @router.get("") def get_metrics( labels: List[str] = Query(None), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:metrics"]) ), ): """ This endpoint is used by Prometheus to scrape such metrics from the application: - alerts_total {incident_name, incident_id} - The total number of alerts per incident. - open_incidents_total - The total number of open incidents. - workflows_executions_total {status} - The total number of workflow executions. Please note that those metrics are per-tenant and are not designed to be used for the monitoring of the application itself. Example prometheus configuration: ``` scrape_configs: - job_name: "scrape_keep" scrape_interval: 5m # It's important to scrape not too often to avoid rate limiting. static_configs: - targets: ["https://api.keephq.dev"] # Or your own domain. authorization: type: Bearer credentials: "{Your API Key}" # Optional, you can add labels to exported incidents. # Label values will be equal to the last incident's alert payload value matching the label. # Attention! Don't add "flaky" labels which could change from alert to alert within the same incident. # Good labels: ['labels.department', 'labels.team'], bad labels: ['labels.severity', 'labels.pod_id'] # Check Keep -> Feed -> "extraPayload" column, it will help in writing labels. params: labels: ['labels.service', 'labels.queue'] # Will resuld as: "labels_service" and "labels_queue". 
``` """ # We don't use im-memory metrics countrs here which is typical for prometheus exporters, # they would make us expose our app's pod id's. This is a customer-facing endpoint # we're deploying to SaaS, and we want to hide our internal infra. tenant_id = authenticated_entity.tenant_id export = str() # Exporting alerts per incidents export += "# HELP alerts_total The total number of alerts per incident.\n" export += "# TYPE alerts_total counter\n" incidents, incidents_total = get_last_incidents( tenant_id=tenant_id, limit=1000, is_candidate=False, ) last_alerts_for_incidents = get_last_alerts_for_incidents( [incident.id for incident in incidents] ) for incident in incidents: incident_name = ( incident.user_generated_name if incident.user_generated_name else incident.ai_generated_name ) extra_labels = "" try: last_alert = last_alerts_for_incidents[str(incident.id)][0] last_alert_dto = AlertDto(**last_alert.event) except IndexError: last_alert_dto = None if labels is not None: for label in labels: label_value = chevron.render("{{ " + label + " }}", last_alert_dto) label = label.replace(".", "_") extra_labels += f',{label}="{label_value}"' export += f'alerts_total{{incident_name="{incident_name}",incident_id="{incident.id}"{extra_labels}}} {incident.alerts_count}\n' # Exporting stats about open incidents export += "\n\n" export += "# HELP open_incidents_total The total number of open incidents.\r\n" export += "# TYPE open_incidents_total counter\n" export += f"open_incidents_total {incidents_total}\n" workflow_execution_counts = get_workflow_executions_count( tenant_id=tenant_id, ) export += "\n\n" export += "# HELP workflows_executions_total The total number of workflows.\r\n" export += "# TYPE workflows_executions_total counter\n" export += f"workflows_executions_total {{status=\"success\"}} {workflow_execution_counts['success']}\n" export += f"workflows_executions_total {{status=\"other\"}} {workflow_execution_counts['other']}\n" return Response(content=export, 
media_type=CONTENT_TYPE_LATEST)


@router.get("/dumb", include_in_schema=False)
@limiter.limit(config("KEEP_LIMIT_CONCURRENCY", default="10/minute", cast=str))
async def get_dumb(request: Request) -> JSONResponse:
    """
    This endpoint is used to test the rate limiting.

    Args:
        request (Request): The request object.

    Returns:
        JSONResponse: A JSON response with the message "hello world" ({"hello": "world"}).
    """
    # await asyncio.sleep(5)
    return JSONResponse(content={"hello": "world"})


================================================
FILE: keep/api/routes/preset.py
================================================
import logging
import os
import uuid
from datetime import datetime

from fastapi import (
    APIRouter,
    BackgroundTasks,
    Depends,
    HTTPException,
    Request,
    Response,
)
from pydantic import BaseModel
from sqlmodel import Session, select

from keep.api.consts import PROVIDER_PULL_INTERVAL_MINUTE, STATIC_PRESETS
from keep.api.core.db import get_db_preset_by_name
from keep.api.core.db import get_presets as get_presets_db
from keep.api.core.db import (
    get_session,
    update_preset_options,
    update_provider_last_pull_time,
)
from keep.api.models.alert import AlertDto
from keep.api.models.db.preset import (
    Preset,
    PresetDto,
    PresetOption,
    PresetTagLink,
    Tag,
    TagDto,
)
from keep.api.models.time_stamp import TimeStampFilter, _get_time_stamp_filter
from keep.api.tasks.process_event_task import process_event
from keep.api.tasks.process_incident_task import process_incident
from keep.api.tasks.process_topology_task import process_topology
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory
from keep.providers.base.base_provider import BaseIncidentProvider, BaseTopologyProvider
from keep.providers.providers_factory import ProvidersFactory
from keep.searchengine.searchengine import SearchEngine

router = APIRouter()
logger = logging.getLogger(__name__)


# SHAHAR: this function runs as background tasks as a separate thread
# DO NOT ADD async HERE as it will run in the main thread and block the whole server
def pull_data_from_providers(
    tenant_id: str,
    trace_id: str,
) -> list[AlertDto]:
    """
    Pulls alerts from providers and records them to the DB.
    "Get or create logics".

    For every installed provider with pulling enabled that was not pulled
    within the last PROVIDER_PULL_INTERVAL_MINUTE minutes, this pulls alerts,
    then incidents (BaseIncidentProvider only) and topology
    (BaseTopologyProvider only), and finally processes the pulled alerts.
    An error in one provider is logged and does not stop the others.

    NOTE(review): despite the return annotation, every path of this function
    returns None (bare ``return`` when disabled, no return at the end).
    """
    # Pulling can be switched off globally through the environment.
    if os.environ.get("KEEP_PULL_DATA_ENABLED", "true") != "true":
        logger.debug("Pull data from providers is disabled")
        return

    providers = ProvidersFactory.get_installed_providers(
        tenant_id=tenant_id, include_details=False
    )

    logger.info(
        "Pulling data from providers",
        extra={
            "tenant_id": tenant_id,
            "trace_id": trace_id,
            "providers_len": len(providers),
        },
    )

    for provider in providers:
        extra = {
            "provider_type": provider.type,
            "provider_id": provider.id,
            "tenant_id": tenant_id,
            "trace_id": trace_id,
        }

        if not provider.pulling_enabled:
            logger.debug("Pulling is disabled for this provider", extra=extra)
            continue

        # Throttle: skip providers that were pulled recently.
        if provider.last_pull_time is not None:
            now = datetime.now()
            minutes_passed = (now - provider.last_pull_time).total_seconds() / 60
            if minutes_passed <= PROVIDER_PULL_INTERVAL_MINUTE:
                logger.info(
                    "Skipping provider data pulling since not enough time has passed",
                    extra={
                        **extra,
                        "minutes_passed": minutes_passed,
                        "provider_last_pull_time": str(provider.last_pull_time),
                    },
                )
                continue

        try:
            logger.info(
                f"Pulling alerts from provider {provider.type} ({provider.id})",
                extra=extra,
            )
            # Even if we failed at processing some event, lets save the last pull time to not iterate this process over and over again.
            update_provider_last_pull_time(tenant_id=tenant_id, provider_id=provider.id)

            provider_class = ProvidersFactory.get_installed_provider(
                tenant_id=tenant_id,
                provider_id=provider.id,
                provider_type=provider.type,
            )
            sorted_provider_alerts_by_fingerprint = (
                provider_class.get_alerts_by_fingerprint(tenant_id=tenant_id)
            )
            logger.info(
                f"Pulling alerts from provider {provider.type} ({provider.id}) completed",
                extra=extra,
            )

            # TODO: this should be moved somewhere else (@tb: too much logic in this function, will handle it another time.)
            if isinstance(provider_class, BaseIncidentProvider):
                try:
                    incidents = provider_class.get_incidents()
                    process_incident(
                        {},
                        tenant_id=tenant_id,
                        provider_id=provider.id,
                        provider_type=provider.type,
                        incidents=incidents,
                        trace_id=trace_id,
                    )
                except NotImplementedError:
                    logger.debug(
                        f"Provider {provider.type} ({provider.id}) does not implement pulling incidents",
                        extra=extra,
                    )
                except Exception:
                    # Incident failures must not prevent topology/alert processing below.
                    logger.exception(
                        f"Unknown error pulling incidents from provider {provider.type} ({provider.id})",
                        extra={**extra, "trace_id": trace_id},
                    )
            else:
                logger.debug(
                    f"Provider {provider.type} ({provider.id}) does not implement pulling incidents",
                    extra=extra,
                )

            try:
                if isinstance(provider_class, BaseTopologyProvider):
                    logger.info("Pulling topology data", extra=extra)
                    topology_data, _ = provider_class.pull_topology()
                    logger.info(
                        "Pulling topology data finished, processing",
                        extra={**extra, "topology_length": len(topology_data)},
                    )
                    process_topology(
                        tenant_id, topology_data, provider.id, provider.type
                    )
                    logger.info("Finished processing topology data", extra=extra)
            except NotImplementedError:
                logger.debug(
                    f"Provider {provider.type} ({provider.id}) does not implement pulling topology data",
                    extra=extra,
                )
            except Exception as e:
                logger.exception(
                    f"Unknown error pulling topology from provider {provider.type} ({provider.id})",
                    extra={**extra, "exception": str(e)},
                )

            # Process the pulled alerts, one call per fingerprint.
            for fingerprint, alert in sorted_provider_alerts_by_fingerprint.items():
                process_event(
                    {},
                    tenant_id,
                    provider.type,
                    provider.id,
                    fingerprint,
                    None,
                    trace_id,
                    alert,
                    notify_client=False,
                )
        except Exception as e:
            # Keep going with the next provider.
            logger.exception(
                f"Unknown error pulling from provider {provider.type} ({provider.id})",
                extra={**extra, "exception": str(e)},
            )
    logger.info(
        "Pulling data from providers completed",
        extra={
            "tenant_id": tenant_id,
            "trace_id": trace_id,
            "providers_len": len(providers),
        },
    )


@router.get(
    "",
    description="Get all presets for tenant",
)
def get_presets(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:preset"])
    ),
    session: Session = Depends(get_session),
    time_stamp: TimeStampFilter = Depends(_get_time_stamp_filter),
) -> list[PresetDto]:
    """Return the presets visible to the caller.

    The static "feed" preset is appended only when the identity manager
    reports no preset restrictions for the caller.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(f"Getting all presets {time_stamp}")

    # get all preset ids that the user has access to
    identity_manager = IdentityManagerFactory.get_identity_manager(
        authenticated_entity.tenant_id
    )
    # Note: if no limitations (allowed_preset_ids is []), then all presets are allowed
    allowed_preset_ids = identity_manager.get_user_permission_on_resource_type(
        resource_type="preset",
        authenticated_entity=authenticated_entity,
    )
    # both global and private presets
    presets = get_presets_db(
        tenant_id=tenant_id,
        email=authenticated_entity.email,
        preset_ids=allowed_preset_ids,
    )
    presets_dto = [PresetDto(**preset.to_dict()) for preset in presets]
    # add static presets (unless allowed_preset_ids is set)
    if not allowed_preset_ids:
        presets_dto.append(STATIC_PRESETS["feed"])
    logger.info("Got all presets")

    return presets_dto


# Request body shared by the create and update preset endpoints.
class CreateOrUpdatePresetDto(BaseModel):
    name: str | None
    options: list[PresetOption]
    is_private: bool = False  # if true visible to all users of that tenant
    is_noisy: bool = False  # if true, the preset will be noisy
    tags: list[TagDto] = []  # tags to assign to the preset
    counter_shows_firing_only: bool = (
        True  # if true, the counter will show only firing alerts
    )


@router.post("", description="Create a preset for tenant")
@router.delete(
    "/{preset_id}",
    description="Delete a preset for tenant",
)
def delete_preset(
    preset_id: uuid.UUID,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["delete:presets"])
    ),
    session: Session = Depends(get_session),
):
    """Delete a preset (and its tag links) that belongs to the caller's tenant.

    Raises:
        HTTPException: 404 when no preset with ``preset_id`` exists for the tenant.

    Returns:
        An empty dict on success.
    """
    log_context = {"uuid": preset_id}
    logger.info("Deleting preset", extra=log_context)

    # Remove the preset <-> tag link rows first.
    # NOTE(review): this bulk delete is issued before the tenant ownership check
    # below and is only persisted by the commit on the success path; presumably
    # the session is rolled back when the 404 is raised - confirm in get_session.
    session.query(PresetTagLink).filter(PresetTagLink.preset_id == preset_id).delete()

    owner_tenant = authenticated_entity.tenant_id
    preset = session.exec(
        select(Preset).where(
            Preset.tenant_id == owner_tenant,
            Preset.id == preset_id,
        )
    ).first()
    if preset is None:
        raise HTTPException(404, "Preset not found")

    session.delete(preset)
    session.commit()
    logger.info("Deleted preset", extra=log_context)
    return {}
@router.get(
    "/{preset_name}/alerts",
    description="Get the alerts of a preset",
)
def get_preset_alerts(
    request: Request,
    bg_tasks: BackgroundTasks,
    preset_name: str,
    response: Response,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:presets"])
    ),
) -> list:
    """Return the alerts matching the query of the preset named ``preset_name``.

    The preset is taken from ``STATIC_PRESETS`` when the name is a static one,
    otherwise it is looked up in the database for the caller's tenant. The
    search mode used by the search engine is reported back in the
    ``X-Search-Type`` response header.

    Raises:
        HTTPException: 404 when the preset does not exist, 403 when the caller
            is limited to specific presets and this one is not among them.
    """
    tenant_id = authenticated_entity.tenant_id

    # Gathering alerts may take a while and we don't care whether it finishes
    # before the response is returned. In the worst case, gathered alerts will
    # be picked up by the next request.
    bg_tasks.add_task(
        pull_data_from_providers,
        tenant_id,
        request.state.trace_id,
    )

    logger.info(
        "Getting preset alerts",
        extra={"preset_name": preset_name, "tenant_id": tenant_id},
    )

    # Static presets win over DB presets with the same name.
    preset = (
        STATIC_PRESETS[preset_name]
        if preset_name in STATIC_PRESETS
        else get_db_preset_by_name(tenant_id, preset_name)
    )
    if not preset:
        raise HTTPException(404, "Preset not found")

    # DB models expose to_dict(); anything else is serialized through .dict().
    preset_fields = preset.to_dict() if isinstance(preset, Preset) else preset.dict()
    preset_dto = PresetDto(**preset_fields)

    # Which preset ids is this user restricted to?
    # Note: an empty list means "no limitations", i.e. all presets are allowed.
    allowed_preset_ids = IdentityManagerFactory.get_identity_manager(
        tenant_id
    ).get_user_permission_on_resource_type(
        resource_type="preset",
        authenticated_entity=authenticated_entity,
    )
    is_restricted = bool(allowed_preset_ids)
    if is_restricted and str(preset_dto.id) not in allowed_preset_ids:
        raise HTTPException(403, "Not authorized to access this preset")

    engine = SearchEngine(tenant_id=tenant_id)
    alerts = engine.search_alerts(preset_dto.query)
    logger.info("Got preset alerts", extra={"preset_name": preset_name})

    response.headers["X-Search-Type"] = str(engine.search_mode.value)
    return alerts
@router.delete(
    "/{preset_id}/tab/{tab_id}",
    description="Delete a tab from a preset",
)
def delete_tab(
    preset_id: uuid.UUID,
    tab_id: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["delete:presets"])
    ),
    session: Session = Depends(get_session),
):
    """Remove the tab with id ``tab_id`` from a preset's "tabs" option.

    Tabs whose id differs from ``tab_id`` are kept; if no tab matches, the
    options are written back unchanged.

    Raises:
        HTTPException: 404 when the preset does not exist for the tenant, or
            when the preset has no "tabs" option at all.

    Returns:
        A ``PresetDto`` built from the preset returned by ``update_preset_options``.
    """
    logger.info("Deleting tab", extra={"tab_id": tab_id})

    preset = session.exec(
        select(Preset).where(
            Preset.tenant_id == authenticated_entity.tenant_id,
            Preset.id == preset_id,
        )
    ).first()
    if preset is None:
        raise HTTPException(404, "Preset not found")

    # Locate the first option labelled "tabs" (case-insensitive).
    tabs_option = next(
        (
            candidate
            for candidate in preset.options
            if candidate.get("label", "").lower() == "tabs"
        ),
        None,
    )
    if tabs_option is None:
        raise HTTPException(404, "Tabs not found")

    # Keep every tab except the one being deleted, and store the result back
    # on that same option.
    tabs_option["value"] = [
        existing_tab
        for existing_tab in tabs_option.get("value", [])
        if existing_tab.get("id") != tab_id
    ]

    preset = update_preset_options(
        authenticated_entity.tenant_id, preset_id, preset.options
    )
    logger.info("Deleted tab", extra={"tab_id": tab_id})
    return PresetDto(**preset.to_dict())
@router.put(
    "/{preset_id}/column-config",
    description="Update column configuration for a preset",
)
def update_preset_column_config(
    preset_id: uuid.UUID,
    body: ColumnConfigurationDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:presets"])
    ),
    session: Session = Depends(get_session),
) -> PresetDto:
    """Replace the column-configuration options stored on a preset.

    All existing options whose label is one of the column-configuration labels
    are dropped; every non-empty field of ``body`` is then appended as a new
    ``{"label": ..., "value": ...}`` option. Other options are left untouched.

    Raises:
        HTTPException: 404 when the preset does not exist for the tenant.

    Returns:
        The refreshed preset as a ``PresetDto``.
    """
    log_context = {"preset_id": preset_id}
    logger.info("Updating preset column configuration", extra=log_context)

    preset = session.exec(
        select(Preset).where(
            Preset.tenant_id == authenticated_entity.tenant_id,
            Preset.id == preset_id,
        )
    ).first()
    if preset is None:
        raise HTTPException(404, "Preset not found")

    # Labels double as the attribute names on the request body; the order here
    # is the order in which the options are appended.
    column_labels = (
        "column_visibility",
        "column_order",
        "column_rename_mapping",
        "column_time_formats",
        "column_list_formats",
    )

    # Start from the options that are not column configuration...
    new_options = [
        existing
        for existing in preset.options
        if existing.get("label", "").lower() not in column_labels
    ]
    # ...and add back only the settings that were actually supplied (non-empty).
    for label in column_labels:
        setting = getattr(body, label)
        if setting:
            new_options.append({"label": label, "value": setting})

    preset.options = new_options
    session.commit()
    session.refresh(preset)

    logger.info("Updated preset column configuration", extra=log_context)
    return PresetDto(**preset.to_dict())
@router.get("/{image_name}")
async def get_provider_image(
    image_name: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:providers"])
    ),
    session: Session = Depends(get_session),
):
    """Get a provider image.

    A custom image stored for the tenant under ``<image_name>-icon.png`` is
    returned when one exists. Otherwise the default image is served from
    ``DEFAULT_IMAGE_PATH``, or from ``/unknown-icon.png`` when that path does
    not exist.

    Raises:
        HTTPException: 404 when neither the default nor the fallback image
            file can be found.
    """
    tenant_id = authenticated_entity.tenant_id
    full_image_name = f"{image_name}-icon.png"

    # Try to get custom image from DB
    provider_image = session.exec(
        select(ProviderImage)
        .where(ProviderImage.tenant_id == tenant_id)
        .where(ProviderImage.image_name == full_image_name)
    ).first()

    if provider_image:
        return Response(content=provider_image.image_blob, media_type="image/png")

    # Return default image if no custom image found
    try:
        path = DEFAULT_IMAGE_PATH
        if not os.path.exists(path):
            fallback_path = "/unknown-icon.png"
            logger.warning(
                f"Default image not found at {DEFAULT_IMAGE_PATH}, using fallback path: {fallback_path}"
            )
            path = fallback_path
        # Open the resolved path (not DEFAULT_IMAGE_PATH), so that the fallback
        # selected above is actually used.
        with open(path, "rb") as f:
            return Response(content=f.read(), media_type="image/png")
    except FileNotFoundError:
        raise HTTPException(404, "Default image not found")
def _is_localhost():
    """Tell whether the configured ``KEEP_API_URL`` looks like a local address.

    Returns:
        True when the URL contains "localhost", "127.0.0" or "0.0.0.0",
        False otherwise.
    """
    # TODO - there are more "advanced" cases that we don't catch here
    #        e.g. IP's that are not public but not localhost
    #        the more robust way is to try access KEEP_API_URL from another tool
    #        (such as wtfismy.com but the opposite)
    #
    # this is a temporary solution until we have a better one
    api_url = config("KEEP_API_URL")
    local_markers = (
        "localhost",
        "127.0.0",
        # default on localhost if no USE_NGROK
        "0.0.0.0",
    )
    return any(marker in api_url for marker in local_markers)
@router.get("/{provider_id}/logs", description="Get provider logs")
def get_provider_logs(
    provider_id: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:providers"])
    ),
):
    """Return the logs of a provider as a JSON response.

    Raises:
        HTTPException: re-raised unchanged when ``ProvidersService`` raises
            one; any other exception is logged and converted to a 500 whose
            detail is the exception text.
    """
    tenant_id = authenticated_entity.tenant_id
    log_context = {"tenant_id": tenant_id, "provider_id": provider_id}
    logger.info("Getting provider logs", extra=log_context)

    try:
        provider_logs = ProvidersService.get_provider_logs(tenant_id, provider_id)
        encoded_logs = jsonable_encoder(provider_logs)
        return JSONResponse(content=encoded_logs, status_code=200)
    except HTTPException:
        # HTTP errors already carry the right status code; pass them through.
        raise
    except Exception as e:
        logger.error(
            f"Error getting provider logs: {str(e)}",
            extra=log_context,
        )
        raise HTTPException(status_code=500, detail=str(e))
@router.get(
    "/{provider_type}/{provider_id}/configured-alerts",
    description="Get alerts configuration from a provider",
)
def get_alerts_configuration(
    provider_type: str,
    provider_id: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:providers"])
    ),
) -> list:
    """Fetch the alerts configuration from an installed provider.

    The provider configuration is read from the secret manager under the key
    ``<tenant_id>_<provider_type>_<provider_id>``, a provider instance is
    built from it, and the result of its ``get_alerts_configuration()`` is
    returned.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Getting provider alerts",
        extra={
            "tenant_id": tenant_id,
            "provider_type": provider_type,
            "provider_id": provider_id,
        },
    )

    context_manager = ContextManager(tenant_id=tenant_id)
    secret_name = f"{tenant_id}_{provider_type}_{provider_id}"
    stored_config = SecretManagerFactory.get_secret_manager(
        context_manager
    ).read_secret(secret_name, is_json=True)

    return ProvidersFactory.get_provider(
        context_manager, provider_id, provider_type, stored_config
    ).get_alerts_configuration()
@router.get(
    "/{provider_type}/{provider_id}/alerts/count",
    description="Get number of alerts a specific provider has received (in a specific time time period or ever)",
)
def get_alert_count(
    provider_type: str,
    provider_id: str,
    ever: bool,
    start_time: Optional[datetime.datetime] = None,
    end_time: Optional[datetime.datetime] = None,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:alert"])
    ),
):
    """Count the alerts received from a specific provider.

    Args:
        provider_type: Type of the provider.
        provider_id: Id of the provider.
        ever: When True, ``start_time`` and ``end_time`` are not required.
            When False, both must be supplied.
        start_time: Start of the time window (required unless ``ever``).
        end_time: End of the time window (required unless ``ever``).

    Raises:
        HTTPException: 400 when ``ever`` is False and either bound of the
            time window is missing.

    Returns:
        A ``ProviderAlertsCountResponseDTO`` holding the count.
    """
    tenant_id = authenticated_entity.tenant_id
    if ever is False and (start_time is None or end_time is None):
        # Must be raised, not returned: a returned HTTPException is treated as
        # an ordinary response value and never reaches the client as a 400.
        raise HTTPException(
            status_code=400, detail="Missing start_time and/or end_time"
        )
    return ProviderAlertsCountResponseDTO(
        count=count_alerts(
            provider_type=provider_type,
            provider_id=provider_id,
            ever=ever,
            start_time=start_time,
            end_time=end_time,
            tenant_id=tenant_id,
        ),
    )
provider_type, "provider_id": provider_id, }, ) context_manager = ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) provider_config = secret_manager.read_secret( f"{tenant_id}_{provider_type}_{provider_id}", is_json=True ) provider = ProvidersFactory.get_provider( context_manager, provider_id, provider_type, provider_config ) try: provider.deploy_alert(alert, alert_id) return JSONResponse(status_code=200, content={"message": "deployed"}) except Exception as e: return JSONResponse(status_code=500, content=e.args[0]) @router.post( "/test", description="Test a provider's alert retrieval", ) def test_provider( provider_info: dict = Body(...), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:providers"]) ), ) -> JSONResponse: # Extract parameters from the provider_info dictionary # For now, we support only 1:1 provider_type:provider_id # In the future, we might want to support multiple providers of the same type tenant_id = authenticated_entity.tenant_id provider_id = provider_info.pop("provider_id") provider_type = provider_info.pop("provider_type", None) or provider_id logger.info( "Testing provider", extra={ "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) provider_config = { "authentication": provider_info, } # TODO: valdiations: # 1. provider_type and provider id is valid # 2. 
@router.post(
    "/{provider_id}/scopes",
    description="Validate provider scopes",
    status_code=200,
    response_model=dict[str, bool | str],
)
def validate_provider_scopes(
    provider_id: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:providers"])
    ),
    session: Session = Depends(get_session),
):
    """Re-validate the scopes of an installed provider and persist any change.

    The provider row is loaded for the caller's tenant, its configuration is
    read from the secret manager using the row's ``configuration_key``, and
    the provider instance's ``validate_scopes()`` result is stored on the row
    when it differs from the previously stored value.

    Raises:
        HTTPException: 404 when no provider with ``provider_id`` exists for
            the tenant.

    Returns:
        The validated scopes as returned by the provider instance.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info("Validating provider scopes", extra={"provider_id": provider_id})

    # first() yields None for a missing row, which lets the 404 below fire.
    # one() would raise NoResultFound instead and make that branch unreachable.
    provider = session.exec(
        select(Provider).where(
            (Provider.tenant_id == tenant_id) & (Provider.id == provider_id)
        )
    ).first()

    if not provider:
        raise HTTPException(404, detail="Provider not found")

    context_manager = ContextManager(tenant_id=tenant_id)
    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    provider_config = secret_manager.read_secret(
        provider.configuration_key, is_json=True
    )
    provider_instance = ProvidersFactory.get_provider(
        context_manager, provider_id, provider.type, provider_config
    )
    validated_scopes = provider_instance.validate_scopes()

    # Only write to the DB when the validation result actually changed.
    if validated_scopes != provider.validatedScopes:
        provider.validatedScopes = validated_scopes
        session.commit()

    logger.info(
        "Validated provider scopes",
        extra={"provider_id": provider_id, "validated_scopes": validated_scopes},
    )
    return validated_scopes
IdentityManagerFactory.get_auth_verifier(["write:providers"]) ), ): tenant_id = authenticated_entity.tenant_id installed_by = authenticated_entity.email try: provider_info = await request.json() except Exception: form_data = await request.form() provider_info = dict(form_data) if not provider_info: raise HTTPException(status_code=400, detail="No valid data provided") try: provider_id = provider_info.pop("provider_id") provider_name = provider_info.pop("provider_name") provider_type = provider_info.pop("provider_type", None) or provider_id pulling_enabled = provider_info.pop("pulling_enabled", True) except KeyError as e: raise HTTPException( status_code=400, detail=f"Missing required field: {e.args[0]}" ) for key, value in provider_info.items(): if isinstance(value, UploadFile): provider_info[key] = value.file.read().decode() try: result = ProvidersService.install_provider( tenant_id, installed_by, provider_id, provider_name, provider_type, provider_info, pulling_enabled=pulling_enabled, ) return JSONResponse(status_code=200, content=result) except HTTPException as e: if e.status_code == 412: logger.error( "Failed to validate mandatory provider scopes, returning 412", extra={ "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) raise except Exception as e: logger.exception( "Failed to install provider", extra={ "provider_id": provider_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) return JSONResponse(status_code=400, content={"message": str(e)}) @router.post( "/install/oauth2/{provider_type}", description="Install provider via oauth2." 
) async def install_provider_oauth2( provider_type: str, provider_info: dict = Body(...), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:providers"]) ), session: Session = Depends(get_session), ): tenant_id = authenticated_entity.tenant_id installed_by = authenticated_entity.email provider_unique_id = uuid.uuid4().hex logger.info( "Installing provider", extra={ "provider_id": provider_unique_id, "provider_type": provider_type, "tenant_id": tenant_id, }, ) try: provider_class = ProvidersFactory.get_provider_class(provider_type) install_webhook = provider_info.pop("install_webhook", "true") == "true" pulling_enabled = provider_info.pop("pulling_enabled", "true") == "true" provider_info = provider_class.oauth2_logic(**provider_info) provider_name = provider_info.pop( "provider_name", f"{provider_unique_id}-oauth2" ) provider_name = provider_name.lower().replace(" ", "").replace("_", "-") provider_config = { "authentication": provider_info, "name": provider_name, } # Instantiate the provider object and perform installation process context_manager = ContextManager(tenant_id=tenant_id) provider = ProvidersFactory.get_provider( context_manager, provider_unique_id, provider_type, provider_config ) validated_scopes = ProvidersService.validate_scopes(provider) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) secret_name = f"{tenant_id}_{provider_type}_{provider_unique_id}" secret_manager.write_secret( secret_name=secret_name, secret_value=json.dumps(provider_config), ) # add the provider to the db provider = Provider( id=provider_unique_id, tenant_id=tenant_id, name=provider_name, type=provider_type, installed_by=installed_by, installation_time=time.time(), configuration_key=secret_name, validatedScopes=validated_scopes, pulling_enabled=pulling_enabled, ) session.add(provider) session.commit() if install_webhook: install_provider_webhook( provider_type, provider.id, authenticated_entity, session 
def _get_provider(tenant_id: str, provider_id: str, session: Session):
    """
    Build a provider instance for ``provider_id``.

    Ids starting with ``default-`` are treated as default providers: the type
    is the second dash-separated segment of the id and the provider is created
    with an empty authentication config. Any other id is looked up in the
    database for the tenant and its configuration is read from the secret
    manager.

    Returns:
        The object returned by ``ProvidersFactory.get_provider``.

    Raises:
        HTTPException: 400 when a default provider id carries no type,
            404 when no provider row matches the tenant and id.
    """
    context_manager = ContextManager(tenant_id=tenant_id)
    if provider_id.startswith("default-"):
        try:
            provider_type = provider_id.split("-")[1]
            # "default-" splits into ["default", ""], so a missing type shows
            # up as an empty string rather than an IndexError.
            if not provider_type:
                # NOTE(review): the message looks truncated (a placeholder
                # after "default-" may be missing) - confirm intended text.
                raise HTTPException(
                    400,
                    detail="Default provider must be in the format default-",
                )
            return ProvidersFactory.get_provider(
                context_manager,
                provider_id,
                provider_type,
                {"authentication": {}},  # default providers shouldn't have auth config
            )
        except IndexError:
            # Kept so an IndexError raised while building the provider is still
            # reported as a bad request, as before.
            raise HTTPException(
                400,
                detail="Default provider must be in the format default-",
            )

    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    try:
        # Try to get provider from database
        provider = session.exec(
            select(Provider).where(
                (Provider.tenant_id == tenant_id) & (Provider.id == provider_id)
            )
        ).one()
        provider_config = secret_manager.read_secret(
            provider.configuration_key, is_json=True
        )
        return ProvidersFactory.get_provider(
            context_manager, provider.id, provider.type, provider_config
        )
    except NoResultFound as e:
        raise HTTPException(404, detail="Provider not found") from e
# Webhook related endpoints
@router.post(
    "/install/webhook/{provider_type}/{provider_id}",
    description="Install webhook for a provider.",
)
def install_provider_webhook(
    provider_type: str,
    provider_id: str,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:providers"])
    ),
    session: Session = Depends(get_session),
):
    """
    Install the webhook of a provider for the caller's tenant.

    Delegates to ``ProvidersService.install_webhook`` and maps its result to
    a response: 200 when it reports success, 400 otherwise.
    """
    installed = ProvidersService.install_webhook(
        authenticated_entity.tenant_id, provider_type, provider_id, session
    )
    if not installed:
        return JSONResponse(
            status_code=400, content={"message": "provider does not support webhook"}
        )
    return JSONResponse(status_code=200, content={"message": "webhook installed"})
@router.post("/healthcheck", description="Run healthcheck on a provider")
async def healthcheck_provider(
    request: Request,
) -> Dict[str, Any]:
    """
    Run a health report for a provider described entirely by the request.

    The body is parsed as JSON, falling back to form data. ``provider_id`` is
    mandatory; ``provider_type`` defaults to the id when absent or empty.
    Remaining fields (with uploaded files decoded to text) are passed to
    ``ProvidersService.prepare_provider`` and the prepared provider's
    ``get_health_report()`` result is returned.

    Raises:
        HTTPException: 400 when the payload is empty or ``provider_id`` is
        missing.
    """
    try:
        provider_info = await request.json()
    except Exception:
        provider_info = dict(await request.form())

    if not provider_info:
        raise HTTPException(status_code=400, detail="No valid data provided")

    # Only the mandatory field can raise KeyError; the type has a fallback.
    try:
        provider_id = provider_info.pop("provider_id")
    except KeyError as missing:
        raise HTTPException(
            status_code=400, detail=f"Missing required field: {missing.args[0]}"
        )
    provider_type = provider_info.pop("provider_type", None) or provider_id
    provider_name = f"{provider_type} healthcheck"

    uploaded_fields = [
        field for field, item in provider_info.items() if isinstance(item, UploadFile)
    ]
    for field in uploaded_fields:
        provider_info[field] = provider_info[field].file.read().decode()

    provider = ProvidersService.prepare_provider(
        provider_id,
        provider_name,
        provider_type,
        provider_info,
    )
    return provider.get_health_report()
@router.post("/auth", status_code=200)
def pusher_authentication(
    channel_name=Form(...),
    socket_id=Form(...),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:pusher"])
    ),
    pusher_client: Pusher = Depends(get_pusher_client),
) -> dict:
    """
    Authenticate a socket for the caller's private tenant channel.

    Args:
        channel_name: Channel the client wants to join (form field).
        socket_id: Socket identifier supplied by the client (form field).
        authenticated_entity: Verified caller; provides the tenant id.
        pusher_client: Pusher client dependency; may be falsy when
            unavailable.

    Raises:
        HTTPException: 500 when no Pusher client is available, 403 when the
            channel is not ``private-<tenant_id>`` of the caller.

    Returns:
        dict: The result of ``pusher_client.authenticate``.
    """
    if not pusher_client:
        raise HTTPException(
            status_code=500,
            detail="Pusher client not initalized on backend, PUSHER_DISABLED is set to True?",
        )

    # Only the tenant's own private channel may be authenticated.
    allowed_channel = f"private-{authenticated_entity.tenant_id}"
    if channel_name != allowed_channel:
        raise HTTPException(status_code=403, detail="Forbidden")

    return pusher_client.authenticate(channel=channel_name, socket_id=socket_id)
@router.post(
    "",
    description="Create Rule",
)
async def create_rule(
    rule_create_request: RuleCreateDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:rules"])
    ),
):
    """
    Validate a rule creation request and persist it via ``create_rule_db``.

    Required values are checked in a fixed order and the first missing one is
    reported with a 400. ``params`` may be an empty dict, which is accepted.

    Returns:
        Whatever ``create_rule_db`` returns for the new rule.
    """
    tenant_id = authenticated_entity.tenant_id
    created_by = authenticated_entity.email
    logger.info("Creating rule")

    dto = rule_create_request
    sql = dto.sqlQuery.get("sql")
    params = dto.sqlQuery.get("params")

    # (value, error message) pairs, evaluated top to bottom.
    required = (
        (sql, "SQL is required"),
        # params can be {} for example on '(( source is not null ))'
        (params or params == {}, "Params are required"),
        (dto.celQuery, "CEL is required"),
        (dto.ruleName, "Rule name is required"),
        (dto.timeframeInSeconds, "Timeframe is required"),
        (dto.timeUnit, "Timeunit is required"),
        (dto.resolveOn, "resolveOn is required"),
        (dto.createOn, "createOn is required"),
        (dto.threshold, "threshold is required"),
    )
    for value, message in required:
        if not value:
            raise HTTPException(status_code=400, detail=message)

    rule = create_rule_db(
        tenant_id=tenant_id,
        name=dto.ruleName,
        definition={
            "sql": sql,
            "params": params,
        },
        timeframe=dto.timeframeInSeconds,
        timeunit=dto.timeUnit,
        definition_cel=dto.celQuery,
        created_by=created_by,
        grouping_criteria=dto.groupingCriteria,
        group_description=dto.groupDescription,
        require_approve=dto.requireApprove,
        resolve_on=dto.resolveOn,
        create_on=dto.createOn,
        incident_name_template=dto.incidentNameTemplate,
        incident_prefix=dto.incidentPrefix,
        multi_level=dto.multiLevel,
        multi_level_property_name=dto.multiLevelPropertyName,
        threshold=dto.threshold,
        assignee=dto.assignee,
    )
    logger.info("Rule created")
    return rule
@router.put(
    "/{rule_id}",
    description="Update Rule",
)
async def update_rule(
    rule_id: str,
    request: Request,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["update:rules"])
    ),
):
    """
    Update an existing rule from a JSON request body.

    The body must contain ``ruleName``, ``sqlQuery`` (an object with ``sql``
    and ``params``), ``celQuery``, ``timeframeInSeconds``, ``timeUnit``,
    ``resolveOn`` and ``createOn``; the remaining fields are optional.

    Returns:
        The value returned by ``update_rule_db`` when it is truthy.

    Raises:
        HTTPException: 400 when the body is malformed or a required value is
            empty, 404 when ``update_rule_db`` returns a falsy result.
    """
    tenant_id = authenticated_entity.tenant_id
    updated_by = authenticated_entity.email
    logger.info(f"Updating rule {rule_id}")
    try:
        body = await request.json()
        rule_name = body["ruleName"]
        sql_query = body["sqlQuery"]
        cel_query = body["celQuery"]
        timeframe = body["timeframeInSeconds"]
        timeunit = body["timeUnit"]
        resolve_on = body["resolveOn"]
        create_on = body["createOn"]
        grouping_criteria = body.get("groupingCriteria", [])
        # Boolean flag: default to False (matches RuleCreateDto.requireApprove).
        require_approve = body.get("requireApprove", False)
        incident_template_name = body.get("incidentNameTemplate", None)
        incident_prefix = body.get("incidentPrefix", None)
        multi_level = body.get("multiLevel", False)
        multi_level_property_name = body.get("multiLevelPropertyName", None)
        threshold = body.get("threshold", 1)
        assignee = body.get("assignee", None)
        # Read inside the try so a non-object sqlQuery is reported as a bad
        # request instead of surfacing as an unhandled AttributeError.
        sql = sql_query.get("sql")
        params = sql_query.get("params")
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid request body")

    if not sql:
        raise HTTPException(status_code=400, detail="SQL is required")

    if (
        not params and not params == {}
    ):  # params can be {} for example on '(( source is not null ))'
        raise HTTPException(status_code=400, detail="Params are required")

    if not cel_query:
        raise HTTPException(status_code=400, detail="CEL is required")

    if not rule_name:
        raise HTTPException(status_code=400, detail="Rule name is required")

    if not timeframe:
        raise HTTPException(status_code=400, detail="Timeframe is required")

    if not timeunit:
        raise HTTPException(status_code=400, detail="Timeunit is required")

    if not resolve_on:
        raise HTTPException(status_code=400, detail="resolveOn is required")

    if not create_on:
        raise HTTPException(status_code=400, detail="createOn is required")

    if not threshold:
        raise HTTPException(status_code=400, detail="threshold is required")

    rule = update_rule_db(
        tenant_id=tenant_id,
        rule_id=rule_id,
        name=rule_name,
        definition={
            "sql": sql,
            "params": params,
        },
        timeframe=timeframe,
        timeunit=timeunit,
        definition_cel=cel_query,
        updated_by=updated_by,
        grouping_criteria=grouping_criteria,
        require_approve=require_approve,
        resolve_on=resolve_on,
        create_on=create_on,
        incident_name_template=incident_template_name,
        incident_prefix=incident_prefix,
        multi_level=multi_level,
        multi_level_property_name=multi_level_property_name,
        threshold=threshold,
        assignee=assignee,
    )

    if rule:
        logger.info(f"Rule {rule_id} updated")
        return rule
    else:
        logger.info(f"Rule {rule_id} not found")
        raise HTTPException(status_code=404, detail="Rule not found")
@router.post("/smtp", description="Install or update SMTP settings")
async def update_smtp_settings(
    smtp_settings: SMTPSettings = Body(...),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:settings"])
    ),
):
    """
    Store the tenant's SMTP settings in the secret manager.

    The settings are serialized to JSON and written under the secret name
    ``<tenant_id>_smtp``. The password is unwrapped to its plain value first
    so it can be JSON-encoded.

    Returns:
        dict: A status message confirming the update.
    """
    tenant_id = authenticated_entity.tenant_id
    context_manager = ContextManager(tenant_id=tenant_id)
    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)

    # Save the SMTP settings in the secret manager
    settings_payload = smtp_settings.dict()
    # NOTE(review): the SMTP test path treats the password as optional, so a
    # missing password is tolerated here too - confirm against SMTPSettings.
    password = settings_payload.get("password")
    if password is not None:
        settings_payload["password"] = password.get_secret_value()

    secret_manager.write_secret(
        secret_name=f"{tenant_id}_smtp", secret_value=json.dumps(settings_payload)
    )
    return {"status": "SMTP settings updated successfully"}
SMTPSettings = Body(...), authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:settings"]) ), ): # Logic to test SMTP settings, perhaps by sending a test email # You would use the provided SMTP settings to try and send an email success, message, logs = test_smtp_connection(smtp_settings) if success: return JSONResponse(status_code=200, content={"message": message, "logs": logs}) else: return JSONResponse(status_code=400, content={"message": message, "logs": logs}) def test_smtp_connection(settings: SMTPSettings) -> Tuple[bool, str, str]: # Capture the SMTP session output log_stream = io.StringIO() try: # A patched version of smtplib.SMTP that captures the SMTP session output logger.info("Testing SMTP") server = PatchedSMTP( settings.host, settings.port, timeout=10, log_stream=log_stream ) if settings.secure: logger.info("Configuring TLS") server.starttls() if settings.username and settings.password: logger.info("Configuring user and pass") server.login(settings.username, settings.password.get_secret_value()) # Create an HTML test email html_content = """

SMTP Settings Test

This is a test email from Keep to verify your SMTP settings.

✓ Success! Your SMTP settings are configured correctly.

SMTP Server {}
Port {}
Security {}
class PatchedSMTP(smtplib.SMTP):
    """
    ``smtplib.SMTP`` variant that records the debug trace of the session.

    Debug output is always enabled (``debuglevel = 1``). When a ``log_stream``
    is supplied, every debug line is written to it instead of being printed
    by the base class.
    """

    debuglevel = 1

    def __init__(
        self,
        host="",
        port=0,
        local_hostname=None,
        timeout=...,
        source_address=None,
        log_stream=None,
    ):
        """
        Args:
            host, port, local_hostname, source_address: Forwarded unchanged to
                ``smtplib.SMTP``.
            timeout: Socket timeout in seconds. The ``...`` default means
                "not specified" and is translated to the default used by
                ``smtplib.SMTP`` itself.
            log_stream: Optional text stream with a ``write`` method that
                receives the debug trace.
        """
        # Local import: only needed to resolve the "not specified" sentinel.
        import socket

        # Ellipsis is not a valid socket timeout; passing it through would
        # fail as soon as a connection is attempted.
        if timeout is ...:
            timeout = socket._GLOBAL_DEFAULT_TIMEOUT

        # Must be set before the base initializer runs, since connecting
        # (when a host is given) already emits debug output.
        self.log_stream = log_stream
        super().__init__(host, port, local_hostname, timeout, source_address)

    def _print_debug(self, *args):
        """Write one debug line to ``log_stream``, or defer to the base class."""
        if self.log_stream is not None:
            # Write debug info to the StringIO stream
            self.log_stream.write(" ".join(str(arg) for arg in args) + "\n")
        else:
            super()._print_debug(*args)
@router.get("/apikeys", description="Get API keys")
def get_keys(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:settings"])
    ),
    session: Session = Depends(get_session),
):
    """
    List the API keys available to the caller.

    Keys are fetched with ``get_api_keys`` using the caller's tenant, email
    and resolved role; a non-empty result is then passed through
    ``get_api_keys_secret``.

    Returns:
        dict: ``{"apiKeys": <keys>}``.
    """
    tenant_id = authenticated_entity.tenant_id
    caller_role = get_role_by_role_name(authenticated_entity.role)
    logger.info(f"Getting active API keys for tenant {tenant_id}")

    keys = get_api_keys(
        session=session,
        tenant_id=tenant_id,
        email=authenticated_entity.email,
        role=caller_role,
    )
    # Secrets are only resolved when there is at least one key.
    keys = get_api_keys_secret(tenant_id=tenant_id, api_keys=keys) if keys else keys

    logger.info(
        f"Active API keys for tenant {tenant_id} retrieved successfully",
    )
    return {"apiKeys": keys}
({unique_api_key_id}) secret updated") return {"message": "API key secret updated", "apiKey": api_key} else: logger.info(f"Api key ({unique_api_key_id}) not found") raise HTTPException( status_code=404, detail=f"API key ({unique_api_key_id}) not found" ) @router.delete("/apikey/{keyId}", description="Delete API key") def delete_api_key( keyId: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:settings"]) ), session: Session = Depends(get_session), ): logger.info(f"Deleting api key ({keyId})") tenant_id = authenticated_entity.tenant_id api_key = get_api_key( session, unique_api_key_id=keyId, tenant_id=authenticated_entity.tenant_id ) if api_key and api_key.is_deleted is False: try: context_manager = ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) secret_manager.delete_secret( secret_name=f"{tenant_id}-{api_key.reference_id}", ) except Exception as e: raise HTTPException( status_code=500, detail=f"Unable to deactivate Api key ({keyId}) secret. 
@router.get("/sso")
async def get_sso_settings(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:settings"])
    ),
):
    """
    Report whether SSO is supported for the caller's tenant.

    Returns:
        dict: ``{"sso": False}`` when the identity manager does not support
        SSO; otherwise ``sso`` is True and the dict also carries the SSO
        ``providers`` and the ``wizardUrl`` obtained from the identity manager.
    """
    tenant_id = authenticated_entity.tenant_id
    identity_manager = IdentityManagerFactory.get_identity_manager(
        tenant_id=tenant_id,
        context_manager=ContextManager(tenant_id=tenant_id),
    )

    if not identity_manager.support_sso:
        return {"sso": False}

    sso_providers = identity_manager.get_sso_providers()
    wizard_url = identity_manager.get_sso_wizard_url(authenticated_entity)
    return {
        "sso": True,
        "providers": sso_providers,
        "wizardUrl": wizard_url,
    }
@router.get("", description="get tags")
def get_tags(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:presets"])
    ),
) -> list[dict]:
    """
    Return the tags stored for the caller's tenant.

    Args:
        authenticated_entity: Caller identity resolved by the auth verifier;
            the ``read:presets`` scope is required.

    Returns:
        The result of ``get_tags_db`` for the caller's tenant id, unmodified.
    """
    return get_tags_db(authenticated_entity.tenant_id)
# GET all topology data
@router.get(
    "", description="Get all topology data", response_model=List[TopologyServiceDtoOut]
)
def get_topology_data(
    provider_ids: Optional[str] = None,
    services: Optional[str] = None,
    environment: Optional[str] = None,
    include_empty_deps: Optional[bool] = True,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:topology"])
    ),
    session: Session = Depends(get_session),
) -> List[TopologyServiceDtoOut]:
    """
    Return the topology data of the caller's tenant.

    All filters are forwarded unchanged to
    ``TopologiesService.get_all_topology_data``; their exact semantics are
    defined there.

    Args:
        provider_ids: Optional provider filter.
        services: Optional service filter.
        environment: Optional environment filter.
        include_empty_deps: Forwarded as-is; defaults to True.
        authenticated_entity: Caller identity; the ``read:topology`` scope is
            required.
        session: Database session bound to this request.

    Returns:
        The value returned by ``TopologiesService.get_all_topology_data``.
    """
    tenant_id = authenticated_entity.tenant_id
    # The key has to be the literal "tenant_id". Using the variable as the key
    # would create a log-record attribute named after the tenant id itself.
    logger.info("Getting topology data", extra={"tenant_id": tenant_id})

    return TopologiesService.get_all_topology_data(
        tenant_id, session, provider_ids, services, environment, include_empty_deps
    )
def _upsert_pulled_applications(
    tenant_id,
    applications_to_create,
    topology_data,
    lookup_session,
    create_session,
    extra,
):
    """
    Create or update the applications a provider reported with its topology.

    ``applications_to_create`` is iterated by key (the application name); each
    value is read with ``.get("services", [])`` to obtain the service names to
    attach. Service names are resolved to ids by matching ``s.service`` in
    ``topology_data``.

    An application that cannot be built (invalid data or an unknown service) is
    logged and skipped so the remaining applications are still processed.

    Args:
        tenant_id: Tenant owning the applications.
        applications_to_create: Mapping of application name to its definition.
        topology_data: Services used to resolve service names to ids.
        lookup_session: Session used to list and update existing applications.
        create_session: Session used to create new applications.
        extra: Logging context attached to error records.
    """
    for app_name in applications_to_create:
        app_dto = TopologyApplicationDtoIn(name=app_name, services=[])
        try:
            # replace service name with service id
            service_names = applications_to_create[app_name].get("services", [])
            for service_name in service_names:
                service_id = next(
                    (s.id for s in topology_data if s.service == service_name),
                    None,
                )
                if not service_id:
                    # `service_name` is already the name; it has no `.service`.
                    raise ServiceNotFoundException(service_name)
                app_dto.services.append(TopologyServiceDtoIn(id=service_id))

            # if the application already exists, update it
            existing_apps = TopologiesService.get_applications_by_tenant_id(
                tenant_id, lookup_session
            )
            existing_app = next(
                (a for a in existing_apps if a.name == app_name), None
            )
            if existing_app is not None:
                TopologiesService.update_application_by_id(
                    tenant_id, existing_app.id, app_dto, lookup_session
                )
            else:
                TopologiesService.create_application_by_tenant_id(
                    tenant_id, app_dto, create_session
                )
        except (InvalidApplicationDataException, ServiceNotFoundException) as e:
            # `app_name` is a plain key, so it is formatted directly.
            logger.error(
                f"Error creating application {app_name}: {str(e)}",
                extra=extra,
            )


@router.post(
    "/pull",
    description="Pull topology data on demand from providers",
    response_model=List[TopologyServiceDtoOut],
)
def pull_topology_data(
    provider_ids: Optional[str] = None,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:topology"])
    ),
    session: Session = Depends(get_session),
):
    """
    Pull topology from the tenant's installed providers and return the result.

    Each installed provider that is a ``BaseTopologyProvider`` is asked for its
    topology, which is processed via ``process_topology``; the applications it
    reports are then created or updated. A failure in one provider is logged
    and does not stop the remaining providers.

    Args:
        provider_ids: Optional comma-separated provider ids; when given, only
            providers whose ``str(id)`` is in the list are pulled.
        authenticated_entity: Caller identity; the ``write:topology`` scope is
            required.
        session: Database session bound to this request.

    Returns:
        The value of ``TopologiesService.get_all_topology_data`` after the pull.

    Raises:
        HTTPException: 500 when the pull fails outside per-provider handling.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info(
        "Pulling topology data on demand",
        extra={"tenant_id": tenant_id, "provider_ids": provider_ids},
    )

    try:
        providers = ProvidersFactory.get_installed_providers(
            tenant_id=tenant_id, include_details=False
        )

        # Filter providers if provider_ids is specified
        if provider_ids:
            provider_id_list = provider_ids.split(",")
            providers = [p for p in providers if str(p.id) in provider_id_list]

        for provider in providers:
            extra = {
                "provider_type": provider.type,
                "provider_id": provider.id,
                "tenant_id": tenant_id,
            }

            try:
                provider_class = ProvidersFactory.get_installed_provider(
                    tenant_id=tenant_id,
                    provider_id=provider.id,
                    provider_type=provider.type,
                )

                if isinstance(provider_class, BaseTopologyProvider):
                    logger.info("Pulling topology data", extra=extra)
                    topology_data, applications_to_create = (
                        provider_class.pull_topology()
                    )
                    logger.info(
                        "Pulling topology data finished, processing",
                        extra={**extra, "topology_length": len(topology_data)},
                    )
                    process_topology(
                        tenant_id, topology_data, provider.id, provider.type
                    )
                    new_session = get_session_sync()
                    try:
                        # now we want to create the applications
                        topology_data = TopologiesService.get_all_topology_data(
                            tenant_id, new_session, provider_ids=[provider.id]
                        )
                        # NOTE(review): creation uses the request session while
                        # lookups and updates use `new_session`, as in the
                        # original code - confirm this split is intentional.
                        _upsert_pulled_applications(
                            tenant_id,
                            applications_to_create,
                            topology_data,
                            new_session,
                            session,
                            extra,
                        )
                    finally:
                        # Release the per-provider session even on failure.
                        new_session.close()

                    logger.info("Finished processing topology data", extra=extra)
                else:
                    logger.debug(
                        f"Provider {provider.type} ({provider.id}) does not implement pulling topology data",
                        extra=extra,
                    )
            except NotImplementedError:
                logger.debug(
                    f"Provider {provider.type} ({provider.id}) does not implement pulling topology data",
                    extra=extra,
                )
            except Exception as e:
                logger.exception(
                    f"Error pulling topology from provider {provider.type} ({provider.id})",
                    extra={**extra, "error": str(e)},
                )

        # Return the updated topology data
        return TopologiesService.get_all_topology_data(
            tenant_id, session, provider_ids=provider_ids
        )

    except Exception as e:
        logger.exception(
            "Error during on-demand topology pull",
            extra={"tenant_id": tenant_id, "error": str(e)},
        )
        raise HTTPException(
            status_code=500, detail=f"Failed to pull topology data: {str(e)}"
        )
@router.put("/service", description="Updating a service manually")
def update_service(
    service: TopologyServiceUpdateRequestDTO,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:topology"])
    ),
    session: Session = Depends(get_session),
) -> TopologyService:
    """
    Update a service through ``TopologiesService.update_service``.

    Args:
        service: The update request payload.
        authenticated_entity: Caller identity; the ``write:topology`` scope is
            required.
        session: Database session bound to this request.

    Returns:
        The value returned by ``TopologiesService.update_service``.

    Raises:
        HTTPException: 404 when the service is not manual or does not exist;
            500 for any other failure.
    """
    tenant_id = authenticated_entity.tenant_id
    try:
        return TopologiesService.update_service(
            service=service, tenant_id=tenant_id, session=session
        )
    except ServiceNotManualException:
        raise HTTPException(
            status_code=404,
            detail="The service you're trying to updated was not created manually.",
        )
    except ServiceNotFoundException:
        raise HTTPException(status_code=404, detail="Service not found")
    except Exception as e:
        # Keep the traceback server-side; the client only sees the message.
        logger.exception("Failed to update service", extra={"tenant_id": tenant_id})
        raise HTTPException(
            status_code=500, detail=f"Failed to update service: {str(e)}"
        ) from e
@router.put("/dependency", description="Updating a dependency manually")
def update_dependency(
    dependency: TopologyServiceDependencyUpdateRequestDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["write:topology"])
    ),
    session: Session = Depends(get_session),
) -> TopologyServiceDependencyDto:
    """
    Update a dependency through ``TopologiesService.update_dependency``.

    Args:
        dependency: The update request payload.
        authenticated_entity: Caller identity; the ``write:topology`` scope is
            required.
        session: Database session bound to this request.

    Returns:
        The value returned by ``TopologiesService.update_dependency``.

    Raises:
        HTTPException: 404 when the dependency is missing or a service involved
            is not manual; 500 for any other failure.
    """
    tenant_id = authenticated_entity.tenant_id
    try:
        return TopologiesService.update_dependency(
            dependency=dependency,
            session=session,
            tenant_id=tenant_id,
        )
    except DependencyNotFoundException:
        raise HTTPException(status_code=404, detail="Dependency not found")
    except ServiceNotManualException:
        raise HTTPException(
            status_code=404,
            detail="You're tying to update a dependency between one or more pulled services.",
        )
    except Exception as e:
        # Keep the traceback server-side; the client only sees the message.
        logger.exception(
            "Failed to update dependency", extra={"tenant_id": tenant_id}
        )
        raise HTTPException(
            status_code=500, detail=f"Failed to update Dependency: {str(e)}"
        ) from e
@router.get(
    "/export",
    description="Exporting the topology map as a YAML",
)
async def export_topology_yaml(
    services: Optional[str] = None,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:topology"])
    ),
    session: Session = Depends(get_session),
):
    """
    Export the tenant's topology as a YAML document.

    The document has three top-level keys: ``applications``, ``services`` and
    ``dependencies``. ``updated_at`` and ``tenant_id`` are removed from the
    dumped records, application ids are stringified, and each application
    lists the ids of the services it was found on.

    Args:
        services: Optional service filter, forwarded to
            ``TopologiesService.get_topology_services``.
        authenticated_entity: Caller identity; the ``read:topology`` scope is
            required.
        session: Database session bound to this request.

    Returns:
        A ``Response`` carrying the YAML with media type ``application/x-yaml``.
    """
    tenant_id = authenticated_entity.tenant_id
    # The key has to be the literal "tenant_id", not the tenant id value.
    logger.info("Getting topology data", extra={"tenant_id": tenant_id})

    topology_data = TopologiesService.get_topology_services(
        tenant_id, session, None, services, None
    )

    exported_services = []
    exported_dependencies = []
    # Keyed by stringified application id so that an application attached to
    # several services is emitted once, with every service id collected.
    applications_by_id = {}

    for data in topology_data:
        services_dict = data.model_dump()
        del services_dict["updated_at"]
        del services_dict["tenant_id"]
        # Anything other than a literal True (e.g. None) is exported as False.
        services_dict["is_manual"] = services_dict["is_manual"] is True
        exported_services.append(services_dict)

        for application in data.applications:
            application_dict = application.model_dump()
            del application_dict["tenant_id"]
            application_id = str(application_dict["id"])
            application_dict["id"] = application_id
            if application_id in applications_by_id:
                applications_by_id[application_id]["services"].append(data.id)
            else:
                application_dict["services"] = [data.id]
                applications_by_id[application_id] = application_dict

        for dependency in data.dependencies:
            dependency_dict = dependency.model_dump()
            del dependency_dict["updated_at"]
            exported_dependencies.append(dependency_dict)

    full_data = {
        "applications": list(applications_by_id.values()),
        "services": exported_services,
        "dependencies": exported_dependencies,
    }
    # very big width to avoid line breaks
    export_yaml = cyaml.dump(full_data, width=99999)

    return Response(content=export_yaml, media_type="application/x-yaml")
keep.api.core.cel_to_sql.sql_providers.base import CelToSqlException from keep.api.core.config import config from keep.api.core.db import ( get_alert_by_event_id, get_installed_providers, get_last_workflow_workflow_to_alert_executions, get_or_create_dummy_workflow, get_session, get_workflow_by_id as get_workflow_by_id_db, get_workflow_version, get_workflow_versions, update_workflow_by_id as update_workflow_by_id_db, ) from keep.api.core.db import get_workflow_executions as get_workflow_executions_db from keep.api.core.workflows import ( get_workflow_facets, get_workflow_facets_data, get_workflow_potential_facet_fields, ) from keep.api.models.alert import AlertDto, AlertSeverity from keep.api.models.db.incident import IncidentSeverity from keep.api.models.facet import FacetOptionsQueryDto from keep.api.models.incident import IncidentDto from keep.api.models.query import QueryDto from keep.api.models.workflow import ( WorkflowCreateOrUpdateDTO, WorkflowDTO, WorkflowExecutionDTO, WorkflowExecutionLogsDTO, WorkflowRawDto, WorkflowRunResponseDTO, WorkflowToAlertExecutionDTO, WorkflowVersionDTO, WorkflowVersionListDTO, ) from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts from keep.api.utils.pagination import WorkflowExecutionsPaginatedResultsDto from keep.contextmanager.contextmanager import ContextManager from keep.functions import cyaml from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory from keep.parser.parser import Parser from keep.providers.providers_factory import ProviderConfigurationException from keep.secretmanager.secretmanagerfactory import SecretManagerFactory from keep.workflowmanager.workflow import Workflow from keep.workflowmanager.workflowmanager import WorkflowManager from keep.workflowmanager.workflowstore import WorkflowStore router = APIRouter() logger = logging.getLogger(__name__) tracer = trace.get_tracer(__name__) 
@router.post(
    "/facets/options",
    description="Query workflows facet options. Accepts dictionary where key is facet id and value is cel to query facet",
)
def fetch_facet_options(
    facet_options_query: FacetOptionsQueryDto,
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:workflows"])
    ),
) -> dict:
    """
    Resolve the facet options for the caller's workflows.

    Args:
        facet_options_query: Query passed through to
            ``get_workflow_facets_data``.
        authenticated_entity: Caller identity; the ``read:workflows`` scope is
            required.

    Returns:
        The value returned by ``get_workflow_facets_data``.

    Raises:
        HTTPException: 400 when the CEL expression cannot be converted.
    """
    tenant_id = authenticated_entity.tenant_id
    logger.info("Fetching workflow facets from DB", extra={"tenant_id": tenant_id})

    try:
        options = get_workflow_facets_data(
            tenant_id=tenant_id, facet_options_query=facet_options_query
        )
    except CelToSqlException as e:
        cel_expression = facet_options_query.cel
        logger.exception(
            f'Error parsing CEL expression "{cel_expression}". {str(e)}'
        )
        raise HTTPException(
            status_code=400,
            detail=f"Error parsing CEL expression: {cel_expression}",
        ) from e

    logger.info("Fetched workflow facets from DB", extra={"tenant_id": tenant_id})
    return options
@router.get(
    "",
    description="Get workflows",
)
# TODO: this should be deprecated and removed
def get_workflows(
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["read:workflows"])
    ),
) -> list[WorkflowDTO] | list[dict]:
    """
    Return the caller's workflows using a default ``QueryDto``.

    Delegates to ``query_workflows`` and returns only its ``"results"`` entry.
    """
    default_query = QueryDto()
    return query_workflows(default_query, authenticated_entity)["results"]
{str(e)}') raise HTTPException( status_code=400, detail=f"Error parsing CEL expression: {query.cel}", ) from e # iterate workflows for _workflow in workflows: workflow = _workflow["workflow"] workflow_last_run_time = _workflow["workflow_last_run_time"] workflow_last_run_status = _workflow["workflow_last_run_status"] last_executions = _workflow["workflow_last_executions"] last_execution_started = _workflow["workflow_last_run_started"] try: providers_dto, triggers = workflowstore.get_workflow_meta_data( tenant_id=tenant_id, workflow=workflow, installed_providers_by_type=installed_providers_by_type, ) except Exception as e: logger.error(f"Error fetching workflow meta data: {e}") providers_dto, triggers = [], [] # Default in case of failure # create the workflow DTO try: workflow_raw = cyaml.safe_load(workflow.workflow_raw) permissions = workflow_raw.get("permissions", []) can_run = Workflow.check_run_permissions( permissions, authenticated_entity.email, authenticated_entity.role ) is_alert_rule_workflow = WorkflowStore.is_alert_rule_workflow(workflow_raw) # very big width to avoid line breaks workflow_raw = cyaml.dump(workflow_raw, width=99999) workflow_dto = WorkflowDTO( id=workflow.id, name=workflow.name, description=workflow.description or "[This workflow has no description]", created_by=workflow.created_by, creation_time=workflow.creation_time, last_execution_time=workflow_last_run_time, last_execution_status=workflow_last_run_status, interval=workflow.interval, providers=providers_dto, triggers=triggers, workflow_raw=workflow_raw, revision=workflow.revision, last_updated=workflow.last_updated, last_executions=last_executions, last_execution_started=last_execution_started, disabled=workflow.is_disabled, provisioned=workflow.provisioned, alertRule=is_alert_rule_workflow, canRun=can_run, ) except Exception as e: logger.error(f"Error creating workflow DTO: {e}") continue workflows_dto.append(workflow_dto) return { "count": count, "results": workflows_dto, "limit": 
@router.post(
    "/{workflow_id}/run",
    description="Run a workflow",
)
def run_workflow(
    workflow_id: str,
    event_type: Optional[str] = Query(None),
    event_id: Optional[str] = Query(None),
    body: Optional[Dict[Any, Any]] = Body(None),
    authenticated_entity: AuthenticatedEntity = Depends(
        IdentityManagerFactory.get_auth_verifier(["execute:workflows"])
    ),
) -> dict:
    """
    Run a workflow manually, from a request body or by replaying an event.

    When both ``event_type`` and ``event_id`` are given the stored event is
    replayed (only ``"alert"`` is supported); otherwise the event is built from
    ``body`` by ``get_event_from_body``.

    Args:
        workflow_id: Id of the workflow to run.
        event_type: Type of the event to replay.
        event_id: Id of the event to replay.
        body: Event payload and optional ``inputs``; may be omitted.
        authenticated_entity: Caller identity; the ``execute:workflows`` scope
            is required.

    Returns:
        dict: ``workflow_id``, ``workflow_execution_id`` and ``status``.

    Raises:
        HTTPException: 400 for an invalid workflow configuration or event type,
            403 when workflow permissions deny the caller, 404 when the alert
            to replay is not found, 500 for any other failure.
    """
    tenant_id = authenticated_entity.tenant_id
    created_by = authenticated_entity.email
    logger.info("Running workflow", extra={"workflow_id": workflow_id})

    workflow_store = WorkflowStore()
    try:
        workflow = workflow_store.get_workflow(tenant_id, workflow_id)
    except ValueError as e:
        logger.exception(
            "Invalid workflow configuration",
            extra={"workflow_id": workflow_id, "tenant_id": tenant_id},
        )
        raise HTTPException(
            status_code=400, detail=f"Invalid workflow configuration: {e}"
        ) from e

    # if there are workflow permissions, check if the user has access
    if not Workflow.check_run_permissions(
        workflow.workflow_permissions,
        authenticated_entity.email,
        authenticated_entity.role,
    ):
        raise HTTPException(
            status_code=403, detail="Insufficient permissions to execute this workflow"
        )

    workflowmanager = WorkflowManager.get_instance()

    # A missing body is treated as an empty one so `.get` is always safe.
    request_body = body or {}

    try:
        # Handle replay from query parameters
        if event_type and event_id:
            if event_type == "alert":
                # Fetch alert from your alert store
                alert_db = get_alert_by_event_id(tenant_id, event_id)
                if alert_db is None:
                    raise HTTPException(
                        status_code=404,
                        detail=f"Alert with event id {event_id} not found",
                    )
                event = convert_db_alerts_to_dto_alerts([alert_db])[0]
            elif event_type == "incident":
                # SHAHAR: TODO
                raise NotImplementedError("Incident replay is not supported yet")
            else:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid event type: {event_type}",
                )
            # `inputs` must be bound on the replay path too; otherwise the
            # scheduler call below raises UnboundLocalError.
            inputs = request_body.get("inputs", {})
        else:
            # Handle regular run from body
            event, inputs = get_event_from_body(request_body, tenant_id)

        workflow_execution_id = workflowmanager.scheduler.handle_manual_event_workflow(
            workflow_id,
            workflow.workflow_revision,
            tenant_id,
            created_by,
            event,
            inputs=inputs,
        )
    except HTTPException:
        # re-raise http exceptions as is
        raise
    except Exception as e:
        logger.exception(
            "Failed to run workflow",
            extra={
                "workflow_id": workflow_id,
                "tenant_id": tenant_id,
            },
        )
        raise HTTPException(
            status_code=500,
            detail=f"Failed to run workflow {workflow_id}: {e}",
        ) from e

    logger.info(
        "Workflow ran successfully",
        extra={
            "workflow_id": workflow_id,
        },
    )
    return {
        "workflow_id": workflow_id,
        "workflow_execution_id": workflow_execution_id,
        "status": "success",
    }
async def get_workflow_dict_from_string(workflow_raw: str | bytes) -> dict:
    """
    Parse a raw YAML workflow definition into a dict.

    For backward compatibility a document wrapped in a top-level ``alert`` or
    ``workflow`` key is unwrapped and the inner value is returned.

    Args:
        workflow_raw: The YAML text or bytes of the workflow definition.

    Returns:
        dict: The (possibly unwrapped) workflow definition.

    Raises:
        HTTPException: 400 when the input is not valid YAML, or when the
            document or its unwrapped content is not a mapping (for example an
            empty document, a list or a scalar).
    """
    try:
        workflow_data = cyaml.safe_load(workflow_raw)
    except cyaml.YAMLError as e:
        logger.exception("Invalid YAML format")
        raise HTTPException(status_code=400, detail="Invalid YAML format") from e

    # safe_load yields None for an empty document and may yield a list or a
    # scalar; reject those explicitly so callers get a 400.
    if not isinstance(workflow_data, dict):
        raise HTTPException(
            status_code=400, detail="Workflow definition must be a YAML mapping"
        )

    # backward compatibility
    if "alert" in workflow_data:
        workflow_data = workflow_data.pop("alert")
    elif "workflow" in workflow_data:
        workflow_data = workflow_data.pop("workflow")

    # Callers use dict methods on the result, so the unwrapped value must be a
    # mapping as well.
    if not isinstance(workflow_data, dict):
        raise HTTPException(
            status_code=400, detail="Workflow definition must be a YAML mapping"
        )
    return workflow_data
Please contact us via Slack for help.", ) if workflow.revision == 1: return WorkflowCreateOrUpdateDTO( workflow_id=workflow.id, status="created", revision=workflow.revision ) else: return WorkflowCreateOrUpdateDTO( workflow_id=workflow.id, status="updated", revision=workflow.revision ) @router.get("/executions", description="Get workflow executions by alert fingerprint") def get_workflow_executions_by_alert_fingerprint( authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), session: Session = Depends(get_session), ) -> list[WorkflowToAlertExecutionDTO]: with tracer.start_as_current_span("get_workflow_executions_by_alert_fingerprint"): latest_workflow_to_alert_executions = ( get_last_workflow_workflow_to_alert_executions( session=session, tenant_id=authenticated_entity.tenant_id ) ) return [ WorkflowToAlertExecutionDTO( workflow_id=workflow_execution.workflow_execution.workflow_id, workflow_execution_id=workflow_execution.workflow_execution_id, alert_fingerprint=workflow_execution.alert_fingerprint, workflow_status=workflow_execution.workflow_execution.status, workflow_started=workflow_execution.workflow_execution.started, event_id=workflow_execution.event_id, ) for workflow_execution in latest_workflow_to_alert_executions ] @router.post( "/json", description="Create or update a workflow", status_code=status.HTTP_201_CREATED, ) async def create_workflow_from_body( request: Request, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:workflows"]) ), ) -> WorkflowCreateOrUpdateDTO: tenant_id = authenticated_entity.tenant_id created_by = authenticated_entity.email workflow_raw_data = await __get_workflow_raw_data(request, None) workflowstore = WorkflowStore() # Create the workflow try: workflow = workflowstore.create_workflow( tenant_id=tenant_id, created_by=created_by, workflow=workflow_raw_data ) except Exception: logger.exception( "Failed to create 
workflow", extra={"tenant_id": tenant_id, "workflow_raw_data": workflow_raw_data}, ) raise HTTPException( status_code=400, detail="Failed to upload workflow. Please contact us via Slack for help.", ) if workflow.revision == 1: return WorkflowCreateOrUpdateDTO( workflow_id=workflow.id, status="created", revision=workflow.revision ) else: return WorkflowCreateOrUpdateDTO( workflow_id=workflow.id, status="updated", revision=workflow.revision ) # Add Mock Workflows (6 Random Workflows on Every Request) # To add mock workflows, a new backend API endpoint has been created: /workflows/random-templates. # 1. Fetching Random Templates: When a request is made to this endpoint, all workflow YAML/YML files are read and # shuffled randomly. # 2. Response: Only the first 6 files are parsed and sent in the response. @router.get("/random-templates", description="Get random workflow templates") def get_random_workflow_templates( authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ) -> list[dict]: """ This endpoint is deprecated and will be removed in the future. 
""" tenant_id = authenticated_entity.tenant_id workflowstore = WorkflowStore() default_directory = os.environ.get( "KEEP_WORKFLOWS_PATH", os.path.join(os.path.dirname(__file__), "../../../examples/workflows"), ) if not os.path.exists(default_directory): # on the container we use the following path fallback_directory = "/examples/workflows" logger.warning( f"{default_directory} does not exist, using fallback: {fallback_directory}" ) if os.path.exists(fallback_directory): default_directory = fallback_directory else: logger.error(f"Neither {default_directory} nor {fallback_directory} exist") raise FileNotFoundError( f"Neither {default_directory} nor {fallback_directory} exist" ) workflows = workflowstore.get_random_workflow_templates( tenant_id=tenant_id, workflows_dir=default_directory, limit=8 ) return workflows @router.post("/templates/query", description="Query workflow templates") def query_workflow_templates( query: QueryDto, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ) -> dict: tenant_id = authenticated_entity.tenant_id workflowstore = WorkflowStore() default_directory = os.environ.get( "KEEP_WORKFLOWS_PATH", os.path.join(os.path.dirname(__file__), "../../../examples/workflows"), ) if not os.path.exists(default_directory): # on the container we use the following path fallback_directory = "/examples/workflows" logger.warning( f"{default_directory} does not exist, using fallback: {fallback_directory}" ) if os.path.exists(fallback_directory): default_directory = fallback_directory else: logger.error(f"Neither {default_directory} nor {fallback_directory} exist") raise FileNotFoundError( f"Neither {default_directory} nor {fallback_directory} exist" ) workflows, total_count = workflowstore.query_workflow_templates( tenant_id=tenant_id, workflows_dir=default_directory, query=query ) return { "limit": query.limit, "offset": query.offset, "count": total_count, "results": workflows, } @router.put( 
"/{workflow_id}", description="Update a workflow", status_code=status.HTTP_201_CREATED, ) async def update_workflow_by_id( workflow_id: str, request: Request, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:workflows"]) ), session: Session = Depends(get_session), ) -> WorkflowCreateOrUpdateDTO: """ Update a workflow Args: workflow_id (str): The workflow ID request (Request): The FastAPI Request object file (UploadFile, optional): File if was uploaded via file. Defaults to File(...). tenant_id (str, optional): The tenant ID. Defaults to Depends(verify_bearer_token). session (Session, optional): DB Session object injected via DI. Defaults to Depends(get_session). Raises: HTTPException: If the workflow was not found Returns: Workflow: The updated workflow """ tenant_id = authenticated_entity.tenant_id logger.info(f"Updating workflow {workflow_id}", extra={"tenant_id": tenant_id}) workflow_from_db = get_workflow_by_id_db( tenant_id=tenant_id, workflow_id=workflow_id ) if not workflow_from_db: logger.warning( f"Tenant tried to update workflow {workflow_id} that does not exist", extra={"tenant_id": tenant_id}, ) raise HTTPException(404, "Workflow not found") if workflow_from_db.provisioned: raise HTTPException(403, detail="Cannot update a provisioned workflow") workflow_raw_data = await __get_workflow_raw_data(request, None) parser = Parser() workflow_interval = parser.parse_interval(workflow_raw_data) updated_workflow = update_workflow_by_id_db( id=workflow_id, tenant_id=tenant_id, name=workflow_raw_data.get("name", ""), description=workflow_raw_data.get("description"), interval=workflow_interval, workflow_raw=cyaml.dump(workflow_raw_data, width=99999), updated_by=authenticated_entity.email, is_disabled=workflow_raw_data.get("disabled", False), ) logger.info(f"Updated workflow {workflow_id}", extra={"tenant_id": tenant_id}) return WorkflowCreateOrUpdateDTO( workflow_id=workflow_id, revision=updated_workflow.revision, 
status="updated" ) @router.get("/{workflow_id}/raw", description="Get raw workflow by ID") def get_raw_workflow_by_id( workflow_id: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ) -> WorkflowRawDto: tenant_id = authenticated_entity.tenant_id workflowstore = WorkflowStore() return WorkflowRawDto( workflow_raw=workflowstore.get_raw_workflow( tenant_id=tenant_id, workflow_id=workflow_id ) ) @router.get("/{workflow_id}", description="Get workflow by ID") @router.get( "/{workflow_id}/versions/{revision}", description="Get workflow by ID and revision" ) def get_workflow_by_id( workflow_id: str, revision: int | None = None, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ): tenant_id = authenticated_entity.tenant_id # get all workflow workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: logger.warning( f"Tenant tried to get workflow {workflow_id} that does not exist", extra={"tenant_id": tenant_id}, ) raise HTTPException(404, "Workflow not found") updated_at = workflow.last_updated updated_by = workflow.updated_by or "unknown" workflow_raw = workflow.workflow_raw if revision: workflow_version = get_workflow_version( tenant_id=tenant_id, workflow_id=workflow_id, revision=revision ) if not workflow_version: raise HTTPException(404, "Workflow version not found") updated_at = workflow_version.updated_at updated_by = workflow_version.updated_by or "unknown" workflow_raw = workflow_version.workflow_raw installed_providers = get_installed_providers(tenant_id) installed_providers_by_type = {} for installed_provider in installed_providers: if installed_provider.type not in installed_providers_by_type: installed_providers_by_type[installed_provider.type] = { installed_provider.name: installed_provider } else: installed_providers_by_type[installed_provider.type][ installed_provider.name ] = 
installed_provider workflowstore = WorkflowStore() try: providers_dto, triggers = workflowstore.get_workflow_meta_data( tenant_id=tenant_id, workflow=workflow, installed_providers_by_type=installed_providers_by_type, ) except Exception as e: logger.error(f"Error fetching workflow meta data: {e}") providers_dto, triggers = [], [] # Default in case of failure try: final_workflow_raw = workflowstore.format_workflow_yaml(workflow_raw) except cyaml.YAMLError: logger.exception("Invalid YAML format") raise HTTPException(status_code=500, detail="Error fetching workflow meta data") return WorkflowDTO( id=workflow.id, name=workflow.name, description=workflow.description or "[This workflow has no description]", created_by=workflow.created_by, creation_time=workflow.creation_time, interval=workflow.interval, providers=providers_dto, triggers=triggers, workflow_raw=final_workflow_raw, last_updated=updated_at, disabled=workflow.is_disabled, revision=workflow.revision, last_updated_by=updated_by, ) @router.get("/{workflow_id}/versions") def list_workflow_versions( workflow_id: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ): tenant_id = authenticated_entity.tenant_id versions = get_workflow_versions(tenant_id=tenant_id, workflow_id=workflow_id) return WorkflowVersionListDTO( versions=[ WorkflowVersionDTO( revision=version.revision, updated_by=version.updated_by, updated_at=version.updated_at, ) for version in versions ] ) @router.get("/{workflow_id}/runs", description="Get workflow executions by ID") def get_workflow_runs_by_id( workflow_id: str, tab: int = 1, limit: int = 25, offset: int = 0, status: Optional[List[str]] = Query(None), trigger: Optional[List[str]] = Query(None), execution_id: Optional[str] = None, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ) -> WorkflowExecutionsPaginatedResultsDto: tenant_id = 
authenticated_entity.tenant_id workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: logger.warning( f"Tenant tried to get workflow {workflow_id} that does not exist", extra={"tenant_id": tenant_id}, ) raise HTTPException(404, "Workflow not found") installed_providers = get_installed_providers(tenant_id) installed_providers_by_type = {} for installed_provider in installed_providers: if installed_provider.type not in installed_providers_by_type: installed_providers_by_type[installed_provider.type] = { installed_provider.name: installed_provider } else: installed_providers_by_type[installed_provider.type][ installed_provider.name ] = installed_provider with tracer.start_as_current_span("get_workflow_executions"): total_count, workflow_executions, pass_count, fail_count, avgDuration = ( get_workflow_executions_db( tenant_id, workflow_id, limit, offset, tab, status, trigger, execution_id, ) ) workflow_executions_dtos = [] with tracer.start_as_current_span("create_workflow_dtos"): for workflow_execution in workflow_executions: workflow_execution_dto = { "id": workflow_execution.id, "workflow_id": workflow_execution.workflow_id, "workflow_revision": workflow_execution.workflow_revision, "status": workflow_execution.status, "started": workflow_execution.started.isoformat(), "triggered_by": workflow_execution.triggered_by, "error": workflow_execution.error, "execution_time": workflow_execution.execution_time, } workflow_executions_dtos.append(workflow_execution_dto) workflowstore = WorkflowStore() try: providers_dto, triggers = workflowstore.get_workflow_meta_data( tenant_id=tenant_id, workflow=workflow, installed_providers_by_type=installed_providers_by_type, ) except Exception as e: logger.error(f"Error fetching workflow meta data: {e}") providers_dto, triggers = [], [] # Default in case of failure final_workflow = WorkflowDTO( id=workflow.id, name=workflow.name, description=workflow.description or "[This workflow has no 
description]", created_by=workflow.created_by, creation_time=workflow.creation_time, interval=workflow.interval, providers=providers_dto, triggers=triggers, workflow_raw=workflow.workflow_raw, last_updated=workflow.last_updated, disabled=workflow.is_disabled, revision=workflow.revision, ) return WorkflowExecutionsPaginatedResultsDto( limit=limit, offset=offset, count=total_count, items=workflow_executions_dtos, passCount=pass_count, failCount=fail_count, avgDuration=avgDuration, workflow=final_workflow, ) @router.delete("/{workflow_id}", description="Delete workflow") def delete_workflow_by_id( workflow_id: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["delete:workflows"]) ), ): tenant_id = authenticated_entity.tenant_id workflowstore = WorkflowStore() workflowstore.delete_workflow(workflow_id=workflow_id, tenant_id=tenant_id) return {"workflow_id": workflow_id, "status": "deleted"} @router.get("/runs/{workflow_execution_id}") @router.get( "/{workflow_id}/runs/{workflow_execution_id}", description="Get a workflow execution status, results, and logs", ) def get_workflow_execution_status( workflow_execution_id: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:workflows"]) ), ) -> WorkflowExecutionDTO: tenant_id = authenticated_entity.tenant_id workflowstore = WorkflowStore() workflow_execution, logs = workflowstore.get_workflow_execution_with_logs( workflow_execution_id=workflow_execution_id, tenant_id=tenant_id, ) workflow = get_workflow_by_id_db( tenant_id=tenant_id, workflow_id=workflow_execution.workflow_id, ) event_id = None event_type = None if workflow_execution.workflow_to_alert_execution: event_id = workflow_execution.workflow_to_alert_execution.event_id event_type = "alert" # TODO: sub triggers? on create? on update? 
elif workflow_execution.workflow_to_incident_execution: event_id = workflow_execution.workflow_to_incident_execution.incident_id event_type = "incident" return WorkflowExecutionDTO( id=workflow_execution.id, workflow_name=workflow.name if workflow else None, workflow_id=workflow_execution.workflow_id, workflow_revision=workflow_execution.workflow_revision, status=workflow_execution.status, started=workflow_execution.started, triggered_by=workflow_execution.triggered_by, error=workflow_execution.error, execution_time=workflow_execution.execution_time, logs=[ WorkflowExecutionLogsDTO( id=log.id, timestamp=log.timestamp, message=log.message, context=log.context if log.context else {}, ) for log in logs ], results=workflow_execution.results, event_id=event_id, event_type=event_type, ) @router.put( "/{workflow_id}/toggle", description="Enable or disable a workflow", ) def toggle_workflow_state( workflow_id: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:workflows"]) ), session: Session = Depends(get_session), ) -> dict: """ Toggle the enabled/disabled state of a workflow Args: workflow_id (str): The workflow ID authenticated_entity (AuthenticatedEntity): The authenticated entity session (Session): DB Session object Raises: HTTPException: If the workflow was not found or if it's provisioned Returns: dict: Status of the operation """ tenant_id = authenticated_entity.tenant_id logger.info(f"Toggling workflow {workflow_id}", extra={"tenant_id": tenant_id}) workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: logger.warning( f"Tenant tried to toggle workflow {workflow_id} that does not exist", extra={"tenant_id": tenant_id}, ) raise HTTPException(404, "Workflow not found") if workflow.provisioned: raise HTTPException(403, detail="Cannot modify a provisioned workflow") # Toggle the disabled state # TODO: update workflow_raw workflow.is_disabled = not workflow.is_disabled 
workflow.last_updated = datetime.datetime.now() session.add(workflow) session.commit() logger.info( f"Workflow {workflow_id} {'disabled' if workflow.is_disabled else 'enabled'}", extra={"tenant_id": tenant_id}, ) return { "workflow_id": workflow_id, "status": "success", "is_disabled": workflow.is_disabled, } @router.post( "/{workflow_id}/secrets", description="Write a new secret or update existing secret for a workflow", ) def write_workflow_secret( workflow_id: str, secret_data: Dict[str, str], authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:secrets"]) ), ) -> Response: """ Write or update multiple secrets for a workflow in a single entry. If a secret already exists, it updates only the changed keys. """ tenant_id = authenticated_entity.tenant_id workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: raise HTTPException(404, "Workflow not found") context_manager = ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) secret_key = f"{tenant_id}_{workflow_id}_secrets" try: existing_secrets = secret_manager.read_secret(secret_key, is_json=True) if not isinstance(existing_secrets, dict): existing_secrets = {} except Exception: existing_secrets = {} existing_secrets.update(secret_data) # Write back the updated secret object secret_manager.write_secret( secret_name=secret_key, secret_value=json.dumps(existing_secrets), ) return Response(status_code=201) @router.get( "/{workflow_id}/secrets", description="Read a workflow secret", ) def read_workflow_secret( workflow_id: str, is_json: bool = True, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:secrets"]) ), ) -> Union[Dict, str]: """ Read a secret value for a workflow. Optionally parse as JSON if is_json is True. 
""" tenant_id = authenticated_entity.tenant_id workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: raise HTTPException(404, "Workflow not found") context_manager = ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) secret_key = f"{tenant_id}_{workflow_id}_secrets" try: return secret_manager.read_secret(secret_name=secret_key, is_json=is_json) except Exception: return {} @router.delete( "/{workflow_id}/secrets/{secret_name}", description="Delete a specific secret key for a workflow", ) def delete_workflow_secret( workflow_id: str, secret_name: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["write:secrets"]) ), ) -> Response: """ Delete a specific secret key inside the workflow's secrets entry. If the key exists, it is removed, but other secrets remain. """ tenant_id = authenticated_entity.tenant_id workflow = get_workflow_by_id_db(tenant_id=tenant_id, workflow_id=workflow_id) if not workflow: raise HTTPException(404, "Workflow not found") context_manager = ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) secret_key = f"{tenant_id}_{workflow_id}_secrets" secrets = secret_manager.read_secret(secret_key, is_json=True) if secret_name not in secrets: raise HTTPException(404, f"Secret '{secret_name}' not found") del secrets[secret_name] # Remove only the specific key secret_manager.write_secret( secret_name=secret_key, secret_value=json.dumps(secrets), ) return Response(status_code=201) ================================================ FILE: keep/api/tasks/__init__.py ================================================ ================================================ FILE: keep/api/tasks/notification_cache.py ================================================ import os import time from typing import Dict, Tuple # Get polling interval from env POLLING_INTERVAL = 
int(os.getenv("PUSHER_POLLING_INTERVAL", "15")) class NotificationCache: _instance = None __initialized = False def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self): if not self.__initialized: self.cache: Dict[Tuple[str, str], float] = {} self.__initialized = True def should_notify(self, tenant_id: str, event_type: str) -> bool: cache_key = (tenant_id, event_type) current_time = time.time() if cache_key not in self.cache: self.cache[cache_key] = current_time return True last_time = self.cache[cache_key] if current_time - last_time >= POLLING_INTERVAL: self.cache[cache_key] = current_time return True return False # Get singleton instance def get_notification_cache() -> NotificationCache: return NotificationCache() ================================================ FILE: keep/api/tasks/process_event_task.py ================================================ # builtins import copy import datetime import json import logging import os import sys import time import traceback from typing import List # third-parties import dateutil from arq import Retry from fastapi.datastructures import FormData from opentelemetry import trace from sqlmodel import Session # internals from keep.api.alert_deduplicator.alert_deduplicator import AlertDeduplicator from keep.api.bl.enrichments_bl import EnrichmentsBl from keep.api.bl.incidents_bl import IncidentBl from keep.api.bl.maintenance_windows_bl import MaintenanceWindowsBl from keep.api.consts import KEEP_CORRELATION_ENABLED, MAINTENANCE_WINDOW_ALERT_STRATEGY from keep.api.core.db import ( bulk_upsert_alert_fields, enrich_alerts_with_incidents, get_alerts_by_fingerprint, get_all_presets_dtos, get_enrichment_with_session, get_last_alert_hashes_by_fingerprints, get_session_sync, get_started_at_for_alerts, set_last_alert, ) from keep.api.core.dependencies import get_pusher_client from keep.api.core.elastic import ElasticClient from keep.api.core.metrics import ( 
events_error_counter, events_in_counter, events_out_counter, processing_time_summary, ) from keep.api.models.action_type import ActionType from keep.api.models.alert import AlertDto, AlertStatus from keep.api.models.db.alert import Alert, AlertAudit, AlertRaw from keep.api.models.db.incident import IncidentStatus from keep.api.models.incident import IncidentDto from keep.api.tasks.notification_cache import get_notification_cache from keep.api.utils.alert_utils import sanitize_alert from keep.api.utils.enrichment_helpers import ( calculate_firing_time_since_last_resolved, calculated_firing_counter, calculated_start_firing_time, convert_db_alerts_to_dto_alerts, calculated_unresolved_counter, ) from keep.providers.providers_factory import ProvidersFactory from keep.rulesengine.rulesengine import RulesEngine from keep.workflowmanager.workflowmanager import WorkflowManager TIMES_TO_RETRY_JOB = 5 # the number of times to retry the job in case of failure # Opt-outs/ins KEEP_STORE_RAW_ALERTS = os.environ.get("KEEP_STORE_RAW_ALERTS", "false") == "true" KEEP_ALERT_FIELDS_ENABLED = ( os.environ.get("KEEP_ALERT_FIELDS_ENABLED", "true") == "true" ) KEEP_MAINTENANCE_WINDOWS_ENABLED = ( os.environ.get("KEEP_MAINTENANCE_WINDOWS_ENABLED", "true") == "true" ) KEEP_AUDIT_EVENTS_ENABLED = ( os.environ.get("KEEP_AUDIT_EVENTS_ENABLED", "true") == "true" ) KEEP_CALCULATE_START_FIRING_TIME_ENABLED = ( os.environ.get("KEEP_CALCULATE_START_FIRING_TIME_ENABLED", "true") == "true" ) logger = logging.getLogger(__name__) def __internal_prepartion( alerts: list[AlertDto], fingerprint: str | None, api_key_name: str | None ): """ Internal function to prepare the alerts for the digest Args: alerts (list[AlertDto]): List of alerts to iterate over fingerprint (str | None): Fingerprint to set on the alerts api_key_name (str | None): API key name to set on the alerts (that were used to push them) """ for alert in alerts: try: if not alert.source: alert.source = ["keep"] # weird bug on Mailgun where 
source is int except Exception: logger.exception( "failed to parse source", extra={ "alert": alerts, }, ) raise if fingerprint is not None: alert.fingerprint = fingerprint if api_key_name is not None: alert.apiKeyRef = api_key_name def __validate_last_received(event): # Make sure the lastReceived is a valid date string # tb: we do this because `AlertDto` object lastReceived is a string and not a datetime object # TODO: `AlertDto` object `lastReceived` should be a datetime object so we can easily validate with pydantic if not event.lastReceived: event.lastReceived = datetime.datetime.now(tz=datetime.timezone.utc).isoformat() else: try: dateutil.parser.isoparse(event.lastReceived) except ValueError: logger.warning("Invalid lastReceived date, setting to now") event.lastReceived = datetime.datetime.now( tz=datetime.timezone.utc ).isoformat() def __save_to_db( tenant_id, provider_type, session: Session, raw_events: list[dict], formatted_events: list[AlertDto], deduplicated_events: list[AlertDto], provider_id: str | None = None, timestamp_forced: datetime.datetime | None = None, ): try: # keep raw events in the DB if the user wants to # this is mainly for debugging and research purposes if KEEP_STORE_RAW_ALERTS: if isinstance(raw_events, dict): raw_events = [raw_events] for raw_event in raw_events: alert = AlertRaw( tenant_id=tenant_id, raw_alert=raw_event, provider_type=provider_type, ) session.add(alert) enrichments_bl = EnrichmentsBl(tenant_id, session) # add audit to the deduplicated events # TODO: move this to the alert deduplicator if KEEP_AUDIT_EVENTS_ENABLED: for event in deduplicated_events: audit = AlertAudit( tenant_id=tenant_id, fingerprint=event.fingerprint, status=event.status, action=ActionType.DEDUPLICATED.value, user_id="system", description="Alert was deduplicated", ) session.add(audit) __validate_last_received(event) enrichments_bl.enrich_entity( event.fingerprint, enrichments={"lastReceived": event.lastReceived}, dispose_on_new_alert=True, 
action_type=ActionType.GENERIC_ENRICH, action_callee="system", action_description="Alert lastReceived enriched on deduplication", ) enriched_formatted_events = [] saved_alerts = [] fingerprints = [event.fingerprint for event in formatted_events] started_at_for_fingerprints = get_started_at_for_alerts( tenant_id, fingerprints, session=session ) for formatted_event in formatted_events: formatted_event.pushed = True started_at = started_at_for_fingerprints.get( formatted_event.fingerprint, None ) if started_at: formatted_event.startedAt = str(started_at) if KEEP_CALCULATE_START_FIRING_TIME_ENABLED: # calculate startFiring time previous_alert = get_alerts_by_fingerprint( tenant_id=tenant_id, fingerprint=formatted_event.fingerprint, limit=1, ) previous_alert = convert_db_alerts_to_dto_alerts(previous_alert) formatted_event.firingStartTime = calculated_start_firing_time( formatted_event, previous_alert ) formatted_event.firingStartTimeSinceLastResolved = ( calculate_firing_time_since_last_resolved( formatted_event, previous_alert ) ) # we now need to update the firing and unresolved counters formatted_event.firingCounter = calculated_firing_counter( formatted_event, previous_alert ) formatted_event.unresolvedCounter = calculated_unresolved_counter( formatted_event, previous_alert ) # Dispose enrichments that needs to be disposed try: enrichments_bl.dispose_enrichments(formatted_event.fingerprint) except Exception: logger.exception( "Failed to dispose enrichments", extra={ "tenant_id": tenant_id, "fingerprint": formatted_event.fingerprint, }, ) # Post format enrichment try: formatted_event = enrichments_bl.run_extraction_rules(formatted_event) except Exception: logger.exception( "Failed to run post-formatting extraction rules", extra={ "tenant_id": tenant_id, "fingerprint": formatted_event.fingerprint, }, ) __validate_last_received(formatted_event) alert_args = { "tenant_id": tenant_id, "provider_type": ( provider_type if provider_type else formatted_event.source[0] ), 
"event": formatted_event.dict(), "provider_id": provider_id, "fingerprint": formatted_event.fingerprint, "alert_hash": formatted_event.alert_hash, } alert_args = sanitize_alert(alert_args) if timestamp_forced is not None: alert_args["timestamp"] = timestamp_forced alert = Alert(**alert_args) session.add(alert) session.flush() saved_alerts.append(alert) alert_id = alert.id formatted_event.event_id = str(alert_id) if KEEP_AUDIT_EVENTS_ENABLED: audit = AlertAudit( tenant_id=tenant_id, fingerprint=formatted_event.fingerprint, action=( ActionType.AUTOMATIC_RESOLVE.value if formatted_event.status == AlertStatus.RESOLVED.value else ActionType.TIGGERED.value ), user_id="system", description=f"Alert recieved from provider with status {formatted_event.status}", ) session.add(audit) session.commit() session.flush() set_last_alert(tenant_id, alert, session=session) # Mapping try: enrichments_bl.run_mapping_rules(formatted_event) except Exception: logger.exception("Failed to run mapping rules") alert_enrichment = get_enrichment_with_session( session=session, tenant_id=tenant_id, fingerprint=formatted_event.fingerprint, ) if alert_enrichment: for enrichment in alert_enrichment.enrichments: # set the enrichment value = alert_enrichment.enrichments[enrichment] if isinstance(value, str): value = value.strip() setattr(formatted_event, enrichment, value) enriched_formatted_events.append(formatted_event) logger.info("Checking for incidents to resolve", extra={"tenant_id": tenant_id}) try: saved_alerts = enrich_alerts_with_incidents( tenant_id, saved_alerts, session ) # note: this only enriches incidents that were not yet ended session.expire_on_commit = False incident_bl = IncidentBl(tenant_id, session) for alert in saved_alerts: if alert.event.get("status") == AlertStatus.RESOLVED.value: logger.debug( "Checking for alert with status resolved", extra={"alert_id": alert.id, "tenant_id": tenant_id}, ) for incident in alert._incidents: if incident.status in IncidentStatus.get_active( 
def __handle_formatted_events(
    tenant_id,
    provider_type,
    session: Session,
    raw_events: list[dict],
    formatted_events: list[AlertDto],
    tracer: trace.Tracer,
    provider_id: str | None = None,
    notify_client: bool = True,
    timestamp_forced: datetime.datetime | None = None,
    job_id: str | None = None,
):
    """
    Run the post-formatting pipeline for a batch of alerts.

    Stages, in order:
        0. drop alerts that are inside a maintenance window (if enabled)
        1. apply deduplication and split out the full duplicates
        2. save the remaining alerts to the DB
        3. record the alert field names (if enabled)
        4. push the alerts to elasticsearch (if the client is enabled)
        5. queue the alerts for the workflow manager
        6. run the rules engine (if correlation is enabled)
        7. notify the client through pusher (alerts, incidents, presets)

    Every stage after the DB save is best-effort: failures are logged and
    the pipeline continues.

    Args:
        tenant_id: tenant the alerts belong to.
        provider_type: provider type, forwarded to deduplication and the DB save.
        session: DB session shared by the whole processing task.
        raw_events: raw payloads, forwarded to ``__save_to_db``.
        formatted_events: the formatted alerts to process.
        tracer: tracer used to open one span per stage.
        provider_id: provider id, if known.
        notify_client: when False the pusher stage is skipped.
        timestamp_forced: forwarded to ``__save_to_db``.
        job_id: only used for logging.

    Returns:
        The enriched alerts that were saved, or ``None`` when the maintenance
        window check filtered out every alert.
    """
    logger.info(
        "Adding new alerts to the DB",
        extra={
            "provider_type": provider_type,
            "num_of_alerts": len(formatted_events),
            "provider_id": provider_id,
            "tenant_id": tenant_id,
            "job_id": job_id,
        },
    )

    # first, check for maintenance windows
    if KEEP_MAINTENANCE_WINDOWS_ENABLED:
        with tracer.start_as_current_span("process_event_maintenance_windows_check"):
            maintenance_windows_bl = MaintenanceWindowsBl(
                tenant_id=tenant_id, session=session
            )
            if maintenance_windows_bl.maintenance_rules:
                formatted_events = [
                    event
                    for event in formatted_events
                    if maintenance_windows_bl.check_if_alert_in_maintenance_windows(
                        event
                    )
                    is False
                ]
            else:
                logger.debug(
                    "No maintenance windows configured for this tenant",
                    extra={"tenant_id": tenant_id},
                )

            if not formatted_events:
                logger.info(
                    "No alerts to process after running maintenance windows check",
                    extra={"tenant_id": tenant_id},
                )
                return

    with tracer.start_as_current_span("process_event_deduplication"):
        # second, filter out any deduplicated events
        alert_deduplicator = AlertDeduplicator(tenant_id)
        deduplication_rules = alert_deduplicator.get_deduplication_rules(
            tenant_id=tenant_id, provider_id=provider_id, provider_type=provider_type
        )
        last_alerts_fingerprint_to_hash = get_last_alert_hashes_by_fingerprints(
            tenant_id, [event.fingerprint for event in formatted_events]
        )
        for event in formatted_events:
            # apply_deduplication set alert_hash and isDuplicate on event
            event = alert_deduplicator.apply_deduplication(
                event, deduplication_rules, last_alerts_fingerprint_to_hash
            )

        # filter out the deduplicated events
        deduplicated_events = [
            event for event in formatted_events if event.isFullDuplicate
        ]
        formatted_events = [
            event for event in formatted_events if not event.isFullDuplicate
        ]

    with tracer.start_as_current_span("process_event_save_to_db"):
        # save to db
        enriched_formatted_events = __save_to_db(
            tenant_id,
            provider_type,
            session,
            raw_events,
            formatted_events,
            deduplicated_events,
            provider_id,
            timestamp_forced,
        )

    # let's save all fields to the DB so that we can use them in the future such in deduplication fields suggestions
    # todo: also use it on correlation rules suggestions
    if KEEP_ALERT_FIELDS_ENABLED:
        with tracer.start_as_current_span("process_event_bulk_upsert_alert_fields"):
            for enriched_formatted_event in enriched_formatted_events:
                logger.debug(
                    "Bulk upserting alert fields",
                    extra={
                        "alert_event_id": enriched_formatted_event.event_id,
                        "alert_fingerprint": enriched_formatted_event.fingerprint,
                    },
                )
                # Top-level keys are recorded as-is; dict values are expanded
                # one level into "key.nested_key".
                fields = []
                for key, value in enriched_formatted_event.dict().items():
                    if isinstance(value, dict):
                        for nested_key in value.keys():
                            fields.append(f"{key}.{nested_key}")
                    else:
                        fields.append(key)

                bulk_upsert_alert_fields(
                    tenant_id=tenant_id,
                    fields=fields,
                    provider_id=enriched_formatted_event.providerId,
                    provider_type=enriched_formatted_event.providerType,
                    session=session,
                )

                logger.debug(
                    "Bulk upserted alert fields",
                    extra={
                        "alert_event_id": enriched_formatted_event.event_id,
                        "alert_fingerprint": enriched_formatted_event.fingerprint,
                    },
                )

    # after the alert enriched and mapped, lets send it to the elasticsearch
    with tracer.start_as_current_span("process_event_push_to_elasticsearch"):
        elastic_client = ElasticClient(tenant_id=tenant_id)
        if elastic_client.enabled:
            for alert in enriched_formatted_events:
                try:
                    logger.debug(
                        "Pushing alert to elasticsearch",
                        extra={
                            "alert_event_id": alert.event_id,
                            "alert_fingerprint": alert.fingerprint,
                        },
                    )
                    elastic_client.index_alert(
                        alert=alert,
                    )
                except Exception:
                    logger.exception(
                        "Failed to push alerts to elasticsearch",
                        extra={
                            "provider_type": provider_type,
                            "num_of_alerts": len(formatted_events),
                            "provider_id": provider_id,
                            "tenant_id": tenant_id,
                        },
                    )
                    continue

    # With the "recover_previous_status" strategy, alerts in MAINTENANCE status
    # are held back from workflows and the rules engine and re-added afterwards.
    if MAINTENANCE_WINDOW_ALERT_STRATEGY == "recover_previous_status":
        ignored_events = [
            event
            for event in enriched_formatted_events
            if event.status == AlertStatus.MAINTENANCE.value
        ]
        enriched_formatted_events = [
            event
            for event in enriched_formatted_events
            if event.status != AlertStatus.MAINTENANCE.value
        ]

    with tracer.start_as_current_span("process_event_push_to_workflows"):
        try:
            # Now run any workflow that should run based on this alert
            # TODO: this should publish event
            workflow_manager = WorkflowManager.get_instance()
            # insert the events to the workflow manager process queue
            logger.info("Adding events to the workflow manager queue")
            workflow_manager.insert_events(tenant_id, enriched_formatted_events)
            logger.info("Added events to the workflow manager queue")
        except Exception:
            logger.exception(
                "Failed to run workflows based on alerts",
                extra={
                    "provider_type": provider_type,
                    "num_of_alerts": len(formatted_events),
                    "provider_id": provider_id,
                    "tenant_id": tenant_id,
                },
            )

    incidents = []
    with tracer.start_as_current_span("process_event_run_rules_engine"):
        # Now we need to run the rules engine
        if KEEP_CORRELATION_ENABLED:
            try:
                rules_engine = RulesEngine(tenant_id=tenant_id)
                # handle incidents, also handle workflow execution as
                incidents: List[IncidentDto] = rules_engine.run_rules(
                    enriched_formatted_events, session=session
                )
            except Exception:
                logger.exception(
                    "Failed to run rules engine",
                    extra={
                        "provider_type": provider_type,
                        "num_of_alerts": len(formatted_events),
                        "provider_id": provider_id,
                        "tenant_id": tenant_id,
                    },
                )

    if MAINTENANCE_WINDOW_ALERT_STRATEGY == "recover_previous_status":
        enriched_formatted_events.extend(ignored_events)

    with tracer.start_as_current_span("process_event_notify_client"):
        pusher_client = get_pusher_client() if notify_client else None
        if not pusher_client:
            # Nothing to notify, but the alerts were still saved: return them
            # so the result does not depend on whether pusher is configured.
            # A bare `return` here handed None back to process_event.
            return enriched_formatted_events
        # Get the notification cache
        pusher_cache = get_notification_cache()

        # Tell the client to poll alerts
        if pusher_cache.should_notify(tenant_id, "poll-alerts"):
            try:
                pusher_client.trigger(
                    f"private-{tenant_id}",
                    "poll-alerts",
                    "{}",
                )
                logger.info("Told client to poll alerts")
            except Exception:
                logger.exception("Failed to tell client to poll alerts")

        if incidents and pusher_cache.should_notify(tenant_id, "incident-change"):
            try:
                pusher_client.trigger(
                    f"private-{tenant_id}",
                    "incident-change",
                    {},
                )
            except Exception:
                logger.exception("Failed to tell the client to pull incidents")

        # Now we need to update the presets
        # send with pusher
        try:
            presets = get_all_presets_dtos(tenant_id)
            rules_engine = RulesEngine(tenant_id=tenant_id)
            presets_do_update = []
            for preset_dto in presets:
                # filter the alerts based on the search query
                filtered_alerts = rules_engine.filter_alerts(
                    enriched_formatted_events, preset_dto.cel_query
                )
                # if not related alerts, no need to update
                if not filtered_alerts:
                    continue
                presets_do_update.append(preset_dto)
            if pusher_cache.should_notify(tenant_id, "poll-presets"):
                try:
                    pusher_client.trigger(
                        f"private-{tenant_id}",
                        "poll-presets",
                        json.dumps(
                            [p.name.lower() for p in presets_do_update], default=str
                        ),
                    )
                except Exception:
                    logger.exception("Failed to send presets via pusher")
        except Exception:
            logger.exception(
                "Failed to send presets via pusher",
                extra={
                    "provider_type": provider_type,
                    "num_of_alerts": len(formatted_events),
                    "provider_id": provider_id,
                    "tenant_id": tenant_id,
                },
            )
    return enriched_formatted_events
"raw_event": ( event if KEEP_STORE_RAW_ALERTS else None ), # Let's log the events if we store it for debugging } logger.info("Processing event", extra=extra_dict) tracer = trace.get_tracer(__name__) raw_event = copy.deepcopy(event) events_in_counter.inc() try: with tracer.start_as_current_span("process_event_get_db_session"): # Create a session to be used across the processing task session = get_session_sync() # Pre alert formatting extraction rules with tracer.start_as_current_span("process_event_pre_alert_formatting"): enrichments_bl = EnrichmentsBl(tenant_id, session) try: event = enrichments_bl.run_extraction_rules(event, pre=True) except Exception: logger.exception("Failed to run pre-formatting extraction rules") with tracer.start_as_current_span("process_event_provider_formatting"): if ( provider_type is not None and isinstance(event, dict) or isinstance(event, FormData) or isinstance(event, list) ): try: provider_class = ProvidersFactory.get_provider_class(provider_type) except Exception: provider_class = ProvidersFactory.get_provider_class("keep") if isinstance(event, list): event_list = [] for event_item in event: if not isinstance(event_item, AlertDto): event_list.append( provider_class.format_alert( tenant_id=tenant_id, event=event_item, provider_id=provider_id, provider_type=provider_type, ) ) else: event_list.append(event_item) event = event_list else: event = provider_class.format_alert( tenant_id=tenant_id, event=event, provider_id=provider_id, provider_type=provider_type, ) # SHAHAR: for aws cloudwatch, we get a subscription notification message that we should skip # todo: move it to be generic if event is None and provider_type == "cloudwatch": logger.info( "This is a subscription notification message from AWS - skipping processing", extra=extra_dict, ) return elif event is None: logger.info( "Provider returned None (failed silently), skipping processing", extra=extra_dict, ) if event: if isinstance(event, str): extra_dict["raw_event"] = event 
def __save_error_alerts(
    tenant_id,
    provider_type,
    raw_events: dict | list[dict] | list[AlertDto] | AlertDto | None,
    error_message: str,
):
    """
    Persist the raw payload(s) of a failed processing attempt as error rows.

    Best-effort: any failure is logged and swallowed so that error reporting
    never raises into the caller.

    Args:
        tenant_id: tenant the events belong to.
        provider_type: provider type stored on each ``AlertRaw`` row.
        raw_events: a single raw event / ``AlertDto`` or a list of them;
            nothing is saved when this is empty or ``None``.
        error_message: message stored on each ``AlertRaw`` row.
    """
    if not raw_events:
        logger.info("No raw events to save as errors")
        return

    # Bound before the try so the finally clause can tell whether a session
    # was actually obtained; otherwise a failing get_session_sync() would
    # raise UnboundLocalError from `finally`.
    session = None
    try:
        logger.info(
            "Getting database session",
            extra={
                "tenant_id": tenant_id,
            },
        )
        session = get_session_sync()

        # Convert to list if single dict
        if not isinstance(raw_events, list):
            logger.info("Converting single dict or AlertDto to list")
            raw_events = [raw_events]

        logger.info(f"Saving {len(raw_events)} error alerts")

        if len(raw_events) > 5:
            logger.info(
                "Raw Alert Payload",
                extra={
                    "tenant_id": tenant_id,
                    "raw_events": raw_events,
                },
            )
        for raw_event in raw_events:
            # Convert AlertDto to dict if needed
            if isinstance(raw_event, AlertDto):
                logger.info("Converting AlertDto to dict")
                raw_event = raw_event.dict()

            logger.debug(
                "Creating AlertRaw object",
                extra={
                    "tenant_id": tenant_id,
                    "raw_event": raw_event,
                },
            )
            alert = AlertRaw(
                tenant_id=tenant_id,
                raw_alert=raw_event,
                provider_type=provider_type,
                error=True,
                error_message=error_message,
            )
            session.add(alert)
            logger.info("AlertRaw object created")
        session.commit()
        logger.info("Successfully saved error alerts")
    except Exception:
        logger.exception("Failed to save error alerts")
    finally:
        if session is not None:
            session.close()
provider_id, "provider_type": provider_type, "trace_id": trace_id, } with Session(engine) as session: if ctx and isinstance(ctx, dict): extra["job_try"] = ctx.get("job_try", 0) extra["job_id"] = ctx.get("job_id", None) if isinstance(incidents, IncidentDto): incidents = [incidents] logger.info(f"Processing {len(incidents)} incidents", extra=extra) if logger.getEffectiveLevel() == logging.DEBUG: # Lets log the incidents in debug mode extra["incident"] = [i.dict() for i in incidents] incident_bl = IncidentBl(tenant_id, session) try: for incident in incidents: logger.info( f"Processing incident: {incident.id}", extra={**extra, "fingerprint": incident.fingerprint}, ) incident_from_db = get_incident_by_id( tenant_id=tenant_id, incident_id=incident.id, session=session ) # Try to get by fingerprint if no incident was found by id if incident_from_db is None and incident.fingerprint: incident_from_db = get_incident_by_fingerprint( tenant_id=tenant_id, fingerprint=incident.fingerprint, session=session, ) if incident_from_db: logger.info( f"Updating incident: {incident.id}", extra={**extra, "fingerprint": incident.fingerprint}, ) incident_from_db = incident_bl.update_incident( incident_id=incident_from_db.id, updated_incident_dto=incident, generated_by_ai=False, ) logger.info( f"Updated incident: {incident.id}", extra={**extra, "fingerprint": incident.fingerprint}, ) else: logger.info( f"Creating incident: {incident.id}", extra={**extra, "fingerprint": incident.fingerprint}, ) incident_from_db = incident_bl.create_incident( incident_dto=incident, ) logger.info( f"Created incident: {incident.id}", extra={**extra, "fingerprint": incident.fingerprint}, ) try: if incident.alerts: logger.info("Adding incident alerts", extra=extra) processed_alerts = process_event( {}, tenant_id, provider_type, provider_id, None, None, trace_id, incident.alerts, ) if processed_alerts: incident_bl.sync_add_alerts_to_incident( incident_from_db.id, [ processed_alert.fingerprint for processed_alert in 
def process_topology(
    tenant_id: str,
    topology_data: list[TopologyServiceInDto],
    provider_id: str,
    provider_type: str,
):
    """
    Replace the stored topology of one provider with ``topology_data``.

    The existing services and dependencies of the provider are deleted and
    committed first; then the new services, their dependencies and the
    applications they belong to are created. Finally the client is notified
    through pusher (best-effort).

    Args:
        tenant_id: tenant owning the topology.
        topology_data: services pulled from the provider; nothing happens
            when this is empty.
        provider_id: id of the provider the topology came from.
        provider_type: provider type, only sent in the pusher notification.

    Raises:
        Exception: whatever the DB layer raises; the session is closed first.
    """
    extra = {"provider_id": provider_id, "tenant_id": tenant_id}
    if not topology_data:
        logger.info(
            "No topology data to process",
            extra=extra,
        )
        return

    logger.info("Processing topology data", extra=extra)
    session = get_session_sync()

    # Everything that uses the session runs inside try/finally so the session
    # is closed on failure as well as on success.
    try:
        try:
            logger.info(
                "Deleting existing topology data",
                extra=extra,
            )

            # delete dependencies
            session.query(TopologyServiceDependency).filter(
                TopologyServiceDependency.service.has(
                    and_(
                        TopologyService.source_provider_id == provider_id,
                        TopologyService.tenant_id == tenant_id,
                    )
                )
            ).delete(synchronize_session=False)

            # delete services
            session.query(TopologyService).filter(
                TopologyService.source_provider_id == provider_id,
                TopologyService.tenant_id == tenant_id,
            ).delete()

            session.commit()
            logger.info(
                "Deleted existing topology data",
                extra=extra,
            )
        except Exception:
            logger.exception(
                "Failed to delete existing topology data",
                extra=extra,
            )
            raise

        logger.info(
            "Creating new topology data",
            extra=extra,
        )

        service_to_keep_service_id_map = {}

        # First create the services so we have ids
        for service in topology_data:
            service_copy = copy.deepcopy(service.dict())
            service_copy.pop("dependencies")
            db_service = TopologyService(**service_copy, tenant_id=tenant_id)
            session.add(db_service)
            # flush so the DB assigns db_service.id before it is recorded
            session.flush()
            service_to_keep_service_id_map[service.service] = db_service.id

        application_to_services = {}
        application_to_name = {}

        # Then create the dependencies
        for service in topology_data:
            service_id = service_to_keep_service_id_map.get(service.service)

            # Group all services by application (this is for processing application related data in the next step)
            if service.application_relations is not None:
                for application_id in service.application_relations:
                    application_to_name[application_id] = (
                        service.application_relations[application_id]
                    )
                    application_to_services.setdefault(application_id, []).append(
                        service_id
                    )

            for dependency in service.dependencies:
                depends_on_service_id = service_to_keep_service_id_map.get(dependency)
                if not service_id or not depends_on_service_id:
                    logger.debug(
                        "Found a dangling service, skipping",
                        extra={"service": service.service, "dependency": dependency},
                    )
                    continue
                session.add(
                    TopologyServiceDependency(
                        service_id=service_id,
                        depends_on_service_id=depends_on_service_id,
                        protocol=service.dependencies.get(dependency, "unknown"),
                    )
                )

        session.commit()

        # Now create or update the application
        for application_id in application_to_services:
            TopologiesService.create_or_update_application(
                tenant_id=tenant_id,
                application=TopologyApplicationDtoIn(
                    id=application_id,
                    name=application_to_name[application_id],
                    services=[
                        TopologyServiceDtoIn(id=service_id)
                        for service_id in application_to_services[application_id]
                    ],
                ),
                session=session,
            )
    finally:
        try:
            session.close()
        except Exception as e:
            logger.warning(
                "Failed to close session",
                extra={**extra, "error": str(e)},
            )

    try:
        pusher_client = get_pusher_client()
        if pusher_client:
            pusher_client.trigger(
                f"private-{tenant_id}",
                "topology-update",
                {"providerId": provider_id, "providerType": provider_type},
            )
    except Exception:
        logger.exception("Failed to push topology update to the client")

    logger.info(
        "Created new topology data",
        extra=extra,
    )
def sanitize_alert(alert_raw: dict) -> dict:
    """
    Recursively sanitize alert data by removing null characters.

    Null characters (``\\x00``) are stripped from every string found in the
    structure: string values, string dictionary keys, and the items of nested
    dicts, lists and tuples. Other values are returned unchanged. The input
    is not modified; a sanitized copy of the containers is returned.

    If stripping makes two keys of the same dict equal, the later one wins.

    Args:
        alert_raw (dict): The raw alert data

    Returns:
        dict: The sanitized copy, or None when ``alert_raw`` is None.

    Raises:
        ValueError: If ``alert_raw`` is neither None nor a dictionary.
    """
    if alert_raw is None:
        return None

    if not isinstance(alert_raw, dict):
        raise ValueError("Input must be a dictionary")

    def sanitize(value):
        if isinstance(value, str):
            return value.replace("\x00", "")
        if isinstance(value, dict):
            # keys can carry null characters too, so clean them as well
            return {
                (sanitize(k) if isinstance(k, str) else k): sanitize(v)
                for k, v in value.items()
            }
        if isinstance(value, list):
            return [sanitize(item) for item in value]
        if isinstance(value, tuple):
            return tuple(sanitize(item) for item in value)
        return value

    return sanitize(alert_raw)
def javascript_iso_format(last_received: str) -> str:
    """
    Format an ISO-8601 timestamp with millisecond precision; a UTC offset
    of ``+00:00`` is rendered as ``Z``.

    https://stackoverflow.com/a/63894149/12012756

    Args:
        last_received: ISO-8601 timestamp string.

    Returns:
        The timestamp truncated to milliseconds, with ``+00:00`` written as ``Z``.

    Raises:
        ValueError: If ``last_received`` is not a valid ISO-8601 string.
    """
    # `datetime.fromisoformat` only accepts a trailing "Z" from Python 3.11 on.
    # Normalising it keeps the function able to re-parse its own output on
    # older interpreters.
    if last_received.endswith("Z"):
        last_received = last_received[:-1] + "+00:00"
    dt = datetime.fromisoformat(last_received)
    return dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")
def _previous_alert_or_none(previous_alert):
    """Return the single previous alert (first item when given a list), or None when there is none."""
    if not previous_alert:
        return None
    if isinstance(previous_alert, list):
        return previous_alert[0]
    return previous_alert


def calculated_start_firing_time(
    alert: AlertDto, previous_alert: AlertDto | list[AlertDto]
) -> str | None:
    """
    Calculate the start firing time of an alert based on the previous alert.

    Args:
        alert (AlertDto): The alert to calculate the start firing time for.
        previous_alert (AlertDto | list[AlertDto]): The previous alert (a list
            is reduced to its first item); empty/None means no history.

    Returns:
        str | None: None when the alert is not firing; the previous alert's
        ``firingStartTime`` when it was firing too and has one; otherwise the
        alert's own ``lastReceived``.
    """
    # if the alert is not firing, there is no start firing time
    if alert.status != AlertStatus.FIRING.value:
        return None

    previous = _previous_alert_or_none(previous_alert)
    # Carry the start time over only when the previous alert was firing AND
    # has one recorded. Without the second check, a previous alert with no
    # start time would pass None on to every following firing alert.
    if (
        previous is not None
        and previous.status == AlertStatus.FIRING.value
        and previous.firingStartTime
    ):
        return previous.firingStartTime
    # first alert, or firing (re)started with this alert
    return alert.lastReceived


def calculate_firing_time_since_last_resolved(
    alert: AlertDto, previous_alert: AlertDto | list[AlertDto]
) -> str | None:
    """
    Calculate since when an alert has been firing without being resolved.

    Args:
        alert (AlertDto): The alert to calculate the value for.
        previous_alert (AlertDto | list[AlertDto]): The previous alert (a list
            is reduced to its first item); empty/None means no history.

    Returns:
        str | None: None when the alert is resolved. ``alert.lastReceived``
        when a firing alert follows a resolved one. Otherwise the previous
        alert's ``firingStartTimeSinceLastResolved`` when it has one. With no
        usable history: ``alert.lastReceived`` for a firing alert, else None.
    """
    # if the alert is resolved, there is no firing time.
    if alert.status == AlertStatus.RESOLVED.value:
        return None

    is_firing = alert.status == AlertStatus.FIRING.value
    previous = _previous_alert_or_none(previous_alert)
    if previous is not None:
        # firing again right after a resolve: the clock restarts now
        if previous.status == AlertStatus.RESOLVED.value and is_firing:
            return alert.lastReceived
        # if the previous alert has firing time since last resolved, we need to return it
        if previous.firingStartTimeSinceLastResolved:
            return previous.firingStartTimeSinceLastResolved

    # No previous alert, or it carries no value: same rule as a first alert.
    # The previous-without-value case used to fall through to an implicit None.
    return alert.lastReceived if is_firing else None


def calculated_firing_counter(
    alert: AlertDto, previous_alert: AlertDto | list[AlertDto]
) -> int:
    """
    Calculate the firing counter of an alert based on the previous alert.

    Args:
        alert (AlertDto): The alert to calculate the firing counter for.
        previous_alert (AlertDto | list[AlertDto]): The previous alert (a list
            is reduced to its first item); empty/None means no history.

    Returns:
        int: 0 for an acknowledged alert; 1 when there is no previous alert or
        the previous one was acknowledged; otherwise the previous counter + 1.
    """
    # if its an acknowledged alert, the firing counter is 0
    if alert.status == AlertStatus.ACKNOWLEDGED.value:
        return 0

    previous = _previous_alert_or_none(previous_alert)
    # first alert, or counting restarts after an acknowledge
    if previous is None or previous.status == AlertStatus.ACKNOWLEDGED.value:
        return 1

    # NOTE: firingCounter -> 0 only if acknowledged
    # a missing counter on the previous alert counts as 0
    return (previous.firingCounter or 0) + 1


def calculated_unresolved_counter(
    alert: AlertDto, previous_alert: AlertDto | list[AlertDto]
) -> int:
    """
    Calculate the unresolved counter of an alert based on the previous alert.

    Args:
        alert (AlertDto): The alert to calculate the unresolved counter for.
        previous_alert (AlertDto | list[AlertDto]): The previous alert (a list
            is reduced to its first item); empty/None means no history.

    Returns:
        int: 0 for a resolved alert; 1 when there is no previous alert or the
        previous one was resolved; otherwise the previous counter + 1.
    """
    # if it's a resolved alert, the unresolved counter is 0
    if alert.status == AlertStatus.RESOLVED.value:
        return 0

    previous = _previous_alert_or_none(previous_alert)
    # first alert, or counting restarts after a resolve
    if previous is None or previous.status == AlertStatus.RESOLVED.value:
        return 1

    # NOTE: unresolvedCounter -> 0 only if resolved
    # a missing counter on the previous alert counts as 0
    return (previous.unresolvedCounter or 0) + 1
def is_ee_enabled_for_tenant(tenant_id: str, tenant_configuration=None) -> bool:
    """
    Tell whether the enterprise edition is switched on for a tenant.

    Args:
        tenant_id (str): The tenant to look up.
        tenant_configuration: Object exposing ``get_configuration``; a new
            ``TenantConfiguration`` is created when omitted.

    Returns:
        bool: False when ``EE_ENABLED`` is off or the tenant has no
        ``"ee_enabled"`` setting, otherwise the truthiness of that setting.
    """
    # The module-level switch wins over any per-tenant setting.
    if not EE_ENABLED:
        return False

    configuration = (
        tenant_configuration
        if tenant_configuration is not None
        else TenantConfiguration()
    )
    ee_flag = configuration.get_configuration(tenant_id, "ee_enabled")
    return ee_flag is not None and bool(ee_flag)
================================================ from typing import Any from pydantic import BaseModel from keep.api.models.alert import AlertDto, AlertWithIncidentLinkMetadataDto from keep.api.models.db.enrichment_event import EnrichmentEvent from keep.api.models.db.workflow import * # pylint: disable=unused-wildcard-importfrom typing import Optional from keep.api.models.incident import IncidentDto from keep.api.models.workflow import WorkflowDTO, WorkflowExecutionDTO class PaginatedResultsDto(BaseModel): limit: int = 25 offset: int = 0 count: int items: list[Any] class IncidentsPaginatedResultsDto(PaginatedResultsDto): items: list[IncidentDto] class AlertPaginatedResultsDto(PaginatedResultsDto): items: list[AlertDto] class EnrichmentEventPaginatedResultsDto(PaginatedResultsDto): items: list[EnrichmentEvent] class AlertWithIncidentLinkMetadataPaginatedResultsDto(PaginatedResultsDto): items: list[AlertWithIncidentLinkMetadataDto] class WorkflowExecutionsPaginatedResultsDto(PaginatedResultsDto): items: list[WorkflowExecutionDTO] passCount: int = 0 avgDuration: float = 0.0 workflow: Optional[WorkflowDTO] = None failCount: int = 0 ================================================ FILE: keep/api/utils/pluralize.py ================================================ # Maybe to use 'pluralize' from 'inflect' library in the future def pluralize(count: int, singular: str, plural: str | None = None, include_count: bool = True) -> str: """ Returns a string with the correct plural or singular form based on count. Args: count: The number of items singular: The singular form of the word plural: The plural form of the word. 
If None, appends 's' to singular form include_count: Whether to include the count in the returned string Examples: >>> pluralize(1, "incident") "1 incident" >>> pluralize(2, "incident") "2 incidents" >>> pluralize(2, "category", "categories") "2 categories" >>> pluralize(1, "incident", include_count=False) "incident" """ if plural is None: plural = singular + 's' word = plural if count != 1 else singular return f"{count} {word}" if include_count else word ================================================ FILE: keep/api/utils/tenant_utils.py ================================================ import hashlib import logging from typing import Optional from uuid import uuid4 from sqlmodel import Session, select from sqlalchemy.exc import IntegrityError as SqlalchemyIntegrityError from google.api_core.exceptions import InvalidArgument as GoogleAPIInvalidArgument from keep.api.core.config import config from keep.api.models.db.tenant import TenantApiKey from keep.contextmanager.contextmanager import ContextManager from keep.identitymanager.rbac import Admin as AdminRole from keep.identitymanager.rbac import Role from keep.identitymanager.rbac import Webhook as WebhookRole from keep.secretmanager.secretmanagerfactory import SecretManagerFactory logger = logging.getLogger(__name__) class APIKeyException(Exception): pass def get_api_key( session: Session, unique_api_key_id: str, tenant_id: str ) -> TenantApiKey: """ Retrieves API key. Args: session (Session): _description_ tenant_id (str): _description_ unique_api_key_id (str): _description_ Returns: str: _description_ """ # Find API key statement = ( select(TenantApiKey) .where(TenantApiKey.reference_id == unique_api_key_id) .where(TenantApiKey.tenant_id == tenant_id) ) api_key = session.exec(statement).first() return api_key def update_api_key_internal( session: Session, tenant_id: str, unique_api_key_id: str, ) -> str: """ Updates API key secret for the given tenant. 
def create_api_key(
    session: Session,
    tenant_id: str,
    unique_api_key_id: str,
    is_system: bool,
    created_by: str,
    role: str,
    commit: bool = True,
    system_description: Optional[str] = None,
) -> str:
    """
    Creates an API key for the given tenant.

    A random UUID4 is used as the key. Its plain value is written to the
    secret manager under ``{tenant_id}-{unique_api_key_id}`` and its SHA-256
    hex digest is stored in a new ``TenantApiKey`` row.

    Args:
        session (Session): Database session the new row is added to.
        tenant_id (str): Tenant that owns the key.
        unique_api_key_id (str): Reference id of the key within the tenant.
        is_system (bool): Stored on the row as ``is_system``.
        created_by (str): Stored on the row as ``created_by``.
        role (str): Stored on the row as ``role``.
        commit (bool, optional): Commit the session after adding the row.
            Defaults to True.
        system_description (Optional[str], optional): Stored on the row as
            ``system_description``. Defaults to None.

    Returns:
        str: The plain-text API key.

    Raises:
        APIKeyException: If the key already exists, the secret manager
            rejects the secret, or any other error occurs while creating it.
    """
    log_extra = {"tenant_id": tenant_id, "unique_api_key_id": unique_api_key_id}
    logger.info("Creating API key", extra=log_extra)
    api_key = str(uuid4())
    hashed_api_key = hashlib.sha256(api_key.encode("utf-8")).hexdigest()

    # Save the api key in the secret manager
    context_manager = ContextManager(tenant_id=tenant_id)
    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    try:
        secret_manager.write_secret(
            secret_name=f"{tenant_id}-{unique_api_key_id}",
            secret_value=api_key,
        )

        # Save the api key in the database
        new_installation_api_key = TenantApiKey(
            tenant_id=tenant_id,
            reference_id=unique_api_key_id,
            key_hash=hashed_api_key,
            is_system=is_system,
            system_description=system_description,
            created_by=created_by,
            role=role,
        )
        session.add(new_installation_api_key)

        if commit:
            session.commit()
        logger.info("Created API key", extra=log_extra)
        return api_key
    except SqlalchemyIntegrityError as e:
        logger.warning(f"API key already exists: {e}", extra=log_extra)
        raise APIKeyException("API key already exists.") from e
    except GoogleAPIInvalidArgument as e:
        if "does not match the expected format" in str(e):
            raise APIKeyException(str(e)) from e
        # Any other InvalidArgument used to fall through here and make the
        # function return None; report it like every other failure instead.
        logger.error("Error creating API key: " + str(e), extra=log_extra)
        raise APIKeyException("Error creating API key.") from e
    except Exception as e:
        logger.error("Error creating API key: " + str(e), extra=log_extra)
        raise APIKeyException("Error creating API key.") from e
def get_api_keys_secret(
    tenant_id,
    api_keys,
):
    """
    Pair API key records with the secrets read from the tenant's secret manager.

    Keys whose ``reference_id`` is ``"webhook"`` are skipped. Deleted keys are
    reported with a placeholder instead of a secret. A key whose secret equals
    the ``KEEP_READ_ONLY_BYPASS_KEY`` setting is left out, and a key that fails
    while being read or described is logged and left out.

    Args:
        tenant_id: Tenant used to build the secret manager context.
        api_keys: Iterable of API key records.

    Returns:
        list[dict]: One dictionary per reported key.
    """

    def _common_fields(key):
        # Fields shared by the deleted and the active representation.
        return {
            "reference_id": key.reference_id,
            "tenant": key.tenant,
            "is_deleted": key.is_deleted,
            "created_at": key.created_at,
            "created_by": key.created_by,
            "last_used": key.last_used,
        }

    secret_manager = SecretManagerFactory.get_secret_manager(
        ContextManager(tenant_id=tenant_id)
    )

    described_keys = []
    for api_key in api_keys:
        if api_key.reference_id == "webhook":
            continue

        if api_key.is_deleted == True:  # noqa: E712
            entry = _common_fields(api_key)
            entry["role"] = api_key.role
            entry["secret"] = "Key has been deactivated"
            described_keys.append(entry)
            continue

        try:
            secret = secret_manager.read_secret(
                f"{api_key.tenant_id}-{api_key.reference_id}"
            )
            bypass_key = config("KEEP_READ_ONLY_BYPASS_KEY", default="")
            if bypass_key and bypass_key == secret:
                # Do not return the bypass key if set.
                continue

            entry = _common_fields(api_key)
            entry["secret"] = secret
            entry["role"] = api_key.role
            described_keys.append(entry)
        except Exception as e:
            logger.error(
                "Error reading secret",
                extra={"error": str(e)},
            )
            continue

    return described_keys
def get_time_stamp_filter(
    time_stamp: Optional[str] = Query(None)
) -> TimeStampFilter:
    """
    Build a ``TimeStampFilter`` from the optional ``time_stamp`` query parameter.

    Args:
        time_stamp (Optional[str]): JSON-encoded object holding the filter
            fields. When missing or empty, a default filter is returned.

    Returns:
        TimeStampFilter: The parsed filter, or ``TimeStampFilter()`` when no
        value was supplied.

    Raises:
        HTTPException: 400 when the value is not valid JSON, is not a JSON
            object, or does not validate as a ``TimeStampFilter``.
    """
    if not time_stamp:
        return TimeStampFilter()

    try:
        # Parse the JSON string
        time_stamp_dict = json.loads(time_stamp)
        # Return the TimeStampFilter object, Pydantic will map 'from' -> lower_timestamp and 'to' -> upper_timestamp
        return TimeStampFilter(**time_stamp_dict)
    except (ValueError, TypeError) as exc:
        # ValueError covers json.JSONDecodeError and pydantic's ValidationError
        # (both subclass it); TypeError covers JSON that is not an object, where
        # the ** unpacking fails. All of these are client errors, not 500s.
        raise HTTPException(
            status_code=400, detail="Invalid time_stamp format"
        ) from exc
def make_keep_request(method, url, **kwargs):
    """
    Send an HTTP request to the Keep API on behalf of the CLI.

    Extra keyword arguments are forwarded to ``requests.request``. When the
    ``KEEP_CLI_IGNORE_SSL`` environment variable is ``true`` (case-insensitive),
    ``verify=False`` is forced. A 401 response or any
    ``requests.exceptions.RequestException`` prints a red error message and
    terminates the process via ``sys.exit`` (401 and 1 respectively).

    Returns:
        The response object for any non-401 response.
    """
    ignore_ssl = os.environ.get("KEEP_CLI_IGNORE_SSL", "false").lower()
    if ignore_ssl == "true":
        kwargs["verify"] = False

    try:
        response = requests.request(method, url, **kwargs)
    except requests.exceptions.RequestException as e:
        click.echo(click.style(f"Request failed: {e}", fg="red", bold=True))
        sys.exit(1)

    if response.status_code != 401:
        return response

    click.echo(
        click.style(
            "Authentication failed. Please check your API key.",
            fg="red",
            bold=True,
        )
    )
    sys.exit(401)
# Change the options to below to suit the actual options for your task (or
# tasks).
@click.group()
@click.option("--verbose", "-v", count=True, help="Enable verbose output.")
@click.option("--json", "-j", default=False, is_flag=True, help="Enable json output.")
@click.option(
    "--keep-config",
    "-c",
    help=f"The path to the keep config file (default {get_default_conf_file_path()})",
    required=False,
    default=get_default_conf_file_path(),
)
@pass_info
@click.pass_context
def cli(ctx, info: Info, verbose: int, json: bool, keep_config: str):
    """Run Keep CLI."""
    # Root command group: configures logging, loads the config file into the
    # shared Info object, reports CLI start to PostHog when a client is
    # available, and registers a flush of that client for when the context closes.

    # Configure logging first so that the debug messages emitted while loading
    # the config file are visible when -v is passed.
    # Use the verbosity count to determine the logging level...
    if verbose > 0:
        # set the verbosity level to debug
        logging_config["loggers"][""]["level"] = "DEBUG"

    if json:
        logging_config["handlers"]["default"]["formatter"] = "json"
    logging.config.dictConfig(logging_config)
    info.verbose = verbose
    info.json = json

    # https://posthog.com/tutorials/identifying-users-guide#identifying-and-setting-user-ids-for-every-other-library
    # random user id
    info.set_config(keep_config)
    if posthog_client is not None:
        posthog_client.capture(
            info.random_user_id,
            "keep-cli-started",
            properties={
                "args": sys.argv,
                "keep_version": KEEP_VERSION,
            },
        )

    @ctx.call_on_close
    def cleanup():
        # Make sure queued analytics events are sent before the process exits.
        if posthog_client is not None:
            posthog_client.flush()
@cli.command()
@click.option("--multi-tenant", is_flag=True, help="Enable multi-tenant mode")
@click.option(
    "--port",
    "-p",
    type=int,
    default=int(os.environ.get("PORT", 8080)),
    help="The port to run the API on",
)
@click.option(
    "--host",
    "-h",
    type=str,
    default=os.environ.get("HOST", "0.0.0.0"),
    help="The host to run the API on",
)
def api(multi_tenant: bool, port: int, host: str):
    """Start the API."""
    # Imported lazily, and under a different local name so it does not shadow
    # this command function inside its own body.
    from keep.api import api as keep_api

    ctx = click.get_current_context()
    cli_params = ctx.params
    keep_api.PORT = cli_params.get("port")
    keep_api.HOST = cli_params.get("host")

    auth_type = "MULTI_TENANT" if multi_tenant else "NO_AUTH"
    app = keep_api.get_app(auth_type=auth_type)
    logger.info(
        f"App initialized, multi tenancy flag from user [overriden by AUTH_TYPE env var]: {multi_tenant}"
    )
    # Register the captured click context as an override keyed by click.get_current_context.
    app.dependency_overrides[click.get_current_context] = lambda: ctx
    keep_api.run(app)
def get_workflows(info: Info):
    """
    Get all workflows.

    Args:
        info (Info): CLI state providing ``keep_api_url`` and ``api_key``.

    Returns:
        The decoded JSON body of ``GET /workflows``.

    Raises:
        Exception: If the API answers with a non-success status. Without this
            check the error payload was returned as if it were the workflow
            list, and callers iterating over it failed in confusing ways.
    """
    resp = make_keep_request(
        "GET",
        info.keep_api_url + "/workflows",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
    )
    if not resp.ok:
        raise Exception(f"Error getting workflows: {resp.text}")
    return resp.json()
@workflow.command(name="run")
@click.option(
    "--workflow-id",
    type=str,
    help="The ID (UUID or name) of the workflow to run",
    required=True,
)
@click.option(
    "--fingerprint",
    type=str,
    help="The fingerprint to query the payload",
    required=True,
)
@pass_info
def run_workflow(info: Info, workflow_id: str, fingerprint: str):
    """Run a workflow with a specified ID and fingerprint."""
    # Step 1: look up the alert that will be used as the workflow payload.
    alert_response = _get_alert_by_fingerprint(
        info.keep_api_url, info.api_key, fingerprint
    )
    if not alert_response.ok:
        click.echo(click.style("Error: Failed to fetch alert payload", bold=True))
        return
    alert_payload = alert_response.json()

    # Step 2: trigger the run, sending the alert as the request body.
    run_response = make_keep_request(
        "POST",
        info.keep_api_url + f"/workflows/{workflow_id}/run",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
        json=alert_payload,
    )

    # Step 3: report the outcome.
    if not run_response.ok:
        click.echo(
            click.style(
                f"Error running workflow {workflow_id}: {run_response.text}",
                bold=True,
            )
        )
        return

    run_result = run_response.json()
    click.echo(click.style(f"Workflow {workflow_id} run successfully", bold=True))
    click.echo(
        click.style(
            f"Workflow Run ID {run_result.get('workflow_execution_id')}", bold=True
        )
    )
@mappings.command(name="create")
@click.option(
    "--name",
    "-n",
    type=str,
    help="The name of the mapping.",
    required=True,
)
@click.option(
    "--description",
    "-d",
    type=str,
    help="The description of the mapping.",
    required=False,
    default="",
)
@click.option(
    "--file",
    "-f",
    type=click.Path(exists=True),
    help="The mapping file. Must be a CSV file.",
    required=True,
)
@click.option(
    "--matchers",
    "-m",
    type=str,
    help="The matchers of the mapping, as a comma-separated list of strings.",
    required=True,
)
@click.option(
    "--priority",
    "-p",
    type=click.IntRange(0, 100),
    help="The priority of the mapping, higher priority means this rule will execute first.",
    required=False,
    default=0,
)
@pass_info
def create(
    info: Info, name: str, description: str, file: str, matchers: str, priority: int
):
    """Create a mapping rule."""
    # Reads the CSV file (first row = column names), turns every following
    # non-empty row into a mapping of column name -> value, and POSTs the rule
    # to the /mapping endpoint.
    import csv
    import io

    # Anything that is not a regular *.csv file used to skip the parsing block
    # and then crash on the unbound `file_name` / `rows` names below.
    if not (os.path.isfile(file) and file.endswith(".csv")):
        click.echo(click.style(f"Error: {file} is not a CSV file", bold=True))
        return

    file_name = os.path.basename(file)
    try:
        with open(file, "rb") as f:
            # utf-8-sig also accepts plain UTF-8 and strips a leading BOM so it
            # does not end up inside the first column name.
            csv_data = f.read().decode("utf-8-sig")
        # csv.reader copes with quoted fields (embedded commas/newlines) and
        # CRLF line endings, which naive str.split() parsing does not.
        reader = csv.reader(io.StringIO(csv_data, newline=""))
        csv_headers = next(reader, [])
        # Blank lines come back as empty lists and are dropped; zip() ignores
        # surplus cells, as the previous implementation did.
        rows = [OrderedDict(zip(csv_headers, row)) for row in reader if row]
    except Exception as e:
        click.echo(click.style(f"Error reading or processing CSV file: {e}"))
        return

    mappings_endpoint = info.keep_api_url + "/mapping"
    response = make_keep_request(
        "POST",
        mappings_endpoint,
        headers={"x-api-key": info.api_key, "accept": "application/json"},
        json={
            "name": name,
            "description": description,
            "file_name": file_name,
            "matchers": matchers.split(","),
            "rows": rows,
            "priority": priority,
        },
    )

    # Check the response
    if response.ok:
        click.echo(
            click.style(f"Mapping rule {file_name} created successfully", bold=True)
        )
    else:
        click.echo(
            click.style(
                f"Error creating mapping rule {file_name}: {response.text}",
                bold=True,
            )
        )
@extraction.command(name="create")
@click.option(
    "--name",
    "-n",
    type=str,
    help="The name of the extraction.",
    required=True,
)
@click.option(
    "--description",
    "-d",
    type=str,
    help="The description of the extraction.",
    required=False,
    default="",
)
@click.option(
    "--priority",
    "-p",
    type=click.IntRange(0, 100),
    help="The priority of the extraction, higher priority means this rule will execute first.",
    required=False,
    default=0,
)
@click.option(
    "--pre",
    type=bool,
    help="Whether this rule should be applied before or after the alert is standardized.",
    required=False,
    default=False,
)
# NOTE(review): the three options below combine required=True with default="";
# presumably the default keeps click from ever reporting them as missing - confirm.
@click.option(
    "--attribute",
    "-a",
    type=str,
    help="Event attribute name to extract from.",
    required=True,
    default="",
)
@click.option(
    "--regex",
    "-r",
    type=str,
    help="The regex rule to extract by. Regex format should be like python regex pattern for group matching.",
    required=True,
    default="",
)
@click.option(
    "--condition",
    "-c",
    type=str,
    help="CEL based condition.",
    required=True,
    default="",
)
@pass_info
def create(
    info: Info,
    name: str,
    description: str,
    priority: int,
    pre: bool,
    attribute: str,
    regex: str,
    condition: str,
):
    """Create a extraction rule."""
    # The CLI options are forwarded unchanged as the JSON body of POST /extraction.
    rule_payload = {
        "name": name,
        "description": description,
        "priority": priority,
        "pre": pre,
        "attribute": attribute,
        "regex": regex,
        "condition": condition,
    }
    response = make_keep_request(
        "POST",
        info.keep_api_url + "/extraction",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
        json=rule_payload,
    )

    # Report failure first, then the success path.
    if not response.ok:
        click.echo(
            click.style(
                f"Error creating extraction rule {name}: {response.text}",
                bold=True,
            )
        )
        return

    click.echo(click.style(f"Extraction rule {name} created successfully", bold=True))
@provider.command(context_settings=dict(ignore_unknown_options=True))
@click.option(
    "--help",
    "-h",
    default=False,
    is_flag=True,
    help="Help on how to install this provider.",
)
@click.option(
    "--provider-name",
    "-n",
    required=False,
    help="Every provider shuold have a name.",
)
@click.argument("provider_type")
@click.argument("params", nargs=-1, type=click.UNPROCESSED)
@click.pass_context
def connect(ctx, help: bool, provider_name, provider_type, params):
    # Install ("connect") a provider of type `provider_type`.
    #
    # ctx:           click context; the shared `Info` object is taken from it.
    # help:          when set, print the provider's config params and return.
    # provider_name: name for the new provider; required unless --help is set.
    # provider_type: provider type as listed by the `/providers` endpoint.
    # params:        unprocessed trailing args, expected as `--flag value`
    #                pairs matching the provider's config keys.
    #
    # Raises click.BadOptionUsage / click.BadArgumentUsage on invalid usage
    # and Exception when the provider list cannot be fetched.
    info = ctx.ensure_object(Info)
    resp = make_keep_request(
        "GET",
        info.keep_api_url + "/providers",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
    )
    if not resp.ok:
        raise Exception(f"Error getting providers: {resp.text}")

    # A response without a "providers" key is treated as "no providers".
    available_providers = resp.json().get("providers") or []

    provider = [p for p in available_providers if p.get("type") == provider_type]
    if not provider:
        click.echo(
            click.style(
                f"Provider {provider_type} not found, you can open an issue and we will create it within a blink of an eye https://github.com/keephq/keep",
                bold=True,
            )
        )
        return
    provider = provider[0]

    if help:
        table = PrettyTable()
        table.field_names = [
            "Provider",
            "Config Param",
            "Required",
            "Description",
        ]
        provider_type = provider.get("type")
        for param, details in provider["config"].items():
            param_as_flag = f"--{param.replace('_', '-')}"
            table.add_row(
                [
                    provider_type,
                    param_as_flag,
                    details.get("required", False),
                    details.get("description", "no description"),
                ]
            )
            # Reset the provider_type for subsequent rows of the same provider to avoid repetition
            provider_type = ""
        print(table)
        return

    if not provider_name:
        # exit with error
        raise click.BadOptionUsage(
            "--provider-name",
            f"Required option --provider-name not provided for provider {provider_type}",
        )

    # The trailing params are consumed pairwise; an odd count would otherwise
    # surface as an IndexError instead of a usage error.
    if len(params) % 2 != 0:
        raise click.BadArgumentUsage(
            "Provider parameters must be given as '--flag value' pairs"
        )
    options_dict = {params[i]: params[i + 1] for i in range(0, len(params), 2)}

    # Verify the provided options against the expected ones for the provider
    provider_install_payload = {
        "provider_id": provider["type"],
        "provider_name": provider_name,
    }
    for config in provider["config"]:
        config_as_flag = f"--{config.replace('_', '-')}"
        # NOTE(review): a config entry without a "required" key is treated as
        # required here, while the --help table above displays it as not
        # required. Confirm which default is intended.
        if config_as_flag not in options_dict and provider["config"][config].get(
            "required", True
        ):
            # Report the dashed flag the user actually has to pass.
            raise click.BadOptionUsage(
                config_as_flag,
                f"Required option {config_as_flag} not provided for provider {provider_name}",
            )
        if config_as_flag in options_dict:
            provider_install_payload[config] = options_dict[config_as_flag]

    # Install the provider
    resp = make_keep_request(
        "POST",
        info.keep_api_url + "/providers/install",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
        json=provider_install_payload,
    )
    if not resp.ok:
        # installation failed because the credentials are invalid
        if resp.status_code == 412:
            click.echo(
                click.style("Failed to install provider: invalid scopes", bold=True)
            )
            table = PrettyTable()
            table.field_names = ["Scope Name", "Status"]
            for scope, value in resp.json().get("detail").items():
                table.add_row([scope, value])
            print(table)
        else:
            click.echo(
                click.style(
                    f"Error installing provider {provider_name}: {resp.text}",
                    bold=True,
                )
            )
    else:
        resp = resp.json()
        click.echo(
            click.style(f"Provider {provider_name} installed successfully", bold=True)
        )
        click.echo(click.style(f"Provider id: {resp.get('id')}", bold=True))
@alert.command(name="list")
@click.option(
    "--filter",
    "-f",
    type=str,
    multiple=True,
    help="Filter alerts based on specific attributes. E.g., --filter source=datadog",
)
@click.option(
    "--export", type=click.Path(), help="Export alerts to a specified JSON file."
)
@pass_info
def list_alerts(info: Info, filter: typing.List[str], export: typing.Optional[str]):
    """List alerts."""
    # NOTE: the docstring above is the click help text; details live here.
    #
    # filter: zero or more `key=value` strings. An alert matches when
    #         alert[key] == value, or when alert[key] is a list containing
    #         value. Only the first "=" separates key from value, so values
    #         may themselves contain "=".
    # export: optional path; when given, the resulting alerts are written to
    #         it as JSON instead of being printed as a table.
    #
    # Raises click.BadParameter for a malformed filter and Exception when the
    # alerts cannot be fetched.

    # Validate the filters before doing any network I/O.
    parsed_filters = []
    for filt in filter:
        key, separator, value = filt.partition("=")
        if not separator or not key:
            raise click.BadParameter(
                f"Invalid filter '{filt}', expected key=value", param_hint="--filter"
            )
        parsed_filters.append((key, value))

    resp = make_keep_request(
        "GET",
        info.keep_api_url + "/alerts?sync=true",
        headers={"x-api-key": info.api_key, "accept": "application/json"},
    )
    if not resp.ok:
        raise Exception(f"Error getting alerts: {resp.text}")

    alerts = resp.json()

    # aggregate by fingerprint
    # NOTE(review): alerts are visited in ascending `lastReceived` order and
    # the first one seen per fingerprint is kept, i.e. the OLDEST. Confirm
    # whether the most recent alert was intended.
    aggregated_alerts = OrderedDict()
    for alert in sorted(alerts, key=lambda x: x["lastReceived"]):
        if alert["fingerprint"] not in aggregated_alerts:
            aggregated_alerts[alert["fingerprint"]] = alert

    # Materialise as a list: a dict-values view is not JSON serialisable, so
    # `--export` without any filter used to fail inside json.dump.
    alerts = list(aggregated_alerts.values())

    if len(alerts) == 0:
        click.echo(click.style("No alerts found.", bold=True))
        return

    # Apply all provided filters
    for key, value in parsed_filters:
        matching = []
        for alert in alerts:
            val = alert.get(key)
            if isinstance(val, list):
                if value in val:
                    matching.append(alert)
            elif val == value:
                matching.append(alert)
        alerts = matching

    # If --export option is provided
    if export:
        with open(export, "w") as outfile:
            json.dump(alerts, outfile, indent=4)
        click.echo(f"Alerts exported to {export}")
        return

    # Create a new table
    table = PrettyTable()
    table.field_names = [
        "ID",
        "Fingerprint",
        "Name",
        "Severity",
        "Status",
        "Environment",
        "Service",
        "Source",
        "Last Received",
    ]
    table.max_width["ID"] = 20
    table.max_width["Name"] = 30
    table.max_width["Status"] = 10
    table.max_width["Environment"] = 15
    table.max_width["Service"] = 15
    table.max_width["Source"] = 15
    table.max_width["Last Received"] = 30

    for alert in alerts:
        table.add_row(
            [
                alert["id"],
                alert["fingerprint"],
                alert["name"],
                alert["severity"],
                alert["status"],
                alert["environment"],
                alert["service"],
                alert["source"],
                alert["lastReceived"],
            ]
        )
    print(table)
@auth.command()
@pass_info
def login(info: Info):
    # Interactive OAuth2 login:
    #   1. start a local FastAPI server on 127.0.0.1:8085 to receive the
    #      OAuth callback,
    #   2. open the browser on the authorization URL,
    #   3. wait (up to two minutes) for the callback to populate the
    #      module-level `token`,
    #   4. exchange the id_token for an API key and write it to the default
    #      config file,
    #   5. verify the key against `/whoami`.
    # The process is always terminated through os._exit, which also stops the
    # callback server thread.

    # first, prepare the oauth2 session:
    import os
    import threading
    import time
    import webbrowser

    import uvicorn
    from fastapi import FastAPI
    from fastapi.responses import PlainTextResponse
    from requests_oauthlib import OAuth2Session

    app = FastAPI()

    @app.get("/callback")
    def callback(code: str, state: str):
        # Exchanges the authorization code for a token and publishes it via
        # the module-level `token`, which the polling loop below watches.
        global token
        token_url = "https://auth.keephq.dev/oauth/token"
        token = oauth_session.fetch_token(
            token_url,
            code=code,
            client_secret="",
            include_client_id=True,
            authorization_response=redirect_uri,
        )
        print("Got the token")
        return PlainTextResponse(
            "Authenticated successfully, you can close this tab now, Keep rulezzz!"
        )

    # We needed a way to run a server without blocking the main thread:
    # https://github.com/encode/uvicorn/discussions/1103#discussioncomment-1389875
    class UvicornServer:
        def __init__(self):
            super().__init__()

        def start(self):
            # Define the FastAPI app running logic here
            uvicorn.run(app, host="127.0.0.1", port=8085, log_level="critical")

    # These are the public client_id of KeepHQ auth0
    # If you have your own identity provider, we'll need to implement to flow
    client_id = os.getenv("KEEP_OAUTH_CLIENT_ID", "P7zzubZGLNe8BQ4HRzvrhT5qPgRFa0BL")
    authorization_base_url = os.getenv(
        "KEEP_OAUTH_AUTHORIZATION_BASE_URL", "https://auth.keephq.dev/authorize"
    )
    scope = ["openid", "profile", "email"]
    redirect_uri = "http://localhost:8085/callback"
    oauth_session = OAuth2Session(client_id, scope=scope, redirect_uri=redirect_uri)
    # now that we have the state parameter, we can start the fast api server
    # start the server on another thread
    server_thread = threading.Thread(target=UvicornServer().start)
    server_thread.start()
    # now, open the browser and wait for the authentication
    webbrowser.open(oauth_session.authorization_url(authorization_base_url)[0])

    # Now wait for the callback
    timeout = 60 * 2  # 2 minutes
    times = 0
    time_start = time.time()
    while not token:
        if time.time() - time_start > timeout:
            print("Timeout waiting for callback")
            # kill the server
            os._exit(1)
        # print every 15 seconds
        if times % 15 == 0:
            print("Still waiting for callback")
        # One iteration is roughly one second; without this increment the
        # message above was printed on every iteration.
        times += 1
        time.sleep(1)

    # Ok, we got the token from the oauth2 flow, now let's get a permanent api key
    print("Got the token, getting the api key")
    id_token = token["id_token"]
    api_key_resp = make_keep_request(
        "GET",
        info.keep_api_url + "/settings/apikey",
        headers={"accept": "application/json", "Authorization": f"Bearer {id_token}"},
    )
    if not api_key_resp.ok:
        print(f"Error getting api key: {api_key_resp.text}")
        # kill the server
        os._exit(2)

    api_key = api_key_resp.json().get("apiKey")
    # keep it in the config file
    with open(f"{get_default_conf_file_path()}", "w") as f:
        f.write(f"api_key: {api_key}\n")
    # Authenticated successfully
    print("Authenticated successfully!")
    # Check that we can get whoami
    resp = make_keep_request(
        "GET",
        info.keep_api_url + "/whoami",
        headers={"x-api-key": api_key, "accept": "application/json"},
    )
    if not resp.ok:
        raise Exception(f"Error getting whoami: {resp.text}")
    print("Authenticated to Keep successfully!")
    print(resp.json())
    # kills the server also, great success
    os._exit(0)
class AssertCondition(BaseCondition):
    """Condition that fires when a rendered expression evaluates truthy.

    The expression comes from the ``assert`` key of the condition config, is
    rendered through the IO handler and evaluated with ``asteval``.
    """

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)

    def apply(self, compare_to, compare_value) -> bool:
        """Evaluate the expression and report whether the action should run.

        Args:
            compare_to: unused by this condition; accepted for a uniform
                ``apply`` signature.
            compare_value (str): the rendered expression to evaluate.

        Returns:
            bool: True when the expression evaluates truthy, False otherwise.

        Raises:
            SyntaxError: if a SyntaxError is raised while evaluating.
        """
        try:
            self.logger.debug(f"Asserting {compare_value}")
            # we need to encode/decode the string to make sure eval
            # will be able to parse characters such as \n
            compare_value = compare_value.encode("unicode_escape").decode("utf-8")
            aeval = Interpreter()
            # Use a plain truthiness test rather than an `assert` statement:
            # asserts are stripped under `python -O`, which would make this
            # condition silently never fire.
            expression_holds = bool(aeval(compare_value))
        # Kept for backward compatibility: an AssertionError escaping the
        # evaluation used to be reported as "condition is true".
        except AssertionError:
            self.logger.debug(f"Failed asserting {compare_value}")
            return True
        except SyntaxError:
            self.logger.debug(f"Failed asserting {compare_value}")
            raise SyntaxError(
                f"AssertCondition failed - couldn't parse {compare_value}"
            )

        # if " 'A' == 'A' ", then we should run the action (so condition is true)
        if expression_holds:
            self.logger.debug(f"Failed asserting {compare_value}")
            return True
        self.logger.debug(f"Asserted {compare_value}")
        return False

    def get_compare_value(self):
        """Return the rendered ``assert`` expression from the condition config."""
        compare_value = self.condition_config.get("assert")
        compare_value = self.io_handler.render(compare_value)
        return compare_value
def get_compare_to(self):
    """Return the comparison baseline for this condition.

    Reads the ``compare_to`` entry of the condition config (for a threshold
    condition this is the threshold itself) and passes it through the IO
    handler's ``render``.

    Returns:
        Whatever ``io_handler.render`` produces for the configured value.
    """
    raw_baseline = self.condition_config.get("compare_to")
    return self.io_handler.render(raw_baseline)
class StddevCondition(BaseCondition):
    """Flag values that lie more than a threshold of standard deviations
    away from the mean of the input."""

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)
        self.pivot_column = None
        # One entry per inspected value is appended on every evaluation.
        self.condition_context["stddev"] = []

    def _filter_values_by_stddev(self, lst, threshold):
        """Return the indexes of the values that deviate beyond ``threshold``.

        Args:
            lst (list): the values; when a pivot column is set, each element
                is indexed by it to obtain the number to inspect.
            threshold (float): maximum allowed deviation, in standard
                deviations from the mean.

        Returns:
            list[int]: indexes (into ``lst``) of the outlying values.
        """
        # use only the pivot column
        # NOTE(review): a pivot column of 0 is falsy and therefore ignored
        # here, and 0 is also the default set by get_compare_value. Confirm
        # whether column 0 is meant to be selectable.
        if self.pivot_column:
            _lst = [c[self.pivot_column] for c in lst]
        else:
            _lst = lst

        # A sample standard deviation needs at least two data points; with
        # fewer there is nothing to compare against, so nothing is an outlier.
        if len(_lst) < 2:
            return []

        mean = statistics.mean(_lst)
        stddev = statistics.stdev(_lst, mean)

        results = []
        for i, x in enumerate(_lst):
            # When all values are identical the deviation is 0; report a
            # distance of 0 instead of dividing by zero.
            x_stddev = abs(x - mean) / stddev if stddev else 0.0
            self.condition_context["stddev"].append(
                {"value": lst[i], "stddev": x_stddev, "mean": mean}
            )
            if x_stddev > threshold:
                results.append(i)

        return results

    def apply(self, compare_to, compare_value) -> bool:
        """Apply the condition.

        Args:
            compare_to (float): the stddev threshold
            compare_value (list): the list of values (numbers/floats)

        Returns:
            bool: True if at least one value lies beyond the threshold.
        """
        values = self._filter_values_by_stddev(compare_value, compare_to)
        # True if there are any values outside the standard deviation threshold
        return bool(values)

    def get_compare_value(self):
        """Return the rendered ``value`` from the condition config.

        Also records the configured ``pivot_column`` (default 0) for use by
        the next evaluation.
        """
        compare_value = self.condition_config.get("value")
        rendered_compare_value = self.io_handler.render(compare_value)
        self.pivot_column = self.condition_config.get("pivot_column", 0)
        return rendered_compare_value
60, 70, 80) compare_value (list[str]: comma seperated list (e.g. major, medium, minor) Returns: bool: true if threshold applies, false otherwise """ thresholds = [t.strip() for t in compare_to.split(",")] for i, threshold in enumerate(thresholds): if self._apply_threshold(compare_value, threshold): # Keep the level in the condition context self.condition_context["level"] = self.levels[i] return True return False def _validate(self, compare_to, compare_value): """validate the condition. Args: compare_to (_type_): the threshold compare_value (_type_): the actual value """ # check if compare_to is a number (supports also float, hence the . replace) if ( str(compare_to).replace(".", "", 1).isdigit() and str(compare_to).replace(".", "", 1).isdigit() ): compare_to = float(compare_to) try: compare_value = float(compare_value) except ValueError as exc: raise Exception( "Invalid values for threshold - the compare_to is a float where the compare_value is not" ) from exc # validate they are both the same type if not isinstance(compare_value, type(compare_to)): raise Exception( "Invalid threshold value, currently support only numeric and percentage values but got {} and {}".format( compare_to, compare_value ) ) if self._is_percentage(compare_to) and not self._is_percentage(compare_value): raise Exception( "Invalid threshold value, currently support only numeric and percentage values but got {} and {}".format( compare_to, compare_value ) ) return compare_to, compare_value def apply(self, compare_to, compare_value) -> bool: """apply the condition. 
Args: compare_to (_type_): the threshold compare_value (_type_): the actual value """ if self._check_if_multithreshold(compare_to): return self._apply_multithreshold(compare_to, compare_value) return self._apply_threshold(compare_value, compare_to) def _is_percentage(self, a): if isinstance(a, int) or isinstance(a, float): return False if not a.endswith("%"): return False a = a.strip("%") # 0.1 is ok and 99.9 is ok if float(a) < 0 or float(a) > 100: return False return True def _apply_threshold(self, step_output, threshold): """Just compare the step output with the threshold. Args: step_output (_type_): _description_ threshold (_type_): _description_ Returns: _type_: _description_ """ step_output, threshold = self._validate(step_output, threshold) if self.condition_config.get("compare_type", "gt") == "gt": return step_output > threshold elif self.condition_config.get("compare_type", "gt") == "lt": return step_output < threshold raise Exception("Invalid threshold type, currently support only gt and lt") ================================================ FILE: keep/contextmanager/__init__.py ================================================ ================================================ FILE: keep/contextmanager/contextmanager.py ================================================ # TODO - refactor context manager to support multitenancy in a more robust way import logging from typing import Any, TypedDict import click import json5 from pympler.asizeof import asizeof from keep.api.core.config import config from keep.api.core.db import get_last_workflow_execution_by_workflow_id, get_session from keep.api.models.alert import AlertDto from keep.api.models.incident import IncidentDto class ForeachContext(TypedDict): value: Any | None items: list[Any] | None class ContextManager: def __init__( self, tenant_id, workflow_id=None, workflow_execution_id=None, workflow: dict | None = None, ): self.logger = logging.getLogger(__name__) self.workflow_id = workflow_id 
self.workflow_execution_id = workflow_execution_id self.tenant_id = tenant_id self.steps_context = {} self.steps_context_size = 0 self.providers_context = {} self.actions_context = {} self.event_context: AlertDto = {} self.incident_context: IncidentDto | None = None self.foreach_context: ForeachContext = { "value": None, "items": None, } self.consts_context = {} self.current_step_vars = {} self.current_step_aliases = {} self.secret_context = {} # cli context try: self.click_context = click.get_current_context() except RuntimeError: self.click_context = {} # last workflow context self.last_workflow_execution_results = {} self.last_workflow_run_time = None if self.workflow_id and workflow: try: # @tb: try to understand if the workflow tries to use last_workflow_results # if so, we need to get the last workflow execution and load it into the context workflow_str = json5.dumps(workflow) last_workflow_results_in_workflow = ( "last_workflow_results" in workflow_str or "last_workflow_run_time" in workflow_str ) if last_workflow_results_in_workflow: last_workflow_execution = ( get_last_workflow_execution_by_workflow_id( tenant_id, workflow_id, status="success" ) ) if last_workflow_execution is not None: self.last_workflow_execution_results = ( last_workflow_execution.results ) self.last_workflow_run_time = last_workflow_execution.started except Exception: self.logger.exception("Failed to get last workflow execution") pass self.aliases = {} # dependencies are used so iohandler will be able to use the output class of the providers # e.g. 
let's say bigquery_provider results are google.cloud.bigquery.Row # and we want to use it in iohandler, we need to import it before the eval self.dependencies = set() self.workflow_execution_id = None self.workflow_inputs = None self._api_key = None self.__loggers = {} @property def api_url(self): """ The URL of the Keep API """ return config("KEEP_API_URL") @property def api_key(self): # avoid circular import from keep.api.utils.tenant_utils import get_or_create_api_key if self._api_key is None: session = next(get_session()) self._api_key = get_or_create_api_key( session=session, created_by="system", tenant_id=self.tenant_id, unique_api_key_id="webhook", ) session.close() return self._api_key def set_execution_context(self, workflow_id, workflow_execution_id): self.workflow_execution_id = workflow_execution_id self.workflow_id = workflow_id for logger in self.__loggers.values(): logger.workflow_execution_id = workflow_execution_id def set_inputs(self, inputs): self.workflow_inputs = inputs def set_event_context(self, event): self.event_context = event def set_incident_context(self, incident): self.incident_context = incident def set_consts_context(self, consts): self.consts_context = consts def get_workflow_id(self): return self.workflow_id def set_secret_context(self): """ Set the secret context for the workflow. If no secret is provided, attempt to load it from the secret manager. 
""" from keep.secretmanager.secretmanagerfactory import SecretManagerFactory secret_manager = SecretManagerFactory.get_secret_manager(self) secret_key = f"{self.tenant_id}_{self.workflow_id}_secrets" try: secret = secret_manager.read_secret(secret_name=secret_key, is_json=True) self.secret_context = secret or {} except Exception: self.logger.warning( "Could not load secrets for workflow", extra={"workflow_id": self.workflow_id, "tenant_id": self.tenant_id}, ) self.secret_context = {} def get_full_context(self, exclude_providers=False, exclude_env=False): """ Gets full context on the workflows Usage: context injection used, for example, in iohandler Returns: dict: dictinoary contains all context about this workflow providers - all context about providers (configuration, etc) steps - all context about steps (output, conditions, etc) foreach - all context about the current 'foreach' foreach can be in two modes: 1. "step foreach" - for step result 2. "condition foreach" - for each condition result whereas in (2), the {{ foreach.value }} contains (1), in the (1) case, we need to explicitly put in under (value) anyway, this should be refactored to something more structured """ full_context = { "steps": self.steps_context, "actions": self.steps_context, # this is an alias for steps "foreach": self.foreach_context, "event": self.event_context, "last_workflow_results": self.last_workflow_execution_results, "last_workflow_run_time": self.last_workflow_run_time, "alert": self.event_context, # this is an alias so workflows will be able to use alert.source "incident": self.incident_context, # this is an alias so workflows will be able to use alert.source "consts": self.consts_context, "vars": self.current_step_vars, "aliases": self.current_step_aliases, "secrets": self.secret_context, "inputs": self.workflow_inputs, } if not exclude_providers: full_context["providers"] = self.providers_context full_context.update(self.aliases) return full_context def set_foreach_items(self, 
items: list[Any] | None = None): self.foreach_context["items"] = items def set_foreach_value(self, value: Any | None = None): self.foreach_context["value"] = value def reset_foreach_context(self): self.foreach_context = { "value": None, "items": None, } def set_condition_results( self, action_id, condition_name, condition_type, compare_to, compare_value, result, condition_alias=None, value=None, **kwargs, ): """_summary_ Args: action_id (_type_): id of the step condition_type (_type_): type of the condition compare_to (_type_): _description_ compare_value (_type_): _description_ result (_type_): _description_ condition_alias (_type_, optional): _description_. Defaults to None. value (_type_): the raw value which the condition was compared to. this is relevant only for foreach conditions """ if action_id not in self.steps_context: self.steps_context[action_id] = {"conditions": {}, "results": {}} if "conditions" not in self.steps_context[action_id]: self.steps_context[action_id]["conditions"] = {condition_name: []} if condition_name not in self.steps_context[action_id]["conditions"]: self.steps_context[action_id]["conditions"][condition_name] = [] self.steps_context[action_id]["conditions"][condition_name].append( { "value": value, "compare_value": compare_value, "compare_to": compare_to, "result": result, "type": condition_type, "alias": condition_alias, **kwargs, } ) # update the current for each context self.foreach_context.update( {"compare_value": compare_value, "compare_to": compare_to, **kwargs} ) if condition_alias: self.aliases[condition_alias] = result def set_step_provider_paremeters(self, step_id, provider_parameters): if step_id not in self.steps_context: self.steps_context[step_id] = { "provider_parameters": {}, "results": [], "vars": {}, } self.steps_context[step_id]["provider_parameters"] = provider_parameters def set_step_context(self, step_id, results, foreach=False): if step_id not in self.steps_context: self.steps_context[step_id] = { 
"provider_parameters": {}, "results": [], "vars": {}, } # If this is a foreach step, we need to append the results to the list # so we can iterate over them if foreach: self.steps_context[step_id]["results"].append(results) else: self.steps_context[step_id]["results"] = results # this is an alias to the current step output self.steps_context["this"] = self.steps_context[step_id] self.steps_context_size = asizeof(self.steps_context) def set_step_vars(self, step_id, _vars, _aliases): if step_id not in self.steps_context: self.steps_context[step_id] = { "provider_parameters": {}, "results": [], "vars": {}, "aliases": {}, } self.current_step_vars = _vars self.current_step_aliases = _aliases self.steps_context[step_id]["vars"] = _vars self.steps_context[step_id]["aliases"] = _aliases self.secret_context = {**self.secret_context, **_vars} def get_last_workflow_run(self, workflow_id): return get_last_workflow_execution_by_workflow_id(self.tenant_id, workflow_id) def set_last_workflow_run(self, workflow_id, workflow_context, workflow_status): # TODO: move to DB # self.logger.debug( # "Adding workflow to state", # extra={ # "workflow_id": workflow_id, # }, # ) # if workflow_id not in self.state: # self.state[workflow_id] = [] # self.state[workflow_id].append( # { # "workflow_status": workflow_status, # "workflow_context": workflow_context, # } # ) # self.logger.debug( # "Added workflow to state", # extra={ # "workflow_id": workflow_id, # }, # ) pass ================================================ FILE: keep/entrypoint.sh ================================================ #!/bin/bash # Exit immediately if a command exits with a non-zero status set -e # Print commands and their arguments as they are executed set -x # Get the directory of the current script SCRIPT_DIR=$(dirname "$0") python "$SCRIPT_DIR/server_jobs_bg.py" & # Build the providers cache { keep provider build_cache } || { echo "Failed to build providers cache, skipping" } # Check for REDIS env variable == true if 
[ "$REDIS" != "true" ]; then
    # Just run gunicorn for the API
    exec "$@"
# else, we want different workers for API and for processing
else
    echo "Running with Redis"
    # In production, always use Gunicorn for ARQ workers
    # default number of workers is two
    KEEP_WORKERS=${KEEP_WORKERS:-2}
    ARQ_WORKER_PORT=${ARQ_WORKER_PORT:-8001}
    ARQ_WORKER_TIMEOUT=${ARQ_WORKER_TIMEOUT:-120}
    LOG_LEVEL=${LOG_LEVEL:-INFO}

    echo "Starting ARQ workers under Gunicorn (workers: $KEEP_WORKERS)"

    # Run Gunicorn directly for ARQ workers.
    # Expansions used as arguments are quoted so a value containing whitespace
    # or glob characters is passed as a single argument.
    PYTHONPATH=$PYTHONPATH \
    REDIS=true \
    KEEP_WORKERS=$KEEP_WORKERS \
    LOG_LEVEL=$LOG_LEVEL \
    gunicorn \
        --bind "0.0.0.0:$ARQ_WORKER_PORT" \
        --workers "$KEEP_WORKERS" \
        --worker-class "keep.api.arq_worker_gunicorn.ARQGunicornWorker" \
        --timeout "$ARQ_WORKER_TIMEOUT" \
        --log-level "$LOG_LEVEL" \
        --access-logfile - \
        --error-logfile - \
        --name "arq_worker" \
        -c "/venv/lib/python3.13/site-packages/keep/api/config.py" \
        "--preload" \
        "keep.api.arq_worker_gunicorn:create_app()" &

    KEEP_ARQ_PID=$!

    # Give ARQ workers time to start up
    sleep 5

    echo "Running API gunicorn"
    # migration will run from arq worker
    SKIP_DB_CREATION=true exec "$@" &
    KEEP_API_PID=$!
# Wait for any to exit wait -n $KEEP_ARQ_PID $KEEP_API_PID # One exited — kill the other kill $KEEP_ARQ_PID $KEEP_API_PID 2>/dev/null || true # Exit to trigger container restart exit 1 fi ================================================ FILE: keep/event_subscriber/__init__.py ================================================ ================================================ FILE: keep/event_subscriber/event_subscriber.py ================================================ import logging import threading from keep.providers.base.base_provider import BaseProvider from keep.providers.providers_factory import ProvidersFactory class EventSubscriber: @staticmethod def get_instance() -> "EventSubscriber": if not hasattr(EventSubscriber, "_instance"): EventSubscriber._instance = EventSubscriber() return EventSubscriber._instance def __init__(self): self.logger = logging.getLogger(__name__) self.consumers = [] self.consumer_threads = [] self.started = False def status(self): """Returns the status of the consumers""" return { "consumers": [ { "provider_id": cp.provider_id, "status": cp.status(), } for cp in self.consumers ] } def add_consumer(self, consumer_provider: BaseProvider): """Add a consumer (on installation) Args: consumer_provider (_type_): _description_ """ self.logger.info("Adding consumer %s", consumer_provider) # start the consumer in a separate thread thread = threading.Thread( target=consumer_provider.start_consume, name=f"consumer-{consumer_provider}", ) thread.start() self.consumers.append(consumer_provider) self.consumer_threads.append(thread) self.logger.info( "Started consumer thread for event provider %s", consumer_provider ) async def start(self): """Runs the event subscriber in server mode""" if self.started: self.logger.info("Event subscriber already started") return self.logger.info("Starting event subscriber") consumer_providers = ProvidersFactory.get_consumer_providers() for consumer_provider in consumer_providers: # get the consumer for the event 
provider self.logger.info( "Getting consumer for event provider %s", consumer_provider ) # start the consumer in a separate thread thread = threading.Thread( target=consumer_provider.start_consume, name=f"consumer-{consumer_provider}", ) thread.start() self.consumers.append(consumer_provider) self.consumer_threads.append(thread) self.logger.info( "Started consumer thread for event provider %s", consumer_provider ) self.started = True def remove_consumer(self, provider_id: str): """Remove a consumer (on uninstallation) Args: consumer_provider (_type_): _description_ """ self.logger.info("Removing consumer %s", provider_id) for cp in self.consumers: if cp.provider_id == provider_id: cp.stop_consume() break self.logger.info("Removed consumer %s", provider_id) def stop(self): """Stops the consumers""" for consumer in self.consumers: self.logger.info("Stopping consumer %s", consumer) consumer.stop_consume() self.logger.info("Stopped consumer %s", consumer) # Join the threads self.logger.info("Joining consumer threads") for thread in self.consumer_threads: thread.join() self.started = False self.logger.info("Joined consumer threads") ================================================ FILE: keep/exceptions/__init__.py ================================================ ================================================ FILE: keep/exceptions/action_error.py ================================================ class ActionError(Exception): def __init__(self, *args: object) -> None: super().__init__(*args) ================================================ FILE: keep/exceptions/provider_config_exception.py ================================================ class ProviderConfigException(Exception): def __init__(self, message, provider_id, *args: object) -> None: super().__init__(message, *args) self.provider_id = provider_id ================================================ FILE: keep/exceptions/provider_connection_failed.py ================================================ class 
ProviderConnectionFailed(Exception): def __init__(self, *args: object) -> None: super().__init__(*args) ================================================ FILE: keep/exceptions/provider_exception.py ================================================ class ProviderException(Exception): def __init__(self, *args: object) -> None: super().__init__(*args) ================================================ FILE: keep/functions/__init__.py ================================================ import copy import datetime import json import logging import re import urllib.parse from datetime import timedelta from itertools import groupby from typing import Literal import json5 import pytz from dateutil import parser from dateutil.parser import ParserError from keep.api.core.db import get_alerts_by_fingerprint from keep.api.models.alert import AlertStatus from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts logger = logging.getLogger(__name__) _len = len def add(*args) -> [int, float]: args = list(map(int, args)) return sum(args) def sub(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result -= arg return result def mul(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result *= arg return result def div(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result /= arg return int(result) if result.is_integer() else result def mod(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result %= arg return result def exp(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result **= arg return result def fdiv(*args) -> [int, float]: args = list(map(int, args)) result = args[0] for arg in args[1:]: result //= arg return result def eq(a, b) -> bool: return a == b def all(iterable) -> bool: # 
https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical g = groupby(iterable) return next(g, True) and not next(g, False) def diff(iterable: iter) -> bool: # Opposite of all - returns True if any element is different return not all(iterable) def len(iterable=[], **kwargs) -> int: return _len(iterable) def uppercase(string) -> str: return string.upper() def lowercase(string) -> str: return string.lower() def capitalize(string) -> str: """ Capitalize the first character of a string. Args: string (str): The string to capitalize. Returns: str: The capitalized string. """ return string.capitalize() def title(string) -> str: """ Convert a string to title case (capitalize each word). Args: string (str): The string to convert to title case. Returns: str: The title-cased string. """ return string.title() def split(string, delimeter) -> list: return string.strip().split(delimeter) def index(iterable, index) -> any: if isinstance(index, str) and index.isdigit(): # Если индекс — строка с числом index = int(index) return iterable[index] def strip(string) -> str: return string.strip() def remove_newlines(string: str = "") -> str: return string.replace("\r\n", "").replace("\n", "").replace("\t", "") def first(iterable): return iterable[0] def last(iterable): return iterable[-1] def utcnow() -> datetime.datetime: dt = datetime.datetime.now(datetime.timezone.utc) return dt def utcnowtimestamp() -> int: return int(utcnow().timestamp()) def utcnowiso() -> str: return utcnow().isoformat() def substract_minutes(dt: datetime.datetime, minutes: int) -> datetime.datetime: """ Substract minutes from a datetime object Args: dt (datetime.datetime): The datetime object minutes (int): The number of minutes to substract Returns: datetime.datetime: The new datetime object """ return dt - datetime.timedelta(minutes=minutes) def timestamp_delta( dt: datetime.datetime, amount: float, timestamp_unit: Literal["seconds", "minutes", "hours", "days", "weeks"], ) -> 
datetime.datetime: """ Add or subtract a time delta to/from a given datetime. Use a negative amount to subtract time. Args: dt (datetime.datetime): The original datetime. amount (float): How much to add (use negative to subtract). timestamp_unit (str): The unit for the amount ('seconds', 'minutes', 'hours', 'days', 'weeks'). Returns: datetime.datetime: The resulting datetime after adding/subtracting the delta. """ valid_units = { "seconds": "seconds", "minutes": "minutes", "hours": "hours", "days": "days", "weeks": "weeks", } if timestamp_unit not in valid_units: raise ValueError(f"Unsupported timestamp_unit: {timestamp_unit}") delta = datetime.timedelta(**{valid_units[timestamp_unit]: amount}) return dt + delta def to_utc(dt: datetime.datetime | str = "") -> datetime.datetime: if isinstance(dt, str): try: dt = parser.parse(dt.strip()) except ParserError: # Failed to parse the date return "" utc_dt = dt.astimezone(pytz.utc) return utc_dt def from_timestamp( timestamp: int | float | str, timezone: str = "UTC" ) -> datetime.datetime | str: try: if isinstance(timestamp, str): timestamp = float(timestamp) return datetime.datetime.fromtimestamp(timestamp, tz=pytz.timezone(timezone)) except Exception: return "" def to_timestamp(dt: datetime.datetime | str = "") -> int: if isinstance(dt, str): try: dt = parser.parse(dt.strip()) except ParserError: # Failed to parse the date return 0 return int(dt.timestamp()) def datetime_compare(t1: datetime = None, t2: datetime = None) -> float: if not t1 or not t2: return 0 diff = (t1 - t2).total_seconds() / 3600 return diff def json_dumps(data: str | dict) -> str: if isinstance(data, str): data = json.loads(data) return json.dumps(data, indent=4, default=str) def json_loads(data: str) -> dict: def parse_bad_json(bad_json): # Remove or replace control characters control_char_regex = re.compile(r"[\x00-\x1f\x7f-\x9f]") def replace_control_char(match): char = match.group(0) return f"\\u{ord(char):04x}" cleaned_json = 
control_char_regex.sub(replace_control_char, bad_json) # Parse the cleaned JSON return json.loads(cleaned_json) # in most cases, we don't need escaping try: d = json.loads(data) except json.JSONDecodeError: try: d = parse_bad_json(data) except json.JSONDecodeError: logger.exception('Failed to parse "bad" JSON') d = {} # catch any other exceptions except Exception: logger.exception("Failed to parse JSON") d = {} return d def replace(string: str, old: str, new: str) -> str: return string.replace(old, new) def encode(string) -> str: return urllib.parse.quote(string) def dict_to_key_value_list(d: dict) -> list: return [f"{k}:{v}" for k, v in d.items()] def slice(str_to_slice: str, start: int = 0, end: int = 0) -> str: if end == 0 or end == "0": return str_to_slice[int(start) :] return str_to_slice[int(start) : int(end)] def join( iterable: list | dict | str, delimiter: str = ",", prefix: str | None = None ) -> str: if isinstance(iterable, str): iterable = json5.loads(iterable) if isinstance(iterable, dict): if prefix: return delimiter.join([f"{prefix}{k}={v}" for k, v in iterable.items()]) return delimiter.join([f"{k}={v}" for k, v in iterable.items()]) if prefix: return delimiter.join([f"{prefix}{item}" for item in iterable]) return delimiter.join([str(item) for item in iterable]) def dict_pop(data: str | dict, *args) -> dict: if isinstance(data, str): data = json.loads(data) dict_copy = copy.deepcopy(data) for arg in args: dict_copy.pop(arg, None) return dict_copy def dict_pop_prefix(data: str | dict, prefix: str) -> dict: if isinstance(data, str): data = json.loads(data) return {k: v for k, v in data.items() if not k.startswith(prefix)} def dict_filter_by_prefix(data: str | dict, prefix: str) -> dict: """ This function filters a dictionary and returns only keys with the given prefix. 
Args: data (str | dict): the dictionary to filter prefix (str): the prefix to filter by Returns: dict: the filtered dictionary """ if isinstance(data, str): data = json.loads(data) return {k: v for k, v in data.items() if k.startswith(prefix)} def add_time_to_date(date, date_format, time_str): """ Add time to a date based on a given time string (e.g., '1w', '2d'). Args: date (str or datetime.datetime): The date to which the time will be added. date_format (str): The format of the date string if the date is provided as a string. time_str (str): The time to add (e.g., '1w', '2d'). Returns: datetime.datetime: The new datetime object with the added time. """ if isinstance(date, str): date = datetime.datetime.strptime(date, date_format) time_units = { "w": "weeks", "d": "days", "h": "hours", "m": "minutes", "s": "seconds", } time_dict = {unit: 0 for unit in time_units.values()} matches = re.findall(r"(\d+)([wdhms])", time_str) for value, unit in matches: time_dict[time_units[unit]] += int(value) new_date = date + datetime.timedelta(**time_dict) return new_date def get_firing_time(alert: dict, time_unit: str, **kwargs) -> str: """ Get the firing time of an alert. Args: alert (dict): The alert dictionary. time_unit (str): The time unit to return the result in ('m', 's', or 'h'). **kwargs: Additional keyword arguments. Returns: str: The firing time of the alert in the specified time unit. 
""" tenant_id = kwargs.get("tenant_id") if not tenant_id: raise ValueError("tenant_id is required") try: alert = json.loads(alert) if isinstance(alert, str) else alert except Exception: raise ValueError("alert is not a valid JSON") fingerprint = alert.get("fingerprint") if not fingerprint: raise ValueError("fingerprint is required") alert_from_db = get_alerts_by_fingerprint( tenant_id=tenant_id, fingerprint=fingerprint, limit=1, ) if alert_from_db: alert_dto = convert_db_alerts_to_dto_alerts(alert_from_db)[0] # if the alert is not firing, there is no start firing time if alert_dto.status != AlertStatus.FIRING.value: return "0.00" firing = datetime.datetime.now( tz=datetime.timezone.utc ) - datetime.datetime.fromisoformat(alert_dto.firingStartTime) else: return "0.00" if time_unit in ["m", "minutes"]: result = firing.total_seconds() / 60 elif time_unit in ["h", "hours"]: result = firing.total_seconds() / 3600 elif time_unit in ["s", "seconds"]: result = firing.total_seconds() else: raise ValueError( "Invalid time_unit. Use 'minutes', 'hours', 'seconds', 'm', 'h', or 's'." ) return f"{result:.2f}" def is_first_time(fingerprint: str, since: str = None, **kwargs) -> str: """ Get the firing time of an alert. Args: alert (dict): The alert dictionary. **kwargs: Additional keyword arguments. Returns: str: The firing time of the alert in the specified time unit. 
""" tenant_id = kwargs.get("tenant_id") if not tenant_id: raise ValueError("tenant_id is required") if not fingerprint: raise ValueError("fingerprint is required") prev_alerts = get_alerts_by_fingerprint( tenant_id=tenant_id, fingerprint=fingerprint, limit=2, status="firing" ) if not prev_alerts: # this should not happen since workflows are running only after the alert is saved in the database raise ValueError("No previous alerts found for the given fingerprint.") # if there is only one alert, it is the first time 100% if len(prev_alerts) == 1: return True # if there is more than one alert and no 'since' specified, it is not the first time elif not since: return False # since is "24h" or "1d" or "1w" etc. prevAlert = prev_alerts[1] if since[-1] == "d": time_delta = timedelta(days=int(since[:-1])) elif since[-1] == "w": time_delta = timedelta(weeks=int(since[:-1])) elif since[-1] == "h": time_delta = timedelta(hours=int(since[:-1])) elif since[-1] == "m": time_delta = timedelta(minutes=int(since[:-1])) else: raise ValueError("Invalid time unit. Use 'm', 'h', 'd', or 'w'.") current_time = datetime.datetime.utcnow() if current_time - prevAlert.timestamp > time_delta: return True else: return False def is_business_hours( time_to_check=None, start_hour=8, end_hour=20, business_days=(0, 1, 2, 3, 4), # Mon = 0, Sun = 6 timezone="UTC", ): """ Check if the given time or current time is between start_hour and end_hour and falls on a business day Args: time_to_check (str | datetime.datetime, optional): Time to check. If None, current UTC time will be used. start_hour (int, optional): Start hour in 24-hour format. Defaults to 8 (8:00 AM) end_hour (int, optional): End hour in 24-hour format. Defaults to 20 (8:00 PM) business_days (tuple, optional): Days of week considered as business days. Monday=0 through Sunday=6. Defaults to Mon-Fri (0,1,2,3,4) timezone (str, optional): Timezone name (e.g., 'UTC', 'America/New_York', 'Europe/London'). Defaults to 'UTC'. 
Returns: bool: True if time is between start_hour and end_hour on a business day Raises: ValueError: If start_hour or end_hour are not between 0 and 23 ValueError: If business_days contains invalid day numbers ValueError: If timezone string is invalid """ # Validate hour inputs start_hour = int(start_hour) end_hour = int(end_hour) if not (0 <= start_hour <= 23 and 0 <= end_hour <= 23): raise ValueError("Hours must be between 0 and 23") # Strict validation for business_days try: invalid_days = [day for day in business_days if not (0 <= day <= 6)] if invalid_days: raise ValueError( f"Invalid business days: {invalid_days}. Days must be between 0 (Monday) and 6 (Sunday)" ) except TypeError: raise ValueError( "business_days must be an iterable of integers between 0 and 6" ) # Validate and convert timezone string to pytz timezone try: tz = pytz.timezone(timezone) except pytz.exceptions.UnknownTimeZoneError: raise ValueError(f"Invalid timezone: {timezone}") # If no time provided, use current UTC time if time_to_check is None: dt = utcnow() else: # Convert string to datetime if needed dt = to_utc(time_to_check) if isinstance(time_to_check, str) else time_to_check if not dt: # Handle case where parsing failed return False # Convert to specified timezone dt = dt.astimezone(tz) # Get weekday (Monday = 0, Sunday = 6) weekday = dt.weekday() # Check if it's a business day if weekday not in business_days: return False # Get just the hour (in 24-hour format) hour = dt.hour # Check if hour is between start_hour and end_hour return start_hour <= hour < end_hour def dictget(data: str | dict, key: str, default: any = None) -> any: """ Get a value from a dictionary with a default fallback. Args: data (str | dict): The dictionary to search in. Can be a JSON string or dict. 
key (str): The key to look up default (any): The default value to return if key is not found Returns: any: The value found in the dictionary or the default value Example: >>> d = {"s1": "critical", "s2": "error"} >>> dictget(d, "s1", "info") 'critical' >>> dictget(d, "s3", "info") 'info' """ if isinstance(data, str): try: data = json_loads(data) except Exception: return default if not isinstance(data, dict): return default return data.get(key, default) ================================================ FILE: keep/functions/cyaml.py ================================================ import yaml from yaml import YAMLError # Define what symbols are exported from this module __all__ = ['YAMLError', 'safe_load', 'dump', 'add_representer'] class QuotedString(str): """A string that remembers if it was quoted in the original YAML.""" quote_style: str | None = None block_style: str | None = None def __new__(cls, value, quote_style=None, block_style=None): instance = super().__new__(cls, value) instance.quote_style = quote_style instance.block_style = block_style return instance class QuotePreservingLoader(yaml.CSafeLoader): """A YAML Loader that marks strings that were originally quoted.""" def construct_scalar(self, node): # Get the scalar value value = super().construct_scalar(node) # If the node had quotes in the original YAML, mark it if node.style in ('"', "'"): # Use a custom class to remember that this string was quoted return QuotedString(value, quote_style=node.style) elif node.style == '|': # Handle block scalar indicator return QuotedString(value, block_style='|') return value class QuotePreservingDumper(yaml.CDumper): """A YAML Dumper that preserves quotes for marked strings.""" def represent_scalar(self, tag, value, style=None): # If this is our special QuotedString, use its original quote style or block style if isinstance(value, QuotedString): if value.block_style: style = value.block_style elif value.quote_style: style = value.quote_style return 
super().represent_scalar(tag, value, style) # Register a proper representer for QuotedString def represent_quoted_string(dumper, data): style = data.block_style or data.quote_style return dumper.represent_scalar('tag:yaml.org,2002:str', str(data), style=style) QuotePreservingDumper.add_representer(QuotedString, represent_quoted_string) def safe_load(stream): """Load YAML content safely, preserving information about quoted strings.""" return yaml.load(stream, Loader=QuotePreservingLoader) def dump(data, stream=None, Dumper=None, **kwds): """ Dump YAML data while preserving quotes in strings that were originally quoted. Args: data: The Python object to dump as YAML stream: Optional stream to write to (if None, returns a string) Dumper: Optional custom YAML dumper class **kwds: Additional keyword arguments for yaml.dump Returns: The YAML string if stream is None, otherwise None """ Dumper = Dumper or QuotePreservingDumper # Default to no flow style and preserve key order kwds.setdefault('default_flow_style', False) kwds.setdefault('sort_keys', False) kwds.setdefault('allow_unicode', True) return yaml.dump(data, stream, Dumper=Dumper, **kwds) def add_representer(data_type, representer, Dumper=None): """Add a custom representer for a specific data type.""" Dumper = Dumper or QuotePreservingDumper Dumper.add_representer(data_type, representer) ================================================ FILE: keep/identitymanager/authenticatedentity.py ================================================ from typing import Optional from pydantic import ConfigDict from pydantic.dataclasses import dataclass @dataclass(config=ConfigDict(extra="allow")) class AuthenticatedEntity: """ Represents an authenticated entity in the system. This class is designed to be expandable. Different identity providers can add additional fields as needed. For example, a Keycloak implementation might add an 'org_id' field. Attributes: tenant_id (str): The ID of the tenant this entity belongs to. 
email (str): The email address of the authenticated entity. api_key_name (Optional[str]): The name of the API key used for authentication, if applicable. role (Optional[str]): The role of the authenticated entity, if applicable. Note: The `config=ConfigDict(extra="allow")` parameter allows for additional attributes to be added dynamically, making this class flexible for different authentication implementations. """ tenant_id: str email: str api_key_name: Optional[str] = None role: Optional[str] = None ================================================ FILE: keep/identitymanager/authverifierbase.py ================================================ import datetime import logging from typing import Optional from fastapi import Depends, HTTPException, Request, Security from fastapi.security import ( APIKeyHeader, HTTPAuthorizationCredentials, HTTPBasic, OAuth2PasswordBearer, ) from starlette.datastructures import FormData from keep.api.core.config import config from keep.api.core.db import get_api_key, update_key_last_used from keep.api.core.dependencies import extract_generic_body from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.rbac import Admin as AdminRole from keep.identitymanager.rbac import get_role_by_role_name auth_header = APIKeyHeader(name="X-API-KEY", scheme_name="API Key", auto_error=False) http_basic = HTTPBasic(auto_error=False) oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token", auto_error=False) ALL_RESOURCES = set() def get_all_scopes() -> list[str]: """ Get all scopes Returns: list: The list of scopes. """ # read, write, delete and update for every resource: scopes = [] for resource in ALL_RESOURCES: for action in ["read", "write", "delete", "update"]: scopes.append(f"{action}:{resource}") return scopes class AuthVerifierBase: """ Base class for authentication and authorization verification. This class provides a framework for implementing authentication and authorization in FastAPI applications. 
It supports multiple authentication methods including API keys, HTTP Basic Auth, and OAuth2 bearer tokens. Subclasses can override the following methods to customize the authentication and authorization process: - _verify_bearer_token: Implement token-based authentication - _verify_api_key: Customize API key verification - _authorize: Implement custom authorization logic The main entry point is the __call__ method, which handles the entire authentication and authorization flow. Attributes: scopes (list[str]): A list of required scopes for authorization. logger (logging.Logger): Logger for this class. """ def __init__(self, scopes: list[str] = []) -> None: ALL_RESOURCES.update([scope.split(":")[1] for scope in scopes]) self.scopes = scopes self.logger = logging.getLogger(__name__) self.impersonation_enabled = ( config("KEEP_IMPERSONATION_ENABLED", default="false") == "true" ) self.impersonation_user_header = config( "KEEP_IMPERSONATION_USER_HEADER", default="X-KEEP-USER" ) self.impersonation_role_header = config( "KEEP_IMPERSONATION_ROLE_HEADER", default="X-KEEP-ROLE" ) self.impersonation_auto_provision = ( config("KEEP_IMPERSONATION_AUTO_PROVISION", default="false") == "true" ) self.allow_mesh_alert_ingestion = ( config("KEEP_ALLOW_MESH_ALERT_INGESTION", default="false") == "true" ) # hold a cache of the last time an API key was used # the key is the f{tenant_id}:{reference_id} and the value is the last time it was updated self.update_key_interval = config("KEEP_UPDATE_KEY_INTERVAL", default=60) self.key_last_used_updates = {} # check if read only instance self.read_only = config("KEEP_READ_ONLY", default="false") == "true" self.read_only_bypass_keys = config("KEEP_READ_ONLY_BYPASS_KEY", default="") self.read_only_bypass_keys = self.read_only_bypass_keys.split(",") # if read_only is enabled, read_only_bypass_key must be set if self.read_only and not self.read_only_bypass_keys: raise ValueError( "KEEP_READ_ONLY_BYPASS_KEY must be set if KEEP_READ_ONLY is enabled" ) 
def __call__(
    self,
    request: Request,
    api_key: Optional[str] = Security(auth_header),
    authorization: Optional[HTTPAuthorizationCredentials] = Security(http_basic),
    token: Optional[str] = Depends(oauth2_scheme),
    body: dict | bytes | FormData = Depends(extract_generic_body),
) -> AuthenticatedEntity:
    """
    Main entry point for authentication and authorization.

    Args:
        request (Request): The incoming request.
        api_key (Optional[str]): The API key from the header.
        authorization (Optional[HTTPAuthorizationCredentials]): The HTTP basic auth credentials.
        token (Optional[str]): The OAuth2 token.
        body (dict | bytes | FormData): The request body, forwarded to
            authenticate() so it can be logged when no credentials are found.

    Returns:
        AuthenticatedEntity: The authenticated entity.

    Raises:
        HTTPException: If authentication or authorization fails, or if a
            non-read scope is requested on a read-only instance (403).
    """
    self.logger.debug("Starting authentication process")
    # A missing or empty key can never bypass read-only mode, even if the
    # configured bypass list happens to contain an empty entry.
    has_bypass_key = bool(api_key) and api_key in self.read_only_bypass_keys
    if self.read_only and not has_bypass_key:
        # Reject as soon as any required scope is not a "read" scope.
        if any(scope.split(":")[0] != "read" for scope in self.scopes):
            self.logger.error("Read only instance, but non-read scopes requested")
            raise HTTPException(
                status_code=403,
                detail="Read only instance, but non-read scopes requested",
            )

    authenticated_entity = self.authenticate(
        request, api_key, authorization, token, body
    )
    self.logger.debug(
        f"Authentication successful for entity: {authenticated_entity}"
    )
    self.logger.debug("Starting authorization process")
    self.authorize(authenticated_entity)
    self.logger.debug("Authorization successful")
    return authenticated_entity

def authenticate(
    self,
    request: Request,
    api_key: Optional[str],
    authorization: Optional[HTTPAuthorizationCredentials],
    token: Optional[str],
    body: Optional[dict | bytes | FormData] = None,
) -> AuthenticatedEntity:
    """
    Authenticate the request using either token, API key, or HTTP basic auth.

    A bearer token takes precedence; otherwise an API key is extracted from
    the request and verified.

    Args:
        request (Request): The incoming request.
        api_key (Optional[str]): The API key from the header.
        authorization (Optional[HTTPAuthorizationCredentials]): The HTTP basic auth credentials.
        token (Optional[str]): The OAuth2 token.
        body (Optional[dict | bytes | FormData]): The incoming request body,
            used only for logging when no authentication method matches.

    Returns:
        AuthenticatedEntity: The authenticated entity.

    Raises:
        HTTPException: If authentication fails (401).
    """
    self.logger.debug("Attempting authentication")

    if token:
        self.logger.debug("Attempting to authenticate with bearer token")
        try:
            return self._verify_bearer_token(token)
        except HTTPException:
            raise
        except Exception:
            # Any unexpected verifier failure is reported as bad credentials.
            self.logger.exception("Failed to validate token")
            raise HTTPException(
                status_code=401, detail="Invalid authentication credentials"
            )

    api_key = self._extract_api_key(request, api_key, authorization)

    # HACK for cloudwatch without api key for self hosted deployments:
    # _extract_api_key may hand back an already-authenticated entity.
    if isinstance(api_key, AuthenticatedEntity):
        return api_key

    if api_key:
        self.logger.debug("Attempting to authenticate with API key")
        try:
            return self._verify_api_key(request, api_key, authorization)
        except HTTPException:
            raise
        except Exception:
            self.logger.exception("Failed to validate API Key")
            raise HTTPException(
                status_code=401, detail="Invalid authentication credentials"
            )

    # Never write credential-bearing header values to the logs; keep the header
    # names so the request can still be diagnosed.
    sensitive_headers = {"authorization", "proxy-authorization", "cookie", "x-api-key"}
    safe_headers = {
        name: ("<redacted>" if name.lower() in sensitive_headers else value)
        for name, value in request.headers.items()
    }
    self.logger.error(
        "No valid authentication method found",
        extra={
            "headers": safe_headers,
            "body": body,
        },
    )
    raise HTTPException(
        status_code=401, detail="Missing authentication credentials"
    )

def authorize(self, authenticated_entity: AuthenticatedEntity) -> None:
    """
    Authorize the authenticated entity.

    Thin public wrapper that logs and delegates to _authorize, which
    subclasses can override.

    Args:
        authenticated_entity (AuthenticatedEntity): The authenticated entity to authorize.

    Raises:
        HTTPException: If authorization fails.
    """
    self.logger.debug(f"Authorizing entity: {authenticated_entity}")
    self._authorize(authenticated_entity)
def _extract_api_key(
    self,
    request: Request,
    api_key: str,
    authorization: HTTPAuthorizationCredentials,
) -> str | AuthenticatedEntity:
    """
    Extract the API key from various sources in the request.

    Lookup order: the explicit api_key argument, the "api_key" query
    parameter, then the Authorization header (Basic password or Digest
    credentials).

    Args:
        request (Request): The incoming request.
        api_key (str): The API key from the header.
        authorization (HTTPAuthorizationCredentials): The HTTP basic auth credentials.

    Returns:
        str | AuthenticatedEntity: The extracted API key, or an already
        authenticated entity for the two key-less flows handled here
        (mesh alert ingestion and CloudWatch SNS on self-hosted setups).

    Raises:
        HTTPException: If no valid API key is found.
    """
    self.logger.debug("Extracting API key")

    api_key = api_key or request.query_params.get("api_key", None)
    if not api_key:
        # Service-mesh alert ingestion: accept key-less posts to the alert
        # event endpoints when explicitly enabled.
        if self.allow_mesh_alert_ingestion and "/alerts/event" in request.url.path:
            service_name = request.headers.get("X-Service-Name", "unknown")
            self.logger.info(
                "Allowing service alert ingestion from %s on %s",
                service_name,
                request.url.path,
            )
            return AuthenticatedEntity(
                tenant_id="keep",
                email=f"service:{service_name}",
                api_key_name="service",
                role="webhook",
            )

        # A special treatment for CloudWatch SNS Confirmation requests
        if (
            not authorization
            and "Amazon Simple Notification Service Agent"
            in request.headers.get("user-agent", "")
        ):
            self.logger.warning("Got an SNS request without any auth")
            # Parse the flag explicitly: an environment value is a string, and
            # the string "false" is truthy, which would silently enable
            # unauthenticated access.
            raw_flag = config("KEEP_CLOUDWATCH_DISABLE_API_KEY", default=False)
            allow_unauth = str(raw_flag).strip().lower() in ("true", "1", "yes", "on")
            if allow_unauth and request.url.path.endswith(
                "/alerts/event/cloudwatch"
            ):
                tenant_id = request.query_params.get("tenant_id", "keep")
                self.logger.info(
                    f"Allowing unauthenticated access for tenant: {tenant_id} for CloudWatch"
                )
                return AuthenticatedEntity(
                    tenant_id=tenant_id,
                    email="system",
                    api_key_name="webhook",
                    role="webhook",
                )

            # Ask the SNS agent to come back with Basic credentials.
            raise HTTPException(
                status_code=401,
                headers={"WWW-Authenticate": "Basic"},
                detail="Missing API Key",
            )

        auth_header = request.headers.get("Authorization")
        try:
            # auth_header is None when the header is absent -> AttributeError.
            scheme, _, credentials = auth_header.partition(" ")
        except Exception:
            self.logger.error(
                "Failed to parse Authorization header",
                extra={
                    "url": str(request.url),
                    "user-agent": request.headers.get("user-agent"),
                },
            )
            raise HTTPException(status_code=401, detail="Missing API Key")

        if scheme.lower() == "basic":
            # Defensive: without parsed Basic credentials there is no password
            # to use as the key; answer 401 instead of failing on None.
            if authorization is None:
                self.logger.error("Basic scheme without parsed credentials")
                raise HTTPException(status_code=401, detail="Missing API Key")
            api_key = authorization.password
        elif scheme.lower() == "digest":
            if not credentials:
                self.logger.error("Invalid Digest credentials")
                raise HTTPException(
                    status_code=403, detail="Invalid Digest credentials"
                )
            else:
                api_key = credentials
        else:
            self.logger.error(f"Unsupported authentication scheme: {scheme}")
            raise HTTPException(status_code=401, detail="Missing API Key")

    self.logger.debug("API key extracted successfully")
    return api_key

def _verify_api_key(
    self,
    request: Request,
    api_key: str = Security(auth_header),
    authorization: HTTPAuthorizationCredentials = Security(http_basic),
) -> AuthenticatedEntity:
    """
    Verify the API key and return an authenticated entity.

    Also refreshes the key's "last used" timestamp (at most once per
    update_key_interval seconds per key) and, when impersonation is enabled
    and both impersonation headers are present, returns an entity for the
    impersonated user instead of the key owner.

    Args:
        request (Request): The incoming request.
        api_key (str): The API key to verify.
        authorization (HTTPAuthorizationCredentials): The HTTP basic auth credentials.

    Returns:
        AuthenticatedEntity: The authenticated entity.

    Raises:
        HTTPException: If the API key is invalid, or if a non-admin key
            attempts impersonation (401).
    """
    self.logger.debug("Verifying API key")

    tenant_api_key = get_api_key(api_key)
    if not tenant_api_key:
        self.logger.warning("Invalid API Key")
        raise HTTPException(status_code=401, detail="Invalid API Key")

    # Best effort: a failure to record usage must never fail authentication.
    try:
        self.logger.debug("Updating API Key last used")
        cache_key = f"{tenant_api_key.tenant_id}:{tenant_api_key.reference_id}"
        now = datetime.datetime.now()
        # float() tolerates an interval that was read from the environment as a string.
        interval = datetime.timedelta(seconds=float(self.update_key_interval))
        last_update = self.key_last_used_updates.get(cache_key)
        if last_update is not None and last_update > now - interval:
            # Updated recently enough: skip the write.
            self.logger.debug(
                f"API Key last used updated in the last {self.update_key_interval} seconds"
            )
        else:
            # Never recorded, or the cached timestamp is older than the interval.
            update_key_last_used(
                tenant_api_key.tenant_id, reference_id=tenant_api_key.reference_id
            )
            self.key_last_used_updates[cache_key] = now
            self.logger.debug("Successfully updated API Key last used")
    except Exception:
        self.logger.exception("Failed to update API Key last used")

    request.state.tenant_id = tenant_api_key.tenant_id
    self.logger.debug(f"API key verified for tenant: {tenant_api_key.tenant_id}")

    # Impersonation needs the feature flag AND both headers; otherwise the
    # request is authenticated as the API key itself.
    user_name = None
    role = None
    if self.impersonation_enabled:
        user_name = request.headers.get(self.impersonation_user_header)
        role = request.headers.get(self.impersonation_role_header)
    if not user_name or not role:
        return AuthenticatedEntity(
            tenant_api_key.tenant_id,
            tenant_api_key.created_by,
            tenant_api_key.reference_id,
            tenant_api_key.role,
        )

    self.logger.info("Impersonating user")

    # TODO - validate authorization meaning api key X has access to impersonate user Y
    # for now, only admin users can impersonate
    if tenant_api_key.role != AdminRole.get_name():
        self.logger.error("Impersonation not allowed for non-admin users")
        raise HTTPException(
            status_code=401, detail="Impersonation not allowed for non-admin users"
        )

    # auto provision user
    if self.impersonation_auto_provision:
        self.logger.info(f"Auto provisioning user: {user_name}")
        self._provision_user(tenant_api_key.tenant_id, user_name, role)
        self.logger.info(f"User {user_name} provisioned successfully")

    self.logger.info("User impersonated successfully")
    return AuthenticatedEntity(
        tenant_id=tenant_api_key.tenant_id,
        email=user_name,
        api_key_name=None,
        role=role,
    )

def _provision_user(self, tenant_api_key, user_name, role):
    """
    Create a user for impersonation.

    The base implementation only raises; subclasses that support
    auto-provisioning override it.

    Args:
        tenant_api_key: Despite the name, _verify_api_key passes the tenant id
            of the impersonating API key here, not the key object.
        user_name: The name of the user to create.
        role: The role of the user to create.

    Raises:
        NotImplementedError: Always, in this base class.
    """
    raise NotImplementedError(
        "User provisioning not implemented"
        " for {}".format(self.__class__.__name__)
    )
class DbAuthVerifier(AuthVerifierBase):
    """Handles authentication and authorization for single tenant mode"""

    def _verify_bearer_token(self, token: str) -> AuthenticatedEntity:
        """
        Validate an HS256-signed JWT and build the authenticated entity.

        Args:
            token (str): The bearer token to verify.

        Returns:
            AuthenticatedEntity: Entity built from the token's tenant_id,
            email and role claims.

        Raises:
            HTTPException: 401 if the token cannot be decoded or names an
                unknown role, 403 if the role lacks the required scopes.
        """
        # The signing secret comes from the environment, with a well-known
        # fallback that is only acceptable outside production.
        jwt_secret = os.environ.get("KEEP_JWT_SECRET", "jwtsecret")
        if jwt_secret == "jwtsecret":
            self.logger.warning(
                "KEEP_JWT_SECRET environment variable is not set, using default value. Should be set in production."
            )

        try:
            payload = jwt.decode(
                token,
                jwt_secret,
                # Must be a list: with a bare string the allow-list check
                # degrades to a substring test on the algorithm name.
                algorithms=["HS256"],
            )
            tenant_id = payload.get("tenant_id")
            email = payload.get("email")
            role_name = payload.get(
                "role", AdminRole.get_name()
            )  # default to admin for backwards compatibility
            role = get_role_by_role_name(role_name)
        except Exception:
            self.logger.exception("Failed to decode JWT token")
            raise HTTPException(status_code=401, detail="Invalid JWT token")

        # validate scopes
        if not role.has_scopes(self.scopes):
            raise HTTPException(
                status_code=403,
                detail="You don't have the required permissions to access this resource",
            )
        return AuthenticatedEntity(tenant_id, email, None, role_name)

    def _provision_user(self, tenant_id, user_name, role):
        """
        Create the impersonated user if it does not exist yet.

        The user is created with an empty password.

        Args:
            tenant_id: Tenant the user belongs to.
            user_name: Username to create.
            role: Role assigned to the new user.
        """
        if not user_exists(tenant_id, user_name):
            create_user(tenant_id=tenant_id, username=user_name, role=role, password="")
""" # This is a special method that is called when the identity manager is # initialized. It is used to set up the identity manager with the FastAPI self.logger.info("Adding signin endpoint") @app.post("/signin") def signin(body: dict): # block empty passwords (e.g. user provisioned) if not body.get("password"): return JSONResponse( status_code=401, content={"message": "Empty password"}, ) # validate the user/password user = get_user(body.get("username"), body.get("password")) if not user: return JSONResponse( status_code=401, content={"message": "Invalid username or password"}, ) # generate a JWT secret jwt_secret = os.environ.get("KEEP_JWT_SECRET") if not jwt_secret: self.logger.info("missing KEEP_JWT_SECRET environment variable") raise HTTPException(status_code=401, detail="Missing JWT secret") token = jwt.encode( { "email": user.username, "tenant_id": SINGLE_TENANT_UUID, "role": user.role, }, jwt_secret, algorithm="HS256", ) # return the token return { "accessToken": token, "tenantId": SINGLE_TENANT_UUID, "email": user.username, "role": user.role, } self.logger.info("Added signin endpoint") def get_users(self, tenant_id=None) -> list[User]: users = get_users_from_db(tenant_id) users = [ User( email=f"{user.username}", name=user.username, role=user.role, last_login=str(user.last_sign_in) if user.last_sign_in else None, created_at=str(user.created_at), ) for user in users ] return users def create_user( self, user_email: str, user_name: str, password: str, role: str, groups: list ) -> dict: # Username is redundant, but we need it in other auth types # Groups: for future use try: user = create_user_in_db(self.tenant_id, user_email, password, role) return User( email=user_email, name=user_email, role=role, last_login=None, created_at=str(user.created_at), ) except Exception: raise HTTPException(status_code=409, detail="User already exists") def delete_user(self, user_email: str) -> dict: try: delete_user_from_db(user_email) return {"status": "OK"} except Exception: 
class NoAuthVerifier(AuthVerifierBase):
    """Handles authentication and authorization for single tenant mode"""

    def _verify_bearer_token(self, token: str) -> AuthenticatedEntity:
        """
        Resolve tenant and user from a no-auth token; never rejects.

        Two token shapes are understood: one starting with "keepActiveTenant"
        ("<name>=<tenant>&<rest>") and a JSON document carrying "tenant_id"
        and "user_id". Anything that fails to parse falls back to the
        single-tenant defaults. The role is always admin.

        Args:
            token (str): The bearer token sent by the client.

        Returns:
            AuthenticatedEntity: Admin entity for the resolved tenant/user.
        """
        try:
            if token.startswith("keepActiveTenant"):
                # Exactly two "&"-separated parts are expected; anything else
                # raises here and is handled by the fallback below.
                tenant_part, _remainder = token.split("&")
                tenant_id = tenant_part.split("=")[1] or SINGLE_TENANT_UUID
                email = SINGLE_TENANT_EMAIL
            else:
                claims = json.loads(token)
                tenant_id = claims["tenant_id"] or SINGLE_TENANT_UUID
                email = claims["user_id"] or SINGLE_TENANT_EMAIL
            return AuthenticatedEntity(
                tenant_id=tenant_id,
                email=email,
                role=AdminRole.get_name(),
            )
        except Exception:
            # No-auth mode: an unparsable token still authenticates as the
            # default single-tenant admin.
            return AuthenticatedEntity(
                tenant_id=SINGLE_TENANT_UUID,
                email=SINGLE_TENANT_EMAIL,
                role=AdminRole.get_name(),
            )

    def _verify_api_key(
        self,
        request: Request,
        api_key: str,
        authorization: Optional[HTTPAuthorizationCredentials],
    ) -> AuthenticatedEntity:
        """
        Map an API key to its tenant, accepting unknown keys as well.

        Args:
            request (Request): The incoming request (unused).
            api_key (str): The API key presented by the client.
            authorization (Optional[HTTPAuthorizationCredentials]): Unused.

        Returns:
            AuthenticatedEntity: Admin entity for the key's tenant, or for the
            default single tenant when the key is not known.
        """
        key_record = get_api_key(api_key)
        if key_record:
            # for e2e tests where multiple tenants are supported (per tenant api key)
            self.logger.info(f"Using tenant_id: {key_record.tenant_id}")
            resolved_tenant = key_record.tenant_id
        else:
            # this is ok, since we are in noauth mode
            resolved_tenant = SINGLE_TENANT_UUID

        return AuthenticatedEntity(
            tenant_id=resolved_tenant,
            email=SINGLE_TENANT_EMAIL,
            role=AdminRole.get_name(),
        )
class Oauth2proxyAuthVerifier(AuthVerifierBase):
    """Handles authentication and authorization for single tenant mode"""

    def __init__(self, *args, **kwargs):
        """
        Load the oauth2-proxy header names, auto-provisioning flag and the
        external-role -> Keep-role mappings from configuration.
        """
        super().__init__(*args, **kwargs)
        self.oauth2_proxy_user_header = config(
            "KEEP_OAUTH2_PROXY_USER_HEADER", default="x-forwarded-email"
        )
        self.oauth2_proxy_role_header = config(
            "KEEP_OAUTH2_PROXY_ROLE_HEADER", default="x-forwarded-groups"
        )
        # Parse the flag explicitly: a value read from the environment is a
        # string, and the string "false" is truthy.
        raw_auto_create = config("KEEP_OAUTH2_PROXY_AUTO_CREATE_USER", default=True)
        self.auto_create_user = str(raw_auto_create).strip().lower() in (
            "true",
            "1",
            "yes",
            "on",
        )
        # Each env var holds a comma-separated list of external role names that
        # map onto one Keep role.
        self.role_mappings = {}
        for env_var, target_role in [
            ("KEEP_OAUTH2_PROXY_ADMIN_ROLES", "admin"),
            ("KEEP_OAUTH2_PROXY_NOC_ROLES", "noc"),
            ("KEEP_OAUTH2_PROXY_WEBHOOK_ROLES", "webhook"),
        ]:
            roles_str = config(env_var, default="")
            for external_role in roles_str.split(","):
                external_role = external_role.strip()
                if external_role:
                    self.role_mappings[external_role] = target_role
        self.logger.info("Oauth2proxy Auth Verifier initialized")

    def authenticate(
        self,
        request: Request,
        api_key: str,
        authorization: Optional[HTTPAuthorizationCredentials],
        token: Optional[str],
        *args,
        **kwargs,
    ) -> AuthenticatedEntity:
        """
        Authenticate via API key when one is supplied, otherwise via the
        user/role headers injected by oauth2-proxy.

        Args:
            request (Request): The incoming request.
            api_key (str): The API key from the header, if any.
            authorization (Optional[HTTPAuthorizationCredentials]): The HTTP basic auth credentials.
            token (Optional[str]): The OAuth2 token (not used by this verifier).

        Returns:
            AuthenticatedEntity: The authenticated entity.

        Raises:
            HTTPException: 401 if the user or role header is missing, 403 if
                none of the supplied roles maps to a known Keep role.
        """
        # If we have an api key or an authorization header, try that first.
        if api_key or request.headers.get("Authorization"):
            try:
                api_key = self._extract_api_key(request, api_key, authorization)
                if api_key:
                    self.logger.info("Attempting to authenticate with API key")
                    try:
                        return self._verify_api_key(request, api_key, authorization)
                    except HTTPException:
                        raise
                    except Exception:
                        self.logger.exception("Failed to validate API Key")
                        raise HTTPException(
                            status_code=401,
                            detail="Invalid authentication credentials",
                        )
            except Exception:
                # Deliberate fall-through: if API-key authentication fails we
                # try the proxy headers, which answer 401 themselves when they
                # are missing.
                pass

        # https://github.com/keephq/keep/issues/1203
        # get user name
        self.logger.info(
            f"Authenticating user with {self.oauth2_proxy_user_header} header"
        )
        user_name = request.headers.get(self.oauth2_proxy_user_header)
        if not user_name:
            raise HTTPException(
                status_code=401,
                detail=f"Unauthorized - no user in {self.oauth2_proxy_user_header} header found",
            )

        role = request.headers.get(self.oauth2_proxy_role_header)
        if not role:
            raise HTTPException(
                status_code=401,
                detail=f"Unauthorized - no role in {self.oauth2_proxy_role_header} header found",
            )

        # The header may carry one role or a comma-separated list,
        # e.g. "org:admin, org:foobar".
        if "," in role:
            roles = [r.strip() for r in role.split(",")]
        else:
            roles = [role]

        # Pick the highest-priority Keep role among the supplied roles.
        role_priority = ["admin", "noc", "webhook"]
        mapped_role = None
        for priority_role in role_priority:
            self.logger.debug(f"Checking for role {priority_role}")
            for candidate in roles:
                self.logger.debug(f"Checking for role {candidate}")
                # map the role if it's a mapped one, or just use the role
                mapped_role_name = self.role_mappings.get(candidate, candidate)
                self.logger.debug(f"Checking for mapped role {mapped_role_name}")
                if mapped_role_name == priority_role:
                    try:
                        self.logger.debug(f"Getting role {mapped_role_name}")
                        mapped_role = get_role_by_role_name(mapped_role_name)
                        self.logger.debug(f"Role {mapped_role_name} found")
                        break
                    except HTTPException:
                        self.logger.debug(f"Role {mapped_role_name} not found")
                        continue
            if mapped_role:
                self.logger.debug(f"Role {mapped_role_name} found")
                break

        # if no valid role was found, throw a 403 exception
        if not mapped_role:
            self.logger.debug(f"No valid role found among {roles}")
            raise HTTPException(
                status_code=403,
                detail=f"No valid role found among {roles}",
            )

        # One lookup decides between provisioning and refreshing the user.
        already_exists = user_exists(tenant_id=SINGLE_TENANT_UUID, username=user_name)
        if not already_exists:
            if self.auto_create_user:
                self.logger.info(f"Auto provisioning user: {user_name}")
                create_user(
                    tenant_id=SINGLE_TENANT_UUID,
                    username=user_name,
                    role=mapped_role.get_name(),
                    password="",
                )
                self.logger.info(f"User {user_name} created")
        else:
            # Both updates are best effort: a bookkeeping failure must not
            # block the login.
            self.logger.debug(f"Updating last login for user: {user_name}")
            try:
                update_user_last_sign_in(
                    tenant_id=SINGLE_TENANT_UUID, username=user_name
                )
                self.logger.debug(f"Last login updated for user: {user_name}")
            except Exception:
                self.logger.warning(
                    f"Failed to update last login for user: {user_name}"
                )

            self.logger.debug(f"Updating role for user: {user_name}")
            try:
                update_user_role(
                    tenant_id=SINGLE_TENANT_UUID,
                    username=user_name,
                    role=mapped_role.get_name(),
                )
                self.logger.debug(f"Role updated for user: {user_name}")
            except Exception:
                self.logger.warning(f"Failed to update role for user: {user_name}")

        self.logger.info(f"User {user_name} authenticated with role {mapped_role}")
        return AuthenticatedEntity(
            tenant_id=SINGLE_TENANT_UUID,
            email=user_name,
            role=mapped_role.get_name(),
        )
class OktaAuthVerifier(AuthVerifierBase):
    """Handles authentication and authorization for Okta"""

    def __init__(self, scopes: list[str] = []) -> None:
        """
        Read the Okta settings from the environment and set up the JWKS client.

        Args:
            scopes (list[str]): Scopes required by the protected endpoint.

        Raises:
            Exception: If neither OKTA_JWKS_URL nor OKTA_ISSUER is set.
        """
        super().__init__(scopes)
        self.okta_issuer = os.environ.get("OKTA_ISSUER")
        self.okta_audience = os.environ.get("OKTA_AUDIENCE")
        self.okta_client_id = os.environ.get("OKTA_CLIENT_ID")
        self.jwks_url = os.environ.get("OKTA_JWKS_URL")

        # If no explicit JWKS URL is provided, we need an issuer to construct it
        if not self.jwks_url and not self.okta_issuer:
            raise Exception("Missing both OKTA_JWKS_URL and OKTA_ISSUER environment variables")

        # Remove trailing slash if present on issuer
        if self.okta_issuer and self.okta_issuer.endswith("/"):
            self.okta_issuer = self.okta_issuer[:-1]

        # Initialize JWKS client - prefer direct JWKS URL if available
        if not self.jwks_url:
            self.jwks_url = f"{self.okta_issuer}/.well-known/jwks.json"

        # At this point, self.jwks_url is guaranteed to be a string
        assert self.jwks_url is not None
        self.jwks_client = jwt.PyJWKClient(self.jwks_url)
        logger.info(f"Initialized JWKS client with URL: {self.jwks_url}")

    def _verify_bearer_token(self, token: str = Depends(oauth2_scheme)) -> AuthenticatedEntity:
        """
        Verify an RS256-signed Okta JWT and build the authenticated entity.

        Args:
            token (str): The bearer token to verify.

        Returns:
            AuthenticatedEntity: Entity carrying tenant, email, role, org
            information and the raw token.

        Raises:
            HTTPException: 401 if the token is missing, expired, invalid,
                cannot be verified, or carries no usable email claim.
        """
        if not token:
            raise HTTPException(status_code=401, detail="No token provided")

        try:
            # Get the signing key directly from the JWT
            signing_key = self.jwks_client.get_signing_key_from_jwt(token).key

            # Decode and verify the token
            payload = jwt.decode(
                token,
                key=signing_key,
                algorithms=["RS256"],
                audience=self.okta_audience or self.okta_client_id,
                issuer=self.okta_issuer,
                options={"verify_exp": True}
            )

            # Extract user info from token with simplified role handling
            tenant_id = payload.get("keep_tenant_id", "keep")  # Default to 'keep' if not specified
            email = payload.get("email") or payload.get("sub") or payload.get("preferred_username")

            # Look for role in standard locations with a default of "user"
            groups = payload.get("groups", [])
            role_name = (
                payload.get("keep_role")
                or payload.get("role")
                or (groups[0] if groups else None)
                or DEFAULT_ROLE_NAME  # Use constant for consistency
            )

            org_id = payload.get("org_id")
            org_realm = payload.get("org_realm")

            if not email:
                raise HTTPException(status_code=401, detail="No email in token")

            logger.info(f"Successfully verified token for user with email: {email}")

            return AuthenticatedEntity(
                tenant_id=tenant_id,
                email=email,
                role=role_name,
                org_id=org_id,
                org_realm=org_realm,
                token=token
            )

        except HTTPException:
            # Deliberate HTTP errors raised above (e.g. "No email in token")
            # must reach the client unchanged, not be logged as an unexpected
            # failure and re-wrapped by the generic handler below.
            raise
        except jwt.exceptions.InvalidKeyError as e:
            logger.error(f"Invalid key error during token validation: {str(e)}")
            raise HTTPException(status_code=401, detail="Invalid signing key - token validation failed")
        except jwt.ExpiredSignatureError:
            logger.warning("Token has expired")
            raise HTTPException(status_code=401, detail="Token has expired")
        except jwt.InvalidTokenError as e:
            logger.warning(f"Invalid token: {str(e)}")
            raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
        except Exception as e:
            logger.exception("Failed to validate token")
            raise HTTPException(status_code=401, detail=f"Token validation failed: {str(e)}")
class OktaIdentityManager(BaseIdentityManager):
    """
    Okta-backed identity manager.

    Only authentication is functional; every user, group and role management
    method is a logged no-op.
    """

    def __init__(self, tenant_id, context_manager: ContextManager, **kwargs):
        """
        Read the Okta settings from the environment and validate them.

        Raises:
            Exception: If OKTA_DOMAIN, OKTA_ISSUER, OKTA_CLIENT_ID or
                OKTA_CLIENT_SECRET is missing.
        """
        super().__init__(tenant_id, context_manager, **kwargs)
        env = os.environ
        self.okta_domain = env.get("OKTA_DOMAIN")
        self.okta_issuer = env.get("OKTA_ISSUER")
        self.okta_client_id = env.get("OKTA_CLIENT_ID")
        self.okta_client_secret = env.get("OKTA_CLIENT_SECRET")
        # API token is not required for basic authentication
        self.okta_api_token = env.get("OKTA_API_TOKEN")

        # Collect the names of all required settings that are unset or empty.
        required_settings = {
            "OKTA_DOMAIN": self.okta_domain,
            "OKTA_ISSUER": self.okta_issuer,
            "OKTA_CLIENT_ID": self.okta_client_id,
            "OKTA_CLIENT_SECRET": self.okta_client_secret,
        }
        missing_vars = [name for name, value in required_settings.items() if not value]
        if missing_vars:
            self.logger.error(f"Missing environment variables: {', '.join(missing_vars)}")
            raise Exception(f"Missing environment variables: {', '.join(missing_vars)}")

        # Remove any trailing slash from issuer
        if self.okta_issuer.endswith("/"):
            self.okta_issuer = self.okta_issuer[:-1]

        self.logger.info("Okta Identity Manager initialized (management functions disabled)")

    def on_start(self, app) -> None:
        """Startup hook; only logs, nothing is registered on the app."""
        self.logger.info("Okta Identity Manager started (roles creation disabled)")

    @property
    def support_sso(self) -> bool:
        """Whether SSO is supported; always True for Okta."""
        return True

    def get_sso_providers(self) -> list[str]:
        """Return the available SSO providers (just "okta")."""
        return ["okta"]

    def get_sso_wizard_url(self, authenticated_entity: AuthenticatedEntity) -> str:
        """Build the SSO wizard URL for the entity's tenant under the issuer."""
        return f"{self.okta_issuer}/sso/{authenticated_entity.tenant_id}"

    def get_users(self) -> list[User]:
        """Disabled: logs the call and returns an empty list."""
        self.logger.info("get_users called but management functions are disabled")
        return []

    def create_user(self, user_email: str, user_name: str, password: str, role: str, groups: list[str] = []) -> dict:
        """Disabled: logs the call and returns a "not_implemented" status."""
        self.logger.info("create_user called but management functions are disabled")
        return {"status": "not_implemented", "message": "User management is disabled"}

    def update_user(self, user_email: str, update_data: dict) -> dict:
        """Disabled: logs the call and returns a "not_implemented" status."""
        self.logger.info("update_user called but management functions are disabled")
        return {"status": "not_implemented", "message": "User management is disabled"}

    def delete_user(self, user_email: str) -> dict:
        """Disabled: logs the call and returns a "not_implemented" status."""
        self.logger.info("delete_user called but management functions are disabled")
        return {"status": "not_implemented", "message": "User management is disabled"}

    def get_auth_verifier(self, scopes: list) -> AuthVerifierBase:
        """Return an OktaAuthVerifier for the given scopes (fully functional)."""
        return OktaAuthVerifier(scopes)

    def get_groups(self) -> list[Group]:
        """Disabled: logs the call and returns an empty list."""
        self.logger.info("get_groups called but management functions are disabled")
        return []

    def create_group(self, group_name: str, members: list[str], roles: list[str]) -> None:
        """Disabled: logs the call and does nothing."""
        self.logger.info("create_group called but management functions are disabled")
        return None

    def update_group(self, group_name: str, members: list[str], roles: list[str]) -> None:
        """Disabled: logs the call and does nothing."""
        self.logger.info("update_group called but management functions are disabled")
        return None

    def delete_group(self, group_name: str) -> None:
        """Disabled: logs the call and does nothing."""
        self.logger.info("delete_group called but management functions are disabled")
        return None

    def create_role(self, role: Role, predefined=False) -> str:
        """Disabled: logs the call and returns an empty string."""
        self.logger.info("create_role called but management functions are disabled")
        return ""

    def get_roles(self) -> list[Role]:
        """Disabled: logs the call and returns an empty list."""
        self.logger.info("get_roles called but management functions are disabled")
        return []

    def delete_role(self, role_id: str) -> None:
        """Disabled: logs the call and does nothing."""
        self.logger.info("delete_role called but management functions are disabled")
        return None
def _verify_bearer_token(self, token: str = Depends(oauth2_scheme)) -> AuthenticatedEntity:
    """
    Validate a OneLogin bearer token and translate it into a Keep identity.

    The token signature is checked against the issuer's JWKS, the groups
    claim is mapped to a Keep role (highest priority first: admin, noc,
    webhook), and the local user record is created or refreshed.

    Args:
        token: The raw JWT taken from the Authorization header.

    Returns:
        AuthenticatedEntity for the single tenant, carrying the mapped role.

    Raises:
        HTTPException: 401 when the token is missing, invalid, expired or
            identifies no user; 403 when none of the token's groups maps
            to a known Keep role.
    """
    if not token:
        raise HTTPException(status_code=401, detail="No token provided")

    try:
        # Resolve the signing key from the token's "kid" header via JWKS
        signing_key = self.jwks_client.get_signing_key_from_jwt(token).key

        payload = jwt.decode(
            token,
            key=signing_key,
            algorithms=["RS256"],
            audience=self.onelogin_client_id,
            issuer=self.onelogin_issuer,
            options={"verify_exp": True},
        )

        user_name = (
            payload.get("email")
            or payload.get("sub")
            or payload.get("preferred_username")
        )
        if not user_name:
            # Without an identifier we would provision a user named None
            self.logger.warning("Token carries no email, sub or preferred_username claim")
            raise HTTPException(
                status_code=401, detail="Token does not identify a user"
            )

        # `or []` also covers an explicit null claim, not just a missing one
        onelogin_groups = payload.get("groups") or []
        # Basic roles arrive as a list; a role mapping arrives as a
        # comma separated string
        if isinstance(onelogin_groups, str):
            onelogin_groups = [g.strip() for g in onelogin_groups.split(",")]
        self.logger.debug(f"OneLogin Groups: {onelogin_groups}")

        # Walk roles from most to least privileged so the strongest match wins
        role_priority = ["admin", "noc", "webhook"]
        mapped_role = None
        self.logger.debug(f"OneLogin to Keep Role Mapping: {self.role_mappings}")
        for role in role_priority:
            for onelogin_grp in onelogin_groups:
                mapped_role_name = self.role_mappings.get(onelogin_grp, "")
                if role != mapped_role_name:
                    continue
                try:
                    mapped_role = get_role_by_role_name(mapped_role_name)
                    self.logger.debug(f"Role {mapped_role_name} found")
                    break
                except HTTPException:
                    self.logger.debug(f"Role {mapped_role_name} not found")
            if mapped_role:
                break

        if not mapped_role:
            self.logger.warning(
                f"No valid role-group mapping found among {onelogin_groups}"
            )
            raise HTTPException(
                status_code=403,
                detail=f"No valid role found among {onelogin_groups}",
            )

        role_name = mapped_role.get_name()
        # Query once; the original asked the database twice per request
        already_exists = user_exists(
            tenant_id=SINGLE_TENANT_UUID, username=user_name
        )
        if self.auto_create_user and not already_exists:
            self.logger.info(f"Auto provisioning user: {user_name}")
            create_user(
                tenant_id=SINGLE_TENANT_UUID,
                username=user_name,
                role=role_name,
                password="",
            )
            self.logger.info(f"User {user_name} created")
        elif already_exists:
            # Both refreshes are best-effort: a failure must not block login
            try:
                update_user_last_sign_in(
                    tenant_id=SINGLE_TENANT_UUID, username=user_name
                )
                self.logger.debug(f"Last login updated for user: {user_name}")
            except Exception:
                self.logger.warning(
                    f"Failed to update last login for user: {user_name}"
                )
            try:
                update_user_role(
                    tenant_id=SINGLE_TENANT_UUID,
                    username=user_name,
                    role=role_name,
                )
                self.logger.debug(f"Role updated for user: {user_name}")
            except Exception:
                self.logger.warning(
                    f"Failed to update role for user: {user_name}"
                )

        self.logger.info(f"User {user_name} authenticated with role {role_name}")
        return AuthenticatedEntity(
            tenant_id=SINGLE_TENANT_UUID,
            email=user_name,
            role=role_name,
            token=token,
        )
    except HTTPException:
        # Deliberate HTTP errors raised above (e.g. the 403 for an unmapped
        # role) must reach the client unchanged instead of being rewritten
        # to a generic 401 by the catch-all below.
        raise
    except jwt.exceptions.InvalidKeyError as e:
        self.logger.error(f"Invalid key error during token validation: {str(e)}")
        raise HTTPException(
            status_code=401, detail="Invalid signing key - token validation failed"
        )
    except jwt.ExpiredSignatureError:
        self.logger.warning("Token has expired")
        raise HTTPException(status_code=401, detail="Token has expired")
    except jwt.InvalidTokenError as e:
        self.logger.warning(f"Invalid token: {str(e)}")
        raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
    except Exception as e:
        self.logger.exception("Failed to validate token")
        raise HTTPException(
            status_code=401, detail=f"Token validation failed: {str(e)}"
        )
def __init__(self, tenant_id, context_manager: ContextManager, **kwargs):
    """
    Load the OneLogin SSO settings from the environment.

    Raises:
        Exception: if ONELOGIN_ISSUER, ONELOGIN_CLIENT_ID or
            ONELOGIN_CLIENT_SECRET is unset or empty.
    """
    super().__init__(tenant_id, context_manager, **kwargs)
    self.logger.info("OneLoginIdentityManager initialized")

    self.onelogin_issuer = os.getenv("ONELOGIN_ISSUER")
    self.onelogin_client_id = os.getenv("ONELOGIN_CLIENT_ID")
    self.onelogin_client_secret = os.getenv("ONELOGIN_CLIENT_SECRET")

    # Only the variables essential for SSO are mandatory
    required = {
        "ONELOGIN_ISSUER": self.onelogin_issuer,
        "ONELOGIN_CLIENT_ID": self.onelogin_client_id,
        "ONELOGIN_CLIENT_SECRET": self.onelogin_client_secret,
    }
    missing_vars = [name for name, value in required.items() if not value]
    if missing_vars:
        self.logger.error(f"Missing environment variables: {', '.join(missing_vars)}")
        raise Exception(f"Missing environment variables: {', '.join(missing_vars)}")

    # Strip one trailing slash from the issuer, if present
    self.onelogin_issuer = self.onelogin_issuer.removesuffix("/")

    self.logger.info("OneLogin Identity Manager initialized for SSO authentication only")
""" self.logger.info("OneLogin Identity Manager started (SSO authentication only)") @property def support_sso(self) -> bool: """Indicate that OneLogin supports SSO""" return True def get_sso_providers(self) -> list[str]: """Get the list of SSO providers""" return ["onelogin"] def get_sso_wizard_url(self, authenticated_entity: AuthenticatedEntity) -> str: """Get the URL for the SSO wizard - redirect to OneLogin login""" return f"{self.onelogin_issuer}/auth" def get_users(self) -> list[User]: """Get all users from OneLogin - disabled""" self.logger.info("get_users called but management functions are disabled") return [] def create_user(self, user_email: str, user_name: str, password: str, role: str, groups: list[str] = []) -> dict: """Create a new user in OneLogin - disabled""" self.logger.info("create_user called but management functions are disabled") return {"status": "not_implemented", "message": "User management is disabled"} def update_user(self, user_email: str, update_data: dict) -> dict: """Update an existing user in OneLogin - disabled""" self.logger.info("update_user called but management functions are disabled") return {"status": "not_implemented", "message": "User management is disabled"} def delete_user(self, user_email: str) -> dict: """Delete a user from OneLogin - disabled""" self.logger.info("delete_user called but management functions are disabled") return {"status": "not_implemented", "message": "User management is disabled"} def get_auth_verifier(self, scopes: list) -> AuthVerifierBase: """Get the auth verifier for OneLogin - this still works""" return OneLoginAuthVerifier(scopes) def get_groups(self) -> list[Group]: """Get all groups from OneLogin - disabled""" self.logger.info("get_groups called but management functions are disabled") return [] def create_group(self, group_name: str, members: list[str], roles: list[str]) -> None: """Create a new group in OneLogin - disabled""" self.logger.info("create_group called but management functions are 
class BaseIdentityManager(metaclass=abc.ABCMeta):
    """
    Common interface for identity managers.

    Subclasses must implement get_users, create_user, delete_user and
    get_auth_verifier; the remaining methods have permissive defaults
    (no-ops, empty results, or the predefined RBAC roles).
    """

    def __init__(self, tenant_id, context_manager: ContextManager = None, **kwargs):
        """
        Args:
            tenant_id: The tenant this manager operates on.
            context_manager: Accepted for subclasses; not stored here.
            **kwargs: Accepted for subclasses; ignored here.
        """
        self.tenant_id = tenant_id
        self.logger = logging.getLogger(__name__)

    def on_start(self, app) -> None:
        """
        Initialize the identity manager.
        Do all the necessary setup for the identity manager.
        The default implementation does nothing.
        """
        pass

    # default identity manager does not support sso
    @property
    def support_sso(self) -> bool:
        """Whether this identity manager supports SSO (False by default)."""
        return False

    def get_sso_providers(self) -> list[str]:
        """
        Get the list of SSO provider names.

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError(
            "get_sso_providers() method not implemented"
            " for {}".format(self.__class__.__name__)
        )

    def get_sso_wizard_url(self, authenticated_entity: AuthenticatedEntity) -> str:
        """
        Get the URL of the SSO setup wizard for the given entity.

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError(
            "get_sso_wizard_url() method not implemented"
            " for {}".format(self.__class__.__name__)
        )

    @abc.abstractmethod
    def get_users(self) -> list[User]:
        """
        Get users

        Returns:
            list: The list of users.
        """
        raise NotImplementedError(
            "get_users() method not implemented"
            " for {}".format(self.__class__.__name__)
        )

    def get_groups(self) -> list:
        """
        Get groups

        Returns:
            list: The list of groups (empty unless overridden).
        """
        # should be implemented by the identity manager
        return []

    @abc.abstractmethod
    def create_user(self, user_email, user_name, password, role, groups=None) -> None:
        """
        Create a user in the identity manager.

        Args:
            user_email (str): The email of the user to create.
            user_name (str): The name of the user to create.
            password (str): The password of the user to create.
            role (str): The role of the user to create.
            groups (list, optional): Groups to assign to the user.
        """

    def update_user(self, user_email: str, update_data: dict):
        """
        Update a user in the identity manager.

        Args:
            user_email (str): The email of the user to update.
            update_data (dict): The fields to change.

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError("update_user() method not implemented")

    @abc.abstractmethod
    def delete_user(self, username: str) -> None:
        """
        Delete a user from the identity manager.

        Args:
            username (str): The name of the user to delete.
        """
        raise NotImplementedError("delete_user() method not implemented")

    @abc.abstractmethod
    def get_auth_verifier(self, scopes: list) -> AuthVerifierBase:
        """
        Get the authentication verifier for the given scopes.

        Args:
            scopes (list): The scopes the verifier must enforce.

        Returns:
            AuthVerifierBase: The authentication verifier.
        """
        raise NotImplementedError(
            "get_auth_verifier() method not implemented"
            " for {}".format(self.__class__.__name__)
        )

    def create_resource(
        self, resource_id: str, resource_name: str, scopes: list[str]
    ) -> None:
        """
        Create a resource in the identity manager for authorization purposes.

        Defines a new resource that can be protected by the authorization
        system. The default implementation does nothing.

        Args:
            resource_id (str): The unique identifier of the resource.
            resource_name (str): The human-readable name of the resource.
            scopes (list): The scopes associated with the resource, defining
                the types of actions that can be performed.
        """
        pass

    def delete_resource(self, resource_id: str) -> None:
        """
        Delete a resource from the identity manager's authorization system.

        The default implementation does nothing.

        Args:
            resource_id (str): The unique identifier of the resource to be deleted.
        """
        pass

    def check_permission(
        self, resource_id: str, scope: str, authenticated_entity: AuthenticatedEntity
    ) -> None:
        """
        Check if the authenticated entity has permission to access the resource.

        The default implementation performs no check and allows everything.

        Args:
            resource_id (str): The unique identifier of the resource being accessed.
            scope (str): The specific action or permission being checked.
            authenticated_entity (AuthenticatedEntity): The entity (user or
                service) requesting access.

        Raises:
            HTTPException: Implementations should raise a 403 when the
                entity lacks the required permission.
        """
        pass

    def create_permissions(self, permissions: list[ResourcePermission]) -> None:
        """
        Create permissions in the identity manager for authorization purposes.

        The default implementation does nothing.

        Args:
            permissions (list): Permission objects, each containing the
                resource, scope, and user or group information.
        """
        pass

    def get_permissions(self) -> list[ResourcePermission]:
        """
        Get the permissions defined in the identity manager.

        Returns:
            list: A list of permission objects (empty unless overridden).
        """
        return []

    def get_user_permission_on_resource_type(
        self, resource_type: str, authenticated_entity: AuthenticatedEntity
    ) -> list[ResourcePermission]:
        """
        Get permissions for a specific user on a specific resource type.

        Note that the default implementation returns None, not a list.

        Args:
            resource_type (str): The type of resource for which to retrieve permissions.
            authenticated_entity (AuthenticatedEntity): The entity whose
                permissions are requested.

        Returns:
            list: A list of permission objects when overridden.
        """
        pass

    def get_roles(self) -> list[Role]:
        """
        Get the predefined roles with wildcard scopes expanded.

        A scope such as "read:*" is expanded into one scope per entry of
        ALL_RESOURCES (e.g. "read:alert"); other scopes are kept as-is.

        Returns:
            list: A list of role objects.
        """
        roles_dto = []
        for role in PREDEFINED_ROLES:
            role_name = role.name
            _role = get_role_by_role_name(role_name)
            # expand scopes so read:* become read:alert, etc
            expanded_scopes = []
            for scope in _role.SCOPES:
                if scope.endswith(":*"):
                    expanded_scopes.extend(
                        f"{scope[:-2]}:{resource}" for resource in ALL_RESOURCES
                    )
                else:
                    expanded_scopes.append(scope)
            roles_dto.append(
                Role(
                    id=role_name,
                    name=role_name,
                    description=_role.DESCRIPTION,
                    scopes=expanded_scopes,
                )
            )
        return roles_dto

    def get_role_by_role_name(self, role_name: str) -> Role:
        """
        Get role by role name.

        Args:
            role_name (str): The name of the role.

        Returns:
            Role: The role object, with scopes left unexpanded.
        """
        _role = get_role_by_role_name(role_name)
        return Role(
            id=role_name,
            name=role_name,
            description=_role.DESCRIPTION,
            scopes=_role.SCOPES,
        )

    def create_role(self, role: Role) -> Role:
        """
        Create a role in the identity manager for authorization purposes.

        Args:
            role (Role): The role to create.

        Returns:
            Role: The same role object, unchanged.
        """
        # default implementation does not support creating roles
        return role
class IdentityManagerFactory:
    """
    Factory class for creating identity managers and authentication verifiers.
    """

    @staticmethod
    def get_identity_manager(
        tenant_id: str = None,
        context_manager: ContextManager = None,
        identity_manager_type: IdentityManagerTypes = None,
        **kwargs,
    ) -> BaseIdentityManager:
        """
        Get an instance of the identity manager based on the specified type.

        Args:
            tenant_id (str, optional): The ID of the tenant.
            context_manager (ContextManager, optional): The context manager instance.
            identity_manager_type (IdentityManagerTypes, optional): The type of
                identity manager to create. When omitted, the AUTH_TYPE
                setting is used, falling back to "noauth".
            **kwargs: Additional keyword arguments to pass to the identity manager.

        Returns:
            BaseIdentityManager: An instance of the specified identity manager.

        Raises:
            NotImplementedError: If no matching identity manager exists.
        """
        if not identity_manager_type:
            identity_manager_type = config(
                "AUTH_TYPE", default=IdentityManagerTypes.NOAUTH.value
            )
        elif isinstance(identity_manager_type, IdentityManagerTypes):
            identity_manager_type = identity_manager_type.value.lower()

        return IdentityManagerFactory._load_manager(
            identity_manager_type,
            "identitymanager",
            tenant_id,
            context_manager,
            **kwargs,
        )

    @staticmethod
    def get_auth_verifier(scopes: list[str] = None) -> AuthVerifierBase:
        """
        Get an instance of the authentication verifier.

        The verifier type is taken from the AUTH_TYPE environment variable,
        falling back to "noauth".

        Args:
            scopes (list[str], optional): A list of scopes for the auth
                verifier. Defaults to a fresh empty list per call.

        Returns:
            AuthVerifierBase: An instance of the authentication verifier.

        Raises:
            NotImplementedError: If no matching verifier exists.
        """
        # A fresh list per call: the verifier keeps a reference to it, so a
        # shared default list would leak state between verifiers.
        if scopes is None:
            scopes = []
        auth_type = os.environ.get(
            "AUTH_TYPE", IdentityManagerTypes.NOAUTH.value
        ).lower()
        return IdentityManagerFactory._load_manager(auth_type, "authverifier", scopes)

    @staticmethod
    def _load_manager(manager_type: str, manager_class: str, *args, **kwargs):
        """
        Load and instantiate a manager class based on the specified type and class.

        The module `<manager_type>_<manager_class>` is looked up first under
        `keep.identitymanager.identity_managers` and then under
        `ee.identitymanager.identity_managers`. Inside the module, the first
        attribute (in `dir()` order) whose lowercase name contains
        `manager_class` and not "base" is instantiated.

        Args:
            manager_type (str): The type of manager to load.
            manager_class (str): The kind of class to load, e.g.
                "identitymanager" or "authverifier".
            *args: Positional arguments to pass to the manager constructor.
            **kwargs: Keyword arguments to pass to the manager constructor.

        Returns:
            The instantiated manager object.

        Raises:
            NotImplementedError: If the module or a matching class cannot be
                found, or an ImportError/AttributeError occurs while loading
                or instantiating it.
        """
        try:
            started = time.time()
            logger.debug(f"Loading {manager_class} for {manager_type}")
            manager_type = (
                IdentityManagerFactory._backward_compatible_get_identity_manager(
                    manager_type
                )
            )
            try:
                module = importlib.import_module(
                    f"keep.identitymanager.identity_managers.{manager_type}.{manager_type}_{manager_class}"
                )
            # look for the module in ee
            except ModuleNotFoundError:
                try:
                    module = importlib.import_module(
                        f"ee.identitymanager.identity_managers.{manager_type}.{manager_type}_{manager_class}"
                    )
                except ModuleNotFoundError:
                    raise NotImplementedError(
                        f"{manager_class} for {manager_type} not implemented"
                    )
            logger.debug(
                f"Loaded {manager_class} for {manager_type} in {time.time() - started} seconds"
            )
            # look for the class that contains the manager_class in its name
            class_name = next(
                (
                    attr
                    for attr in dir(module)
                    if manager_class in attr.lower() and "base" not in attr.lower()
                ),
                None,
            )
            if class_name is None:
                # Previously this surfaced as an UnboundLocalError
                raise NotImplementedError(
                    f"{manager_class} for {manager_type} not implemented"
                )
            # Use a separate name: rebinding `manager_class` to the class
            # object would corrupt the error message built below.
            implementation = getattr(module, class_name)
            instance = implementation(*args, **kwargs)
            logger.debug(
                f"Found class {class_name} in {time.time() - started} seconds"
            )
            return instance
        except (ImportError, AttributeError) as e:
            raise NotImplementedError(
                f"{manager_class} for {manager_type} not implemented"
            ) from e

    @staticmethod
    def _backward_compatible_get_identity_manager(
        auth_type: str = None,
    ):
        """
        Map old auth_type to new IdentityManagerTypes enum.

        Args:
            auth_type (str): The configured auth type, in any letter case.

        Returns:
            str: The lowercase identity manager type; the legacy names
            "single_tenant", "no_auth" and "multi_tenant" are translated to
            "db", "noauth" and "auth0".
        """
        legacy_names = {
            "single_tenant": IdentityManagerTypes.DB.value,
            "no_auth": IdentityManagerTypes.NOAUTH.value,
            "multi_tenant": IdentityManagerTypes.AUTH0.value,
        }
        normalized = auth_type.lower()
        return legacy_names.get(normalized, normalized)
def get_role_by_role_name(role_name: str) -> "type[Role]":
    """
    Resolve a role name to its Role class.

    Args:
        role_name: One of the values of the Roles enum ("admin", "noc",
            "webhook", "workflowrunner").

    Returns:
        The matching Role subclass itself (not an instance and not a list
        of scopes).

    Raises:
        HTTPException: 403 when the name matches no known role.
    """
    known_roles = (
        (Roles.ADMIN, Admin),
        (Roles.NOC, Noc),
        (Roles.WEBHOOK, Webhook),
        (Roles.WORKFLOW_RUNNER, WorkflowRunner),
    )
    # Compare with == instead of a dict lookup so an unhashable argument
    # still ends in the 403 below rather than a TypeError.
    for role_key, role_cls in known_roles:
        if role_name == role_key.value:
            return role_cls
    raise HTTPException(
        status_code=403,
        detail=f"Role {role_name} not found",
    )
def _fn_default(text, render):
    """Render the section; fall back to an empty string when the result is empty."""
    return render(text) or ""


def _fn_na(text, render):
    """Render the section; fall back to "N/A" when the result is empty."""
    return render(text) or "N/A"


def _fn_upper(text, render):
    """Render the section and upper-case it."""
    return render(text).upper()


def _fn_lower(text, render):
    """Render the section and lower-case it."""
    return render(text).lower()


def _fn_strip(text, render):
    """Render the section and trim surrounding whitespace."""
    return render(text).strip()


# Section helpers exposed to templates under the "fn" namespace. Each one
# receives the raw section text and a render callback.
WORKFLOW_HELPERS = {
    "fn": {
        "default": _fn_default,
        "na": _fn_na,
        "upper": _fn_upper,
        "lower": _fn_lower,
        "strip": _fn_strip,
    }
}
i = func_start + 1 # Move i to the character after '(' parent_count = 1 in_string = False escape_next = False quote_char = "" escapes = {} while i < len(text) and (parent_count > 0 or in_string): if text[i] == "\\" and in_string and not escape_next: escape_next = True i += 1 continue elif text[i] in ('"', "'"): if not in_string: # Detecting the beginning of the string in_string = True quote_char = text[i] elif ( text[i] == quote_char and not escape_next and ( str(text[i + 1]).isalnum() == False and str(text[i + 1]) != " " ) # end of statement, arg, etc. if its alpha numeric or whitespace, we just need to escape it ): # Detecting the end of the string # If the next character is not alphanumeric or whitespace, it's the end of the string in_string = False quote_char = "" elif text[i] == quote_char and not escape_next: escapes[i] = text[ i ] # Save the quote character where we need to escape for valid ast parsing elif text[i] == "(" and not in_string: parent_count += 1 elif text[i] == ")" and not in_string: parent_count -= 1 escape_next = False i += 1 if parent_count == 0: matches.append((text[start:i], escapes)) continue # Skip the increment at the end of the loop to continue from the current position else: # If no '(' found, increment i to move past "keep." 
i += 5 else: i += 1 return matches def _trim_token_error(self, token): # trim too long tokens so that the error message will be readable if len(token) > 64: try: func_name = token.split("keep.")[1].split("(")[0] err = f"keep.{func_name}(...)" except Exception: err = token finally: return err else: return token def parse(self, string, safe=False, default="", additional_context=None): """Use AST module to parse 'call stack'-like string and return the result Example - string = "first(split('1 2 3', ' '))" ==> 1 Args: tree (_type_): _description_ Returns: _type_: _description_ """ # break the string to tokens # this will break the following string to 3 tokens: # string - "Number of errors: {{ steps.grep.condition.threshold.compare_to }} # [threshold was set to len({{ steps.grep.condition.threshold.value }})] # Error: split({{ foreach.value }},'a', 'b') # and first(split({{ foreach.value }},'a', 'b'))" # tokens (with {{ expressions }} already rendered) - # len({{ steps.grep.condition.threshold.value }}) # split({{ foreach.value }},'a', 'b') # first(split({{ foreach.value }},'a', 'b')) # first render everything using chevron # inject the context string = self._render(string, safe, default, additional_context) # Now, extract the token if exists parsed_string = copy.copy(string) if string.startswith("raw_render_without_execution(") and string.endswith(")"): tokens = [] string = string.replace("raw_render_without_execution(", "", 1) string = string[::-1].replace(")", "", 1)[::-1] # Remove the last ')' parsed_string = copy.copy(string) else: tokens = self.extract_keep_functions(parsed_string) if len(tokens) == 0: return parsed_string elif len(tokens) == 1: token, escapes = tokens[0] token_to_replace = token try: escapes_counter = 0 if escapes: for escape in escapes: token = ( token[: escape + escapes_counter] + "\\" + token[escape + escapes_counter :] ) escapes_counter += 1 # we need to increment the counter because we added a character val = self._parse_token(token) except 
Exception as e: # trim stacktrace since we have limitation on the error message trimmed_token = self._trim_token_error(token) err_message = str(e).splitlines()[-1] raise Exception( f"Got {e.__class__.__name__} while parsing token '{trimmed_token}': {err_message}" ) # support JSON if isinstance(val, dict): # if the value we need to replace is the whole string, # and its a dict, just return the dict # the usage is for # with: # method: POST # body: # alert: keep.json_loads('{{ alert }}') if parsed_string == token_to_replace: return val else: val = json.dumps(val) else: val = str(val) parsed_string = parsed_string.replace(token_to_replace, val) return parsed_string # this basically for complex expressions with functions and operators tokens_handled = set() for token in tokens: token, escapes = token # imagine " keep.f(..) > 1 and keep.f(..) <2" # so keep.f already handled, we don't want to handle it again if token in tokens_handled: continue token_to_replace = token try: if escapes: for escape in escapes: token = token[:escape] + "\\" + token[escape:] val = self._parse_token(token) except Exception as e: trimmed_token = self._trim_token_error(token) err_message = str(e).splitlines()[-1] raise Exception( f"Got {e.__class__.__name__} while parsing token '{trimmed_token}': {err_message}" ) parsed_string = parsed_string.replace(token_to_replace, str(val)) tokens_handled.add(token_to_replace) return parsed_string def _parse_token(self, token): # else, it contains a function e.g. 
len({{ value }}) or split({{ value }}, 'a', 'b') def _parse(self, tree): if isinstance(tree, ast.Module): return _parse(self, tree.body[0].value) if isinstance(tree, ast.Call): func = tree.func args = tree.args keywords = tree.keywords # Get keyword arguments # Parse positional args _args = [] for arg in args: _arg = None if isinstance(arg, ast.Call): _arg = _parse(self, arg) elif isinstance(arg, ast.Str) or isinstance(arg, ast.Constant): _arg = str(arg.s) elif isinstance(arg, ast.Dict): _arg = ast.literal_eval(arg) elif ( isinstance(arg, ast.Set) or isinstance(arg, ast.List) or isinstance(arg, ast.Tuple) ): _arg = astunparse.unparse(arg).strip() if ( (_arg.startswith("[") and _arg.endswith("]")) or (_arg.startswith("{") and _arg.endswith("}")) or (_arg.startswith("(") and _arg.endswith(")")) ): try: import datetime from dateutil.tz import tzutc g = globals() # we need to pass the classes of the dependencies to the eval for dependency in self.context_manager.dependencies: g[dependency.__name__] = dependency g["tzutc"] = tzutc g["datetime"] = datetime _arg = eval(_arg, g) except ValueError: pass else: _arg = arg.id # if the value is empty '', we still need to pass it to the function # also, if the value is 0 or 0.0, we need to pass it to the function # 0 == False, so we need to check if the value is not False explicitly if ( _arg or _arg == "" or (_arg == 0 or _arg == 0.0) and _arg is not False ): _args.append(_arg) # Parse keyword args _kwargs = {} for keyword in keywords: key = keyword.arg value = keyword.value if isinstance(value, ast.Call): _kwargs[key] = _parse(self, value) elif isinstance(value, ast.Str) or isinstance(value, ast.Constant): _kwargs[key] = str(value.s) elif isinstance(value, ast.Dict): _kwargs[key] = ast.literal_eval(value) elif ( isinstance(value, ast.Set) or isinstance(value, ast.List) or isinstance(value, ast.Tuple) ): parsed_value = astunparse.unparse(value).strip() if ( ( parsed_value.startswith("[") and parsed_value.endswith("]") ) or ( 
parsed_value.startswith("{") and parsed_value.endswith("}") ) or ( parsed_value.startswith("(") and parsed_value.endswith(")") ) ): try: import datetime from dateutil.tz import tzutc g = globals() for dependency in self.context_manager.dependencies: g[dependency.__name__] = dependency g["tzutc"] = tzutc g["datetime"] = datetime _kwargs[key] = eval(parsed_value, g) except ValueError: pass else: _kwargs[key] = value.id # Get the function and its signature keep_func = getattr(keep_functions, func.attr) func_signature = inspect.signature(keep_func) # Add tenant_id if needed if "kwargs" in func_signature.parameters: _kwargs["tenant_id"] = self.context_manager.tenant_id try: # Call function with both positional and keyword arguments val = keep_func(*_args, **_kwargs) except ValueError: # Handle newline escaping if needed _args = [ arg.replace("\n", "\\n") if isinstance(arg, str) else arg for arg in _args ] _kwargs = { k: v.replace("\n", "\\n") if isinstance(v, str) else v for k, v in _kwargs.items() } val = keep_func(*_args, **_kwargs) return val try: tree = ast.parse(token) except SyntaxError as e: if "unterminated string literal" in str(e): # try to HTML escape the string # this is happens when libraries such as datadog api client # HTML escapes the string and then ast.parse fails () # https://github.com/keephq/keep/issues/137 try: unescaped_token = html.unescape( token.replace("\r\n", "") .replace("\n", "") .replace("\\n", "") .replace("\r", "") ) tree = ast.parse(unescaped_token) # try best effort to parse the string # this is some nasty bug. 
see test test_openobserve_rows_bug on test_iohandler # and this ticket - except Exception as e: # for strings such as "45%\n", we need to escape t = ( html.unescape(token.replace("\r\n", "").replace("\n", "")) .replace("\\n", "\n") .replace("\\", "") .replace("\n", "\\n") ) t = self._encode_single_quotes_in_double_quotes(t) try: tree = ast.parse(t) except Exception: # For strings where ' is used as the delimeter and we failed to escape all ' in the string # @tb: again, this is not ideal but it's best effort... t = ( t.replace("('", '("') .replace("')", '")') .replace("',", '",') ) tree = ast.parse(t) else: # for strings such as "45%\n", we need to escape tree = ast.parse(token.encode("unicode_escape")) return _parse(self, tree) def _render(self, key: str, safe=False, default="", additional_context=None): if "{{^" in key or "{{ ^" in key: self.logger.debug( "Safe render is not supported when there are inverted sections." ) safe = False # fn.* helper sections explicitly handle missing/empty keys — the lambda # returns a default value so RenderException must not be raised. if "{{#fn." in key or "{{ #fn." in key: safe = False context = self.context_manager.get_full_context(exclude_providers=True) if additional_context: context.update(additional_context) # Inject workflow helper lambdas so fn.* sections are resolvable. 
context.update(WORKFLOW_HELPERS) stderr_capture = io.StringIO() original_stderr = sys.stderr sys.stderr = stderr_capture try: rendered = self.render_recursively(key, context) # chevron.render will escape the quotes, we need to unescape them rendered = rendered.replace(""", '"') stderr_output = stderr_capture.getvalue() finally: sys.stderr = original_stderr # If render should failed if value does not exists if safe and "Could not find key" in stderr_output: # if more than one keys missing, pretiffy the error if stderr_output.count("Could not find key") > 1: missing_keys = stderr_output.split("Could not find key") missing_keys = [ missing_key.strip().replace("\n", "") for missing_key in missing_keys[1:] ] missing_keys = list(set(missing_keys)) err = "Could not find keys: " + ", ".join(missing_keys) else: missing_keys = [stderr_output.split("Could not find key")[1].strip()] err = stderr_output.replace("\n", "") raise RenderException(f"{err} in the context.", missing_keys=missing_keys) if not rendered: return default return rendered def _encode_single_quotes_in_double_quotes(self, s): result = [] in_double_quotes = False i = 0 while i < len(s): if s[i] == '"': in_double_quotes = not in_double_quotes elif s[i] == "'" and in_double_quotes: if i > 0 and s[i - 1] == "\\": # If the single quote is already escaped, don't add another backslash result.append(s[i]) else: result.append("\\" + s[i]) i += 1 continue result.append(s[i]) i += 1 return "".join(result) def render_context(self, context_to_render: dict, additional_context: dict = None): """ Iterates the provider context and renders it using the workflow context. 
""" # Don't modify the original context context_to_render = copy.deepcopy(context_to_render) for key, value in context_to_render.items(): if isinstance(value, str): context_to_render[key] = self._render_template_with_context( value, safe=True, additional_context=additional_context ) elif isinstance(value, list): context_to_render[key] = self._render_list_context( value, additional_context=additional_context ) elif isinstance(value, dict): context_to_render[key] = self.render_context( value, additional_context=additional_context ) elif isinstance(value, StepProviderParameter): safe = value.safe and value.default is not None context_to_render[key] = self._render_template_with_context( value.key, safe=safe, default=value.default, additional_context=additional_context, ) return context_to_render def _render_list_context( self, context_to_render: list, additional_context: dict = None ): """ Iterates the provider context and renders it using the workflow context. """ for i in range(0, len(context_to_render)): value = context_to_render[i] if isinstance(value, str): context_to_render[i] = self._render_template_with_context( value, safe=True, additional_context=additional_context ) if isinstance(value, list): context_to_render[i] = self._render_list_context( value, additional_context=additional_context ) if isinstance(value, dict): context_to_render[i] = self.render_context( value, additional_context=additional_context ) return context_to_render def _render_template_with_context( self, template: str, safe: bool = False, default: str = "", additional_context: dict = None, ) -> str: """ Renders a template with the given context. 
Args: template (str): template (string) to render Returns: str: rendered template """ rendered_template = self.render( template, safe, default, additional_context=additional_context ) # shorten urls if enabled if self.shorten_urls: rendered_template = self.__patch_urls(rendered_template) return rendered_template def __patch_urls(self, rendered_template: str) -> str: """ shorten URLs found in the message. Args: rendered_template (str): The rendered template that might contain URLs """ urls = re.findall( r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/?.*", rendered_template ) # didn't find any url if not urls: return rendered_template shortened_urls = self.__get_short_urls(urls) for url, shortened_url in shortened_urls.items(): rendered_template = rendered_template.replace(url, shortened_url) return rendered_template def __get_short_urls(self, urls: list) -> dict: """ Shorten URLs using Keep API. Args: urls (list): list of urls to shorten Returns: dict: a dictionary containing the original url as key and the shortened url as value """ try: api_url = self.context_manager.click_context.params.get("api_url") api_key = self.context_manager.click_context.params.get("api_key") response = requests.post( f"{api_url}/s", json=urls, headers={"x-api-key": api_key} ) if response.ok: return response.json() else: self.logger.error( "Failed to request short URLs from API", extra={ "response": response.text, "status_code": response.status_code, }, ) except Exception: self.logger.exception("Failed to request short URLs from API") def render_recursively( self, template: str, context: dict, max_iterations: int = 10 ) -> str: """ Recursively render a template until there are no more mustache tags or max iterations reached. 
Args: template: The template string containing mustache tags context: The context dictionary for rendering max_iterations: Maximum number of rendering iterations to prevent infinite loops Returns: The fully rendered string """ current = template iterations = 0 while iterations < max_iterations: rendered = chevron.render( current, context, warn=True if iterations == 0 else False ) # https://github.com/keephq/keep/issues/2326 rendered = html.unescape(rendered) # If no more changes or no more mustache tags, we're done # we don't want to render providers. ever, so this is a hack for it for now if rendered == current or "{{" not in rendered or "providers." in rendered: return rendered current = rendered iterations += 1 # Return the last rendered version even if we hit max iterations return current if __name__ == "__main__": # debug & test context_manager = ContextManager("keep") context_manager.steps_context = { "query-keep": {"results": [{"a": 1}, {"b": 2}]}, "postgres-selection": {"results": []}, } iohandler = IOHandler(context_manager) res = iohandler.render( "keep.mul(keep.len({{ steps.query-keep.results }}), keep.len({{ steps.postgres-selection.results }})) > 0" ) from asteval import Interpreter aeval = Interpreter() evaluated_if_met = aeval(res) print(evaluated_if_met) ================================================ FILE: keep/parser/parser.py ================================================ import copy import json import logging import os import re import typing import keyword from keep.actions.actions_factory import ActionsCRUD from keep.api.core.config import config from keep.api.core.db import get_installed_providers, get_workflow_id from keep.contextmanager.contextmanager import ContextManager from keep.functions import cyaml from keep.providers.providers_factory import ProvidersFactory from keep.step.step import Step, StepType from keep.step.step_provider_parameter import StepProviderParameter from keep.workflowmanager.workflow import Workflow, 
WorkflowStrategy class Parser: def __init__(self): self.logger = logging.getLogger(__name__) self._loaded_providers_cache = {} self._use_loaded_provider_cache = config( "KEEP_USE_PROVIDER_CACHE", default=False ) def _get_workflow_id(self, tenant_id, workflow: dict) -> str: """Support both CLI and API workflows Args: workflow (dict): _description_ Raises: ValueError: _description_ Returns: str: _description_ """ # for backward compatibility reasons, the id on the YAML is actually the name # and the id is a unique generated id stored in the db workflow_name = workflow.get("id") if workflow_name is None: raise ValueError("Workflow dict must have an id") # get the workflow id from the database workflow_id = get_workflow_id(tenant_id, workflow_name) # if the workflow id is not found, it means that the workflow is not stored in the db # for example when running from CLI # so for backward compatibility, we will use the workflow name as the id # todo - refactor CLI to use db also if not workflow_id: workflow_id = workflow_name return workflow_id def parse( self, tenant_id, parsed_workflow_yaml: dict, providers_file: str = None, actions_file: str = None, workflow_db_id: str = None, workflow_revision: int = None, is_test: bool = False, ) -> typing.List[Workflow]: """_summary_ Args: parsed_workflow_yaml (str): could be a url or a file path providers_file (str, optional): _description_. Defaults to None. 
Returns: typing.List[Workflow]: _description_ """ # Parse the workflow itself (the alerts here is backward compatibility) workflow_providers = parsed_workflow_yaml.get("providers") workflow_actions = parsed_workflow_yaml.get("actions") if parsed_workflow_yaml.get("workflows") or parsed_workflow_yaml.get("alerts"): raw_workflows = parsed_workflow_yaml.get( "workflows" ) or parsed_workflow_yaml.get("alerts") workflows = [ self._parse_workflow( tenant_id, workflow, providers_file, workflow_revision, workflow_providers, actions_file, workflow_actions, workflow_db_id, is_test, ) for workflow in raw_workflows ] # the alert here is backward compatibility elif parsed_workflow_yaml.get("workflow") or parsed_workflow_yaml.get("alert"): raw_workflow = parsed_workflow_yaml.get( "workflow" ) or parsed_workflow_yaml.get("alert") workflow = self._parse_workflow( tenant_id, raw_workflow, providers_file, workflow_revision, workflow_providers, actions_file, workflow_actions, workflow_db_id, is_test, ) workflows = [workflow] # else, if it stored in the db, it stored without the "workflow" key else: workflow = self._parse_workflow( tenant_id, parsed_workflow_yaml, providers_file, workflow_revision, workflow_providers, actions_file, workflow_actions, workflow_db_id=workflow_db_id, is_test=is_test, ) workflows = [workflow] return workflows def _get_workflow_provider_types_from_steps_and_actions( self, steps: list[Step], actions: list[Step] ) -> list[str]: provider_types = [] steps_and_actions = [*steps, *actions] for step_or_action in steps_and_actions: try: provider_type = step_or_action.provider.provider_type if provider_type not in provider_types: provider_types.append(provider_type) except Exception: self.logger.warning( "Could not get provider type from step or action", extra={"step_or_action": step_or_action}, ) return provider_types def _parse_workflow( self, tenant_id, workflow: dict, providers_file: str, workflow_revision: int = None, workflow_providers: dict = None, 
actions_file: str = None, workflow_actions: dict = None, workflow_db_id: str = None, is_test: bool = False, ) -> Workflow: self.logger.debug("Parsing workflow") # @tb: we need to remove this id in workflow yaml, it has no real use. # or at least, align it with the id in the DB. workflow_id = workflow_db_id or self._get_workflow_id(tenant_id, workflow) context_manager = ContextManager( tenant_id=tenant_id, workflow_id=workflow_id, workflow=workflow ) # Parse the providers (from the workflow yaml or from the providers directory) self._load_providers_config( tenant_id, context_manager, workflow, providers_file, workflow_providers ) # Parse the actions (from workflow, actions yaml and database) self._load_actions_config( tenant_id, context_manager, workflow, actions_file, workflow_actions ) workflow_name = workflow.get("name", "Untitled") workflow_description = workflow.get("description", "No description") workflow_permissions = workflow.get("permissions", []) workflow_disabled = self.__class__.parse_disabled(workflow) workflow_owners = self._parse_owners(workflow) workflow_tags = self._parse_tags(workflow) workflow_steps = self._parse_steps( context_manager, workflow, workflow_id, workflow_description, workflow_db_id ) workflow_actions = self._parse_actions( context_manager, workflow, workflow_id, workflow_description, workflow_db_id ) workflow_interval = self.parse_interval(workflow) on_failure_action = self._get_on_failure_action(context_manager, workflow) workflow_triggers = self.get_triggers_from_workflow_dict(workflow) workflow_provider_types = ( self._get_workflow_provider_types_from_steps_and_actions( workflow_steps, workflow_actions ) ) workflow_strategy = workflow.get( "strategy", WorkflowStrategy.NONPARALLEL_WITH_RETRY.value ) workflow_consts = workflow.get("consts", {}) workflow_debug = workflow.get("debug", False) workflow_class = Workflow( workflow_id=workflow_id, workflow_revision=workflow_revision, workflow_name=workflow_name, 
workflow_description=workflow_description, workflow_disabled=workflow_disabled, workflow_owners=workflow_owners, workflow_tags=workflow_tags, workflow_interval=workflow_interval, workflow_triggers=workflow_triggers, workflow_steps=workflow_steps, workflow_actions=workflow_actions, on_failure=on_failure_action, context_manager=context_manager, workflow_providers_type=workflow_provider_types, workflow_strategy=workflow_strategy, workflow_consts=workflow_consts, workflow_debug=workflow_debug, workflow_permissions=workflow_permissions, is_test=is_test, ) self.logger.debug("Workflow parsed successfully") return workflow_class def _load_providers_config( self, tenant_id, context_manager: ContextManager, workflow: dict, providers_file: str, workflow_providers: dict = None, ): self.logger.debug("Parsing providers") providers_file = ( providers_file or os.environ.get("KEEP_PROVIDERS_FILE") or "providers.yaml" ) if providers_file and os.path.exists(providers_file): self._parse_providers_from_file(context_manager, providers_file) # if the workflow file itself contain providers (mainly backward compatibility) if workflow_providers: context_manager.providers_context.update(workflow_providers) self._parse_providers_from_env(context_manager) self._load_providers_from_db(context_manager, tenant_id) self.logger.debug("Providers parsed and loaded successfully") def _load_providers_from_db( self, context_manager: ContextManager, tenant_id: str = None ): """_summary_ Args: context_manager (ContextManager): _description_ tenant_id (str, optional): _description_. Defaults to None. Returns: _type_: _description_ """ # If there is no tenant id, e.g. 
running from CLI, no db here self.logger.debug("Loading installed providers to context") if not tenant_id: return # Load installed providers all_providers = ProvidersFactory.get_all_providers() # _use_loaded_provider_cache is a flag to control whether to use the loaded providers cache if not self._loaded_providers_cache or not self._use_loaded_provider_cache: # this should print once when the providers are loaded for the first time self.logger.info("Loading installed providers to workflow") installed_providers = ProvidersFactory.get_installed_providers( tenant_id=tenant_id, all_providers=all_providers, override_readonly=True ) self._loaded_providers_cache = installed_providers self.logger.info("Installed providers loaded successfully") else: self.logger.debug("Using cached loaded providers") # before we can use cache, we need to check if new providers are added or deleted _installed_providers = get_installed_providers(tenant_id=tenant_id) _installed_providers_ids = set([p.id for p in _installed_providers]) _cached_provider_ids = set([p.id for p in self._loaded_providers_cache]) if _installed_providers_ids != _cached_provider_ids: # this should print only when provider deleted/added self.logger.info("Providers cache is outdated, reloading providers") installed_providers = ProvidersFactory.get_installed_providers( tenant_id=tenant_id, all_providers=all_providers, override_readonly=True, ) self._loaded_providers_cache = installed_providers self.logger.info("Providers cache reloaded") else: installed_providers = self._loaded_providers_cache for provider in installed_providers: self.logger.debug("Loading provider", extra={"provider_id": provider.id}) try: provider_name = provider.details.get("name") context_manager.providers_context[provider.id] = provider.details # map also the name of the provider, not only the id # so that we can use the name to reference the provider context_manager.providers_context[provider_name] = provider.details self.logger.debug(f"Provider 
{provider.id} loaded successfully") except Exception as e: self.logger.error( f"Error loading provider {provider.id}", extra={"exception": e} ) self.logger.debug("Installed providers loaded successfully") return installed_providers def _parse_providers_from_env(self, context_manager: ContextManager): """ Parse providers from the KEEP_PROVIDERS environment variables. Either KEEP_PROVIDERS to load multiple providers or KEEP_PROVIDER_ can be used. KEEP_PROVIDERS is a JSON string of the providers config. (e.g. {"slack-prod": {"authentication": {"webhook_url": "https://hooks.slack.com/services/..."}}}) """ providers_json = os.environ.get("KEEP_PROVIDERS") # check if env var is absolute or relative path to a providers json file if providers_json and re.compile(r"^(\/|\.\/|\.\.\/).*\.json$").match( providers_json ): with open(file=providers_json, mode="r", encoding="utf8") as file: providers_json = file.read() if providers_json: try: self.logger.debug( "Parsing providers from KEEP_PROVIDERS environment variable" ) providers_dict = json.loads(providers_json) self._inject_env_variables(providers_dict) context_manager.providers_context.update(providers_dict) self.logger.debug( "Providers parsed successfully from KEEP_PROVIDERS environment variable" ) except json.JSONDecodeError: self.logger.error( "Error parsing providers from KEEP_PROVIDERS environment variable" ) for env in os.environ.keys(): if env.startswith("KEEP_PROVIDER_"): # KEEP_PROVIDER_SLACK_PROD provider_name = ( env.replace("KEEP_PROVIDER_", "").replace("_", "-").lower() ) try: self.logger.debug(f"Parsing provider {provider_name} from {env}") # {'authentication': {'webhook_url': 'https://hooks.slack.com/services/...'}} provider_config = json.loads(os.environ.get(env)) self._inject_env_variables(provider_config) context_manager.providers_context[provider_name] = provider_config self.logger.debug( f"Provider {provider_name} parsed successfully from {env}" ) except json.JSONDecodeError: self.logger.error( f"Error 
parsing provider config from environment variable {env}" ) def _inject_env_variables(self, config): """ Recursively inject environment variables into the config. """ if isinstance(config, dict): for key, value in config.items(): config[key] = self._inject_env_variables(value) elif isinstance(config, list): return [self._inject_env_variables(item) for item in config] elif ( isinstance(config, str) and config.startswith("$(") and config.endswith(")") ): env_var = config[2:-1] env_var_val = os.environ.get(env_var) if not env_var_val: self.logger.warning( f"Environment variable {env_var} not found while injecting into config" ) return config return env_var_val return config def _parse_providers_from_workflow( self, context_manager: ContextManager, workflow: dict ) -> None: context_manager.providers_context.update(workflow.get("providers")) self.logger.debug("Workflow providers parsed successfully") def _parse_providers_from_file( self, context_manager: ContextManager, providers_file: str ): with open(providers_file, "r") as file: try: providers = cyaml.safe_load(file) except cyaml.YAMLError: self.logger.exception(f"Error parsing providers file {providers_file}") raise context_manager.providers_context.update(providers) self.logger.debug("Providers config parsed successfully") def _parse_id(self, workflow) -> str: workflow_id = workflow.get("id") if workflow_id is None: raise ValueError("Workflow ID is required") return workflow_id def _parse_owners(self, workflow) -> typing.List[str]: workflow_owners = workflow.get("owners", []) return workflow_owners def _parse_tags(self, workflow) -> typing.List[str]: workflow_tags = workflow.get("tags", []) return workflow_tags def parse_interval(self, workflow) -> int: # backward compatibility workflow_interval = workflow.get("interval", 0) triggers = workflow.get("triggers", []) for trigger in triggers: if trigger.get("type") == "interval": workflow_interval = trigger.get("value", 0) # Convert time strings to seconds if 
@staticmethod
def parse_provider_parameters(provider_parameters: dict) -> dict:
    """Normalise the ``with:`` parameters of a step or action.

    Args:
        provider_parameters (dict): raw parameters; ``None`` (no ``with:``
            section) is treated as an empty mapping.

    Returns:
        dict: the parameters keyed by a name that is safe to use as a Python
        keyword argument (reserved keywords get a ``_`` suffix). ``str``,
        ``list``, ``int``, ``bool`` and ``float`` values are copied as they
        are. ``dict`` values are turned into ``StepProviderParameter`` when
        they fit its constructor and are kept as plain dicts otherwise. Values
        of any other type are left out.
    """
    parsed_provider_parameters = {}
    # a step may omit `with:` entirely, in which case there is nothing to parse
    for parameter, value in (provider_parameters or {}).items():
        # add suffix _ to provider parameters if it's a reserved keyword in python
        parameter_name = parameter + "_" if keyword.iskeyword(parameter) else parameter

        if isinstance(value, dict):
            try:
                parsed_provider_parameters[parameter_name] = StepProviderParameter(
                    **value
                )
            except Exception:
                # It could be a dict/list but not of ProviderParameter type
                parsed_provider_parameters[parameter_name] = value
        elif isinstance(value, (str, list, int, bool, float)):
            # floats are plain scalars too and must not be dropped
            parsed_provider_parameters[parameter_name] = value
    return parsed_provider_parameters
None = None, workflow_description: str | None = None, workflow_db_id: str | None = None, ) -> typing.List[Step]: self.logger.debug("Parsing steps") workflow_steps = workflow.get("steps", []) workflow_steps_parsed = [] for _step in workflow_steps: provider = self._get_step_provider( context_manager, _step, workflow_id, workflow_description, workflow_db_id, ) provider_parameters = _step.get("provider", {}).get("with") parsed_provider_parameters = Parser.parse_provider_parameters( provider_parameters ) step_id = _step.get("name") step = Step( context_manager=context_manager, step_id=step_id, config=_step, provider=provider, provider_parameters=parsed_provider_parameters, step_type=StepType.STEP, ) workflow_steps_parsed.append(step) self.logger.debug("Steps parsed successfully") return workflow_steps_parsed def _get_step_provider( self, context_manager: ContextManager, _step: dict, workflow_id: str | None = None, workflow_description: str | None = None, workflow_db_id: str | None = None, ) -> dict: step_provider = _step.get("provider") try: step_provider_type = step_provider.pop("type") except AttributeError: raise ValueError("Step provider type is required") try: step_provider_config = step_provider.pop("config") except KeyError: step_provider_config = {"authentication": {}} provider_id, provider_config = self._parse_provider_config( context_manager, step_provider_type, step_provider_config ) try: provider = ProvidersFactory.get_provider( context_manager, provider_id, step_provider_type, provider_config ) except Exception as ex: self.logger.warning( f"Error getting provider {provider_id} for step {_step.get('name')}", exc_info=ex, extra={ "workflow_name": workflow_id, "workflow_description": workflow_description, "provider_id": provider_id, "provider_type": step_provider_type, "provider_config_name": step_provider_config, "workflow_db_id": workflow_db_id, "tenant_id": context_manager.tenant_id, }, ) raise return provider def _load_actions_config( self, tenant_id, 
context_manager: ContextManager, workflow: dict, actions_file: str, workflow_actions: dict = None, ): self.logger.debug("Parsing actions") actions_file = ( actions_file or os.environ.get("KEEP_ACTIONS_FILE") or "actions.yaml" ) if actions_file and os.path.exists(actions_file): self._parse_actions_from_file(context_manager, actions_file) # if the workflow file itself contain actions (mainly backward compatibility) if workflow_actions: for action in workflow_actions: context_manager.actions_context.update( {action.get("use") or action.get("name"): action} ) self._load_actions_from_db(context_manager, tenant_id) self.logger.debug("Actions parsed and loaded successfully") def _parse_actions_from_file( self, context_manager: ContextManager, actions_file: str ): """load actions from file into context manager""" if actions_file and os.path.isfile(actions_file): with open(actions_file, "r") as file: try: actions_content = cyaml.safe_load(file) except cyaml.YAMLError: self.logger.exception(f"Error parsing actions file {actions_file}") raise # create a hashmap -> action for action in actions_content.get("actions", []): context_manager.actions_context.update( {action.get("use") or action.get("name"): action} ) def _load_actions_from_db( self, context_manager: ContextManager, tenant_id: str = None ): # If there is no tenant id, e.g. 
running from CLI, no db here if not tenant_id: return # Load actions from db actions = ActionsCRUD.get_all_actions(tenant_id) for action in actions: self.logger.debug("Loading action", extra={"action_id": action.use}) try: context_manager.actions_context[action.use] = action.details self.logger.debug(f"action {action.use} loaded successfully") except Exception as e: self.logger.error( f"Error loading action {action.use}", extra={"exception": e} ) def _get_action( self, context_manager: ContextManager, action: dict, action_name: str | None = None, workflow_id: str | None = None, workflow_description: str | None = None, workflow_db_id: str | None = None, ) -> Step: name = action_name or action.get("name") provider = action.get("provider", {}) provider_config_name = provider.get("config") provider_parameters = provider.get("with", {}) parsed_provider_parameters = Parser.parse_provider_parameters( provider_parameters ) provider_type = provider.get("type") provider_id, provider_config = self._parse_provider_config( context_manager, provider_type, provider_config_name ) try: provider = ProvidersFactory.get_provider( context_manager, provider_id, provider_type, provider_config, **parsed_provider_parameters, ) except Exception as ex: self.logger.warning( f"Error getting provider {provider_id} for action {name}", exc_info=ex, extra={ "workflow_name": workflow_id, "workflow_description": workflow_description, "provider_id": provider_id, "provider_type": provider_type, "provider_config_name": provider_config_name, "workflow_db_id": workflow_db_id, "tenant_id": context_manager.tenant_id, }, ) raise action = Step( context_manager=context_manager, step_id=name, provider=provider, config=action, provider_parameters=provider_parameters, step_type=StepType.ACTION, ) return action def _parse_actions( self, context_manager: ContextManager, workflow: dict, workflow_id: str | None = None, workflow_description: str | None = None, workflow_db_id: str | None = None, ) -> 
typing.List[Step]: self.logger.debug("Parsing actions") workflow_actions_raw = workflow.get("actions", []) workflow_actions = self._merge_action_by_use( workflow_actions=workflow_actions_raw, actions_context=context_manager.actions_context, ) workflow_actions_parsed = [] for _action in workflow_actions: parsed_action = self._get_action( context_manager, _action, None, workflow_id, workflow_description, workflow_db_id, ) workflow_actions_parsed.append(parsed_action) self.logger.debug("Actions parsed successfully") return workflow_actions_parsed def _load_actions_from_file( self, actions_file: typing.Optional[str] ) -> typing.Mapping[str, dict]: """load actions from file and convert results into a set of unique actions by id""" actions_set = {} if actions_file and os.path.isfile(actions_file): # load actions from a file actions = [] with open(actions_file, "r") as file: try: actions = cyaml.safe_load(file) except cyaml.YAMLError: self.logger.exception(f"Error parsing actions file {actions_file}") raise # convert actions into dictionary of unique object by id for action in actions: action_id = action.get("id") or action.get("name") if action_id or action_id not in actions_set: actions_set[action_id] = action else: self.logger.exception( f"action defined in {actions_file} should have id as unique field" ) else: self.logger.warning( f"No action located at {actions_file}, skip loading reusable actions" ) return actions_set def _merge_action_by_use( self, workflow_actions: typing.List[dict], actions_context: typing.Mapping[str, dict], ) -> typing.Iterable[dict]: """Merge actions from workflow and reusable actions file into one""" for action in workflow_actions: extended_action = actions_context.get(action.get("use"), {}) yield ParserUtils.deep_merge(action, extended_action) def _get_on_failure_action( self, context_manager: ContextManager, workflow: dict ) -> Step | None: """ Parse the on-failure action Args: context_manager (ContextManager): _description_ workflow 
(dict): _description_ Returns: Action | None: _description_ """ self.logger.debug("Parsing on-failure") workflow_on_failure = workflow.get("on-failure", {}) if workflow_on_failure: parsed_action = self._get_action( context_manager=context_manager, action=workflow_on_failure, action_name="on-failure", ) self.logger.debug("Parsed on-failure successfully") return parsed_action self.logger.debug("No on-failure action") def _extract_provider_id(self, context_manager: ContextManager, provider_type: str): """ Translate {{ . }} to a provider id Args: provider_type (str): _description_ Raises: ValueError: _description_ Returns: _type_: _description_ """ # TODO FIX THIS SHIT provider_type = provider_type.split(".") if len(provider_type) != 2: raise ValueError( f"Provider config ({provider_type}) is not valid, should be in the format: {{{{ . }}}} (workflow_id: {context_manager.workflow_id})" ) provider_id = provider_type[1].replace("}}", "").strip() return provider_id def _parse_provider_config( self, context_manager: ContextManager, provider_type: str, provider_config: str | dict | None, ) -> tuple: """ Parse provider config. If the provider config is a dict, return it as is. If the provider config is None, return an empty dict. If the provider config is a string, extract the config from the providers context. * When provider config is either dict or None, provider config id is the same as the provider type. Args: provider_type (str): The provider type provider_config (str | dict | None): The provider config Raises: ValueError: When the provider config is a string and the provider config id is not found in the providers context. Returns: tuple: provider id and provider parsed config """ # Support providers without config such as logfile or mock if isinstance(provider_config, dict): return provider_type, provider_config elif provider_config is None: return provider_type, {"authentication": {}} # extract config when using {{ . 
class ParserUtils:
    """Helpers for merging a workflow action with the reusable action it extends."""

    @staticmethod
    def deep_merge(source: dict, dest: dict) -> dict:
        """Perform deep merge on two objects.

        Values from ``source`` win over values from ``dest``. Nested dicts are
        merged key by key and lists are merged index by index.

        Example:
            source = {"deep1": {"deep2": 1}}
            dest = {"deep1": {"deep2": 2, "deep3": 3}}
            returns -> {"deep1": {"deep2": 1, "deep3": 3}}

        Args:
            source (dict): the object whose values take precedence.
            dest (dict): the object providing the defaults; it is not modified.

        Returns:
            dict: The new object contains merged results
        """
        # make sure not to modify dest object by creating new one
        out = copy.deepcopy(dest)
        ParserUtils._merge(source, out)
        return out

    @staticmethod
    def _merge(ob1: dict, ob2: dict) -> dict:
        """Merge ``ob1`` into ``ob2`` in place; on a duplicate key the value of ``ob1`` wins.

        Args:
            ob1 (dict): values to apply.
            ob2 (dict): object that receives the values (mutated).

        Returns:
            dict: ``ob2``, for convenience.
        """
        for key, value in ob1.items():
            current = ob2.get(key)
            # both sides are dicts: merge them key by key
            if isinstance(value, dict) and isinstance(current, dict):
                ParserUtils._merge(value, current)
            # both sides are lists: merge them index by index
            elif isinstance(value, list) and isinstance(current, list):
                ParserUtils._merge_lists(value, current)
            # missing key, scalar, or mismatching container types: ob1 wins
            else:
                ob2[key] = value
        return ob2

    @staticmethod
    def _merge_lists(values: list, targets: list) -> None:
        """Merge ``values`` into ``targets`` in place, index by index.

        Items of ``targets`` beyond the length of ``values`` are kept as they
        are; items of ``values`` beyond the length of ``targets`` are appended
        instead of being dropped.
        """
        for index, item in enumerate(values):
            if index >= len(targets):
                targets.append(item)
            elif isinstance(item, dict) and isinstance(targets[index], dict):
                ParserUtils._merge(item, targets[index])
            elif isinstance(item, list) and isinstance(targets[index], list):
                ParserUtils._merge_lists(item, targets[index])
            else:
                # scalars (or mismatching types) cannot be merged: the source wins
                targets[index] = item
"" webhook_markdown = """ 💡 For more details on configuring Airflow to send alerts to Keep, refer to the [Keep documentation](https://docs.keephq.dev/providers/documentation/airflow-provider). ### 1. Configure Keep's Webhook Credentials To send alerts to Keep, set up the webhook URL and API key: - **Keep Webhook URL**: {keep_webhook_api_url} - **Keep API Key**: {api_key} ### 2. Configure Airflow to Send Alerts to Keep Airflow uses a callback function to send alerts to Keep. Below is an example configuration: ```python import os import requests def task_failure_callback(context): # Replace with your specific Keep webhook URL if different. keep_webhook_url = "{keep_webhook_api_url}" api_key = "{api_key}" headers = {{ "Content-Type": "application/json", "Accept": "application/json", "X-API-KEY": api_key, }} data = {{ "name": f"Airflow Task Failure", "message": f"Task failed in DAG", "status": "firing", "service": "pipeline", "severity": "critical", }} response = requests.post(keep_webhook_url, headers=headers, json=data) response.raise_for_status() ``` ### 3. Attach the Callback to the DAG Attach the failure callback to the DAG using the `on_failure_callback` parameter: ```python from airflow import DAG from datetime import datetime dag = DAG( dag_id="keep_dag", default_args=default_args, description="A simple DAG with Keep integration", schedule_interval=None, start_date=datetime(2025, 1, 1), catchup=False, on_failure_callback=task_failure_callback, ) ``` """ def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): pass def dispose(self): """ No need to dispose of anything, so just do nothing. 
""" pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: alert = AlertDto( id=event.get("fingerprint"), fingerprint=event.get("fingerprint"), name=event.get("name", "Airflow Alert"), message=event.get("message"), description=event.get("description"), severity=event.get("severity", "critical"), status=event.get("status", "firing"), environment=event.get("environment", "undefined"), service=event.get("service"), source=["airflow"], url=event.get("url"), lastReceived=event.get( "lastReceived", datetime.now(tz=timezone.utc).isoformat() ), labels=event.get("labels", {}), ) return alert ================================================ FILE: keep/providers/aks_provider/aks_provider.py ================================================ import dataclasses import logging import pydantic from azure.identity import ClientSecretCredential from azure.mgmt.containerservice import ContainerServiceClient from kubernetes import client, config from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.functions import cyaml from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig from keep.providers.providers_factory import ProvidersFactory @pydantic.dataclasses.dataclass class AksProviderAuthConfig: """AKS authentication configuration.""" subscription_id: str = dataclasses.field( metadata={ "name": "subscription_id", "description": "The azure subscription id", "required": True, "sensitive": True, } ) client_id: str = dataclasses.field( metadata={ "name": "client_id", "description": "The azure client id", "required": True, "sensitive": True, } ) client_secret: str = dataclasses.field( metadata={ "name": "client_secret", "description": "The azure client secret", "required": True, "sensitive": True, } ) tenant_id: str = dataclasses.field( metadata={ "name": "tenant_id", "description": "The 
azure tenant id", "required": True, "sensitive": True, } ) resource_group_name: str = dataclasses.field( metadata={ "name": "resource_group_name", "description": "The azure aks resource group name", "required": True, "sensitive": True, } ) resource_name: str = dataclasses.field( metadata={ "name": "resource_name", "description": "The azure aks cluster name", "required": True, "sensitive": True, } ) class AksProvider(BaseProvider): """Enrich alerts using data from AKS.""" PROVIDER_DISPLAY_NAME = "Azure AKS" PROVIDER_CATEGORY = ["Cloud Infrastructure"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._client = None def dispose(self): pass def validate_config(self): self.authentication_config = AksProviderAuthConfig(**self.config.authentication) @property def client(self): if self._client is None: self._client = self.__generate_client() return self._client def __generate_client(self): try: # generate credential instance credential = ClientSecretCredential( tenant_id=self.authentication_config.tenant_id, client_id=self.authentication_config.client_id, client_secret=self.authentication_config.client_secret, ) # generate aks client aks_client = ContainerServiceClient( credential=credential, subscription_id=self.authentication_config.subscription_id, ) # get user credential for given cluster name cluster_creds = aks_client.managed_clusters.list_cluster_user_credentials( resource_group_name=self.authentication_config.resource_group_name, resource_name=self.authentication_config.resource_name, ) # parse the kubeconfig (parsed as yml string) kubeconfig = cyaml.safe_load( cluster_creds.kubeconfigs[0].value.decode("utf-8") ) config.load_kube_config_from_dict(config_dict=kubeconfig) self.logger.info("Loading kubeconfig...") return client.CoreV1Api() except Exception as e: raise ProviderException(f"Failed to load kubeconfig: {e}") def _query(self, command_type: str, 
**kwargs: dict): """ Query AKS resources using the Kubernetes client. Args: command_type (str): The command type to operate on the k8s cluster (`get_pods`, `get_pvc`, `get_node_pressure`). """ if command_type == "get_pods": pods = self.client.list_pod_for_all_namespaces(watch=False) return [pod.to_dict() for pod in pods.items] elif command_type == "get_pvc": pvcs = self.client.list_persistent_volume_claim_for_all_namespaces( watch=False ) return [pvc.to_dict() for pvc in pvcs.items] elif command_type == "get_node_pressure": nodes = self.client.list_node(watch=False) node_pressures = [] for node in nodes.items: pressures = { "name": node.metadata.name, "conditions": [], } for condition in node.status.conditions: if condition.type in [ "MemoryPressure", "DiskPressure", "PIDPressure", ]: pressures["conditions"].append(condition.to_dict()) node_pressures.append(pressures) return node_pressures raise NotImplementedError("command type not implemented") if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os config = { "authentication": { "subscription_id": os.environ.get("AKS_SUBSCRIPTION_ID"), "client_secret": os.environ.get("AKS_CLIENT_SECRET"), "client_id": os.environ.get("AKS_CLIENT_ID"), "tenant_id": os.environ.get("AKS_TENANT_ID"), "resource_name": os.environ.get("AKS_RESOURCE_NAME"), "resource_group_name": os.environ.get("AKS_RESOURCE_GROUP_NAME"), } } provider = ProvidersFactory.get_provider( context_manager, provider_id="aks-demo", provider_type="aks", provider_config=config, ) # Query AKS resources using the provider's methods. 
pods = provider.query(command_type="get_pods") pvc = provider.query(command_type="get_pvc") node_pressure = provider.query(command_type="get_node_pressure") print(pods, pvc, node_pressure) ================================================ FILE: keep/providers/amazonsqs_provider/__init__.py ================================================ ================================================ FILE: keep/providers/amazonsqs_provider/amazonsqs_provider.py ================================================ """ Amazonsqs Provider is a class that allows to receive alerts and notify the Amazon SQS Queue """ import dataclasses import inspect import logging import time import uuid from datetime import datetime import boto3 import botocore import pydantic from keep.api.models.alert import AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class AmazonsqsProviderAuthConfig: """ AmazonSQS authentication configuration. """ region_name: str = dataclasses.field( metadata={ "required": True, "description": "Region name", "hint": "Region name: eg. 
class ClientIdInjector(logging.Filter):
    """Logging filter that tags records with the tenant and provider that emitted them.

    The tenant (client) id and provider id are discovered by walking up the
    call stack until a frame holds an ``AmazonsqsProvider`` instance in its
    local variables.
    """

    def filter(self, record):
        """Attach ``extra`` (client_id / provider_id) to ``record`` unless it already has one.

        Always returns True, so no record is ever dropped by this filter.
        """
        client_id, provider_id = self.get_client_id_from_caller()
        if not hasattr(record, "extra"):
            record.extra = {
                "client_id": client_id,
                "provider_id": provider_id,
            }
        return True

    def get_client_id_from_caller(self):
        """Find the calling provider by inspecting the local variables of the call stack.

        Returns:
            tuple: ``(tenant_id, provider_id)`` of the first provider with a
            truthy tenant id, or ``(None, None)`` when none is found.
        """
        frame = inspect.currentframe()
        try:
            while frame:
                # snapshot the values: a frame's locals may change while we iterate
                for candidate in list(frame.f_locals.values()):
                    if isinstance(candidate, AmazonsqsProvider):
                        tenant_id = candidate.context_manager.tenant_id
                        if tenant_id:
                            return tenant_id, candidate.provider_id
                        # only the first provider of a frame is considered
                        break
                frame = frame.f_back
            return None, None
        finally:
            # drop the frame reference explicitly so it cannot keep a
            # reference cycle (and every local of the stack) alive
            del frame
""" pass def validate_config(self): """ Validates required configuration for Amazonsqs provider. """ self.logger.debug("Validating configuration for Amazonsqs provider") self.authentication_config = AmazonsqsProviderAuthConfig( **self.config.authentication ) @property def __get_sqs_client(self): if self.consumer is None: self.consumer = boto3.client( "sqs", region_name=self.authentication_config.region_name, aws_access_key_id=self.authentication_config.access_key_id, aws_secret_access_key=self.authentication_config.secret_access_key, ) return self.consumer def validate_scopes(self) -> dict[str, bool | str]: self.logger.info("Validating user scopes for AmazonSQS provider") scopes = { "authenticated": False, "sqs::read": False, "sqs::write": False, } sts = boto3.client( "sts", region_name=self.authentication_config.region_name, aws_access_key_id=self.authentication_config.access_key_id, aws_secret_access_key=self.authentication_config.secret_access_key, ) try: sts.get_caller_identity() self.logger.info( "User identity fetched successfully, user is authenticated." 
) scopes["authenticated"] = True except botocore.exceptions.ClientError as e: self.logger.error( "Error while getting user identity, authentication failed", extra={"exception": str(e)}, ) scopes["authenticated"] = str(e) return scopes try: self.__write_to_queue( message="KEEP_SCOPE_TEST_MSG_PLEASE_IGNORE", dedup_id=str(uuid.uuid4()), group_id="keep", ) self.logger.info("All scopes verified successfully") scopes["sqs::write"] = True scopes["sqs::read"] = True except botocore.exceptions.ClientError as e: self.logger.error( "User does not have permission to write to SQS queue", extra={"exception": str(e)}, ) scopes["sqs::write"] = str(e) try: self.__read_from_queue() self.logger.info("User has permission to read from SQS Queue") scopes["sqs::read"] = True except botocore.exceptions.ClientError as e: self.logger.error( "User does not have permission to read from SQS queue", extra={"exception": str(e)}, ) scopes["sqs::read"] = str(e) return scopes def __read_from_queue(self): self.logger.info("Getting messages from SQS Queue") try: return self.__get_sqs_client.receive_message( QueueUrl=self.authentication_config.sqs_queue_url, MessageAttributeNames=["All"], MessageSystemAttributeNames=["All"], MaxNumberOfMessages=10, WaitTimeSeconds=10, ) except Exception as e: self.logger.error( "Error while reading from SQS Queue", extra={"exception": str(e)} ) def __write_to_queue(self, message, group_id, dedup_id, **kwargs): try: self.logger.info("Sending message to SQS Queue") message = str(message) group_id = str(group_id) dedup_id = str(dedup_id) is_fifo = self.authentication_config.sqs_queue_url.endswith(".fifo") self.logger.info("Building MessageAttributes") msg_attrs = { key: {"StringValue": kwargs[key], "DataType": "String"} for key in kwargs } if is_fifo: if not dedup_id or not group_id: self.logger.error( "Mandatory to provide dedup_id (Message deduplication ID) & group_id (Message group ID) when pushing to fifo queue" ) raise Exception( "Mandatory to provide dedup_id 
def start_consume(self):
    """Poll the SQS queue and push every received message to Keep as an alert.

    Runs until ``stop_consume`` clears ``self.consume``. Each message is
    converted to an alert dict (message attributes become lower-cased labels),
    pushed with ``_push_alert`` and then deleted from the queue. A message
    that fails to process is logged and left in the queue. A failed read is
    skipped: the loop keeps running and retries on the next iteration.
    """
    self.consume = True
    while self.consume:
        response = self.__read_from_queue()
        if response is None:
            # the read failed (already logged by __read_from_queue);
            # keep the consumer alive and retry after the usual pause
            time.sleep(0.1)
            continue

        messages = response.get("Messages", [])
        if not messages:
            self.logger.info("No messages found. Queue is empty!")

        for message in messages:
            try:
                labels = {}
                attrs = message.get("MessageAttributes", {})
                for msg_attr in attrs:
                    labels[msg_attr.lower()] = attrs[msg_attr].get(
                        "StringValue", attrs[msg_attr].get("BinaryValue", "")
                    )

                # SentTimestamp is in epoch milliseconds
                sent_at = datetime.fromtimestamp(
                    float(message["Attributes"]["SentTimestamp"]) / 1000
                ).isoformat()

                alert_dict = {
                    "id": message["MessageId"],
                    "name": labels.get("name", message["Body"]),
                    "description": labels.get("description", message["Body"]),
                    "message": message["Body"],
                    "status": AmazonsqsProvider.get_status_or_default(
                        labels.get("status", "firing")
                    ),
                    "severity": self.alert_severity_dict.get(
                        labels.get("severity", "high"), AlertSeverity.HIGH
                    ),
                    "lastReceived": sent_at,
                    "firingStartTime": sent_at,
                    "labels": labels,
                    "source": ["amazonsqs"],
                }
                self._push_alert(alert_dict)
                # delete only after the alert was pushed successfully
                self.__delete_from_queue(receipt=message["ReceiptHandle"])
            except Exception as e:
                self.logger.error(f"Error processing message: {e}")

        time.sleep(0.1)
    self.logger.info("Consuming stopped")
self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): self.authentication_config = AnthropicProviderAuthConfig( **self.config.authentication ) def dispose(self): pass def validate_scopes(self) -> dict[str, bool | str]: scopes = {} return scopes def _query( self, prompt, model="claude-3-sonnet-20240229", max_tokens=1024, structured_output_format=None, ): """ Query the Anthropic API with the given prompt and model. Args: prompt (str): The prompt to query the model with. model (str): The model to query. max_tokens (int): The maximum number of tokens to generate. structured_output_format (dict): The structured output format to use. """ client = Anthropic(api_key=self.authentication_config.api_key) messages = [{"role": "user", "content": prompt}] # Handle structured output with system prompt if needed system_prompt = "" if structured_output_format: schema = structured_output_format.get("json_schema", {}) system_prompt = ( f"You must respond with valid JSON that matches this schema: {json.dumps(schema)}\n" "Your response must be parseable JSON and nothing else." 
) response = client.messages.create( model=model, max_tokens=max_tokens, messages=messages, system=system_prompt if system_prompt else None, ) content = response.content[0].text try: content = json.loads(content) except Exception: pass return { "response": content, } if __name__ == "__main__": import os import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) api_key = os.environ.get("ANTHROPIC_API_KEY") config = ProviderConfig( description="Claude Provider", authentication={ "api_key": api_key, }, ) provider = AnthropicProvider( context_manager=context_manager, provider_id="claude_provider", config=config, ) print( provider.query( prompt="Here is an alert, define environment for it: Clients are panicking, nothing works.", model="claude-3-sonnet-20240229", structured_output_format={ "type": "json_schema", "json_schema": { "name": "environment_restoration", "schema": { "type": "object", "properties": { "environment": { "type": "string", "enum": ["production", "debug", "pre-prod"], }, }, "required": ["environment"], "additionalProperties": False, }, "strict": True, }, }, max_tokens=100, ) ) ================================================ FILE: keep/providers/appdynamics_provider/__init__.py ================================================ ================================================ FILE: keep/providers/appdynamics_provider/appdynamics_provider.py ================================================ """ AppDynamics Provider is a class that allows to install webhooks in AppDynamics. 
class ResourceAlreadyExists(Exception):
    """Raised when a resource to be created is already present.

    Accepts the same positional arguments as ``Exception``.
    """
@pydantic.root_validator
def check_password_or_token(cls, values):
    """Require either a username/password pair or an access token.

    Args:
        values (dict): the field values being validated.

    Returns:
        dict: ``values`` unchanged when at least one credential form is present.

    Raises:
        ValueError: when neither a complete username/password pair nor an
            access token is provided.
    """
    has_basic_auth = bool(
        values.get("appDynamicsUsername") and values.get("appDynamicsPassword")
    )
    has_token = bool(values.get("appDynamicsAccessToken"))

    if has_basic_auth or has_token:
        return values

    raise ValueError("Either username/password or access token must be provided")
""" pass def validate_config(self): """ Validates required configuration for AppDynamics provider. """ self.authentication_config = AppdynamicsProviderAuthConfig( **self.config.authentication ) if not self.authentication_config.host.startswith( "https://" ) and not self.authentication_config.host.startswith("http://"): self.authentication_config.host = ( f"https://{self.authentication_config.host}" ) def __get_url(self, paths: List[str] = None, query_params: dict = None, **kwargs): """ Helper method to build the url for AppDynamics api requests. Example: paths = ["issue", "createmeta"] query_params = {"projectKeys": "key1"} url = __get_url("test", paths, query_params) # url = https://baseballxyz.saas.appdynamics.com/rest/api/2/issue/createmeta?projectKeys=key1 """ paths = paths or [] url = urljoin( f"{self.authentication_config.host}/controller", "/".join(str(path) for path in paths), ) # add query params if query_params: url = f"{url}?{urlencode(query_params)}" return url def get_user_id_by_name(self, name: str) -> Optional[str]: self.logger.info("Getting user ID by name") response = requests.get( url=self.__get_url(paths=["controller/api/rbac/v1/users/"]), headers=self.__get_headers(), auth=self.__get_auth(), ) if response.ok: users = response.json() for user in users["users"]: if user["name"].lower() == name.lower(): return user["id"] return None else: self.logger.error( "Error while validating scopes for AppDynamics", extra=response.json() ) def validate_scopes(self) -> dict[str, bool | str]: authenticated = False administrator = "Missing Administrator Privileges" self.logger.info("Validating AppDynamics Scopes") user_id = self.get_user_id_by_name( self.authentication_config.appDynamicsAccountName ) url = self.__get_url( paths=[ "controller/api/rbac/v1/users/", user_id, ] ) response = requests.get( url=url, headers=self.__get_headers(), auth=self.__get_auth(), ) if response.ok: authenticated = True response = response.json() for role in response["roles"]: if ( 
role["name"] == "Account Administrator" or role["name"] == "Administrator" ): administrator = True self.logger.info( "All scopes validated successfully for AppDynamics" ) break else: self.logger.error( "Error while validating scopes for AppDynamics", extra=response.content ) return {"authenticated": authenticated, "administrator": administrator} def __get_headers(self): if self.authentication_config.appDynamicsAccessToken: return { "Authorization": f"Bearer {self.authentication_config.appDynamicsAccessToken}", } def __get_auth(self) -> tuple[str, str]: if ( self.authentication_config.appDynamicsUsername and self.authentication_config.appDynamicsPassword ): return ( f"{self.authentication_config.appDynamicsUsername}@{self.authentication_config.appDynamicsAccountName}", self.authentication_config.appDynamicsPassword, ) def __create_http_response_template(self, keep_api_url: str, api_key: str): keep_api_host, keep_api_path = keep_api_url.rsplit("/", 1) # The httpactiontemplate.json is a template/skeleton for creating a new HTTP Request Action in AppDynamics temp = tempfile.NamedTemporaryFile(mode="w+t", delete=True) template = json.load(open(rf"{Path(__file__).parent}/httpactiontemplate.json")) template[0]["host"] = keep_api_host.lstrip("http://").lstrip("https://") template[0]["path"], template[0]["query"] = keep_api_path.split("?") template[0]["path"] = "/" + template[0]["path"].rstrip("/") template[0]["headers"][0]["value"] = api_key temp.write(json.dumps(template)) temp.seek(0) res = requests.post( self.__get_url(paths=["controller/actiontemplate/httprequest"]), files={"template": temp}, headers=self.__get_headers(), auth=self.__get_auth(), ) res = res.json() temp.close() if res["success"] == "True" or res["success"] is True: self.logger.info("HTTP Response template Successfully Created") else: self.logger.info("HTTP Response template creation failed", extra=res) if "already exists" in res["errors"][0]: self.logger.info( "HTTP Response template creation failed as 
it already exists", extra=res, ) raise ResourceAlreadyExists() raise Exception(res["errors"]) def __create_action(self): response = requests.post( url=self.__get_url( paths=[ "alerting/rest/v1/applications", self.authentication_config.appId, "actions", ] ), headers=self.__get_headers(), auth=self.__get_auth(), json={ "actionType": "HTTP_REQUEST", "name": "KeepAction", "httpRequestTemplateName": "KeepWebhook", "customTemplateVariables": [], }, ) if response.ok: self.logger.info("Action Created") else: response = response.json() self.logger.info("Action Creation failed") if "already exists" in response["message"]: raise ResourceAlreadyExists() raise Exception(response["message"]) def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): try: self.__create_http_response_template( keep_api_url=keep_api_url, api_key=api_key ) except ResourceAlreadyExists: self.logger.info("Template already exists, proceeding with webhook setup") except Exception as e: raise e try: self.__create_action() except ResourceAlreadyExists: self.logger.info("Template already exists, proceeding with webhook setup") except Exception as e: raise e # Listing all policies in the specified app policies_response = requests.get( url=self.__get_url( paths=[ "alerting/rest/v1/applications", self.authentication_config.appId, "policies", ] ), headers=self.__get_headers(), auth=self.__get_auth(), ) policies = policies_response.json() policy_config = { "actionName": "KeepAction", "actionType": "HTTP_REQUEST", } for policy in policies: curr_policy = requests.get( url=self.__get_url( paths=[ "alerting/rest/v1/applications", self.authentication_config.appId, "policies", policy["id"], ] ), headers=self.__get_headers(), auth=self.__get_auth(), ).json() if policy_config not in curr_policy["actions"]: curr_policy["actions"].append(policy_config) if "executeActionsInBatch" not in curr_policy: curr_policy["executeActionsInBatch"] = True new_events_dictionary = {} for 
event_key, event_value in curr_policy["events"].items(): if event_value is None or len(event_value) == 0: continue else: new_events_dictionary[event_key] = event_value curr_policy["events"] = new_events_dictionary request = requests.put( url=self.__get_url( paths=[ "/alerting/rest/v1/applications", self.authentication_config.appId, "policies", policy["id"], ] ), headers=self.__get_headers(), auth=self.__get_auth(), json=curr_policy, ) if not request.ok: self.logger.info("Failed to add Webhook") raise Exception("Could not create webhook") self.logger.info("Webhook created") @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: return AlertDto( id=event["id"], name=event["name"], severity=AppdynamicsProvider.SEVERITIES_MAP.get(event["severity"]), lastReceived=parser.parse(event["lastReceived"]).isoformat(), message=event["message"], description=event["description"], event_id=event["event_id"], url=event["url"], source=["appdynamics"], ) @staticmethod def parse_event_raw_body(raw_body: bytes | dict) -> dict: if isinstance(raw_body, dict): return raw_body return json.loads(raw_body, strict=False) ================================================ FILE: keep/providers/appdynamics_provider/httpactiontemplate.json ================================================ [ { "actionPlanType": "httprequest", "name": "KeepWebhook", "oneRequestPerEvent": false, "eventClampLimit": -1, "defaultCustomProperties": [], "method": "POST", "scheme": "HTTPS", "host": "", "port": 0, "path": "", "query": "", "urlCharset": "UTF_8", "authType": "NONE", "authUsername": null, "authPassword": "", "headers": [ { "id": 0, "version": 0, "name": "X-API-KEY", "value": "" } ], "payloadTemplate": { "httpRequestActionMediaType": "application/json", "charset": "UTF_8", "formDataPairs": [], "payload": "{\"id\": \"${latestEvent.id}\", \"name\": \"${latestEvent.displayName}\", \"severity\": \"${latestEvent.severity}\", \"lastReceived\": \"${latestEvent.eventTime}\", 
\"message\": \"${latestEvent.eventMessage}\", \"description\": \"${latestEvent.summaryMessage}\", \"event_id\": \"${latestEvent.guid}\", \"url\": \"${latestEvent.deepLink}\"}" }, "connectTimeoutInMillis": 5000, "socketTimeoutInMillis": 15000, "maxFollowRedirects": 0, "responseMatchCriteriaAnyTemplate": [], "responseMatchCriteriaNoneTemplate": [], "testLogLevel": "DEBUG", "testPropertiesPairs": [], "eventTypeCountPairs": [] } ] ================================================ FILE: keep/providers/argocd_provider/README.md ================================================ # Instructions for ~~a quick~~ setup ## Setting up ArgoCD ### Installation 1. Spin up Docker Daemon 2. Wait for Kubernetes to start 3. Run the commands below ```bash kubectl create namespace argocd kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml ``` 4. If you're on Mac/Linux ```bash brew install argocd ``` If you're on Windows: Download the executable from here https://github.com/argoproj/argo-cd/releases/latest 5. cd to the `argocd_provider` directory and run this command (this will create a dummy ApplicationSet with applications app1 and app2) ```bash kubectl apply -f applicationset.yaml ``` 6. Run this command to open the configmap ```bash kubectl edit configmap argocd-cm -n argocd ``` 7. Add this to the configmap ```yaml data: accounts.admin: apiKey, login ``` Finally, your configmap should look similar to this ```yaml # Please edit the object below. Lines beginning with a '#' will be ignored, # and an empty file will abort the edit. If an error occurs while saving this file will be # reopened with the relevant failures. 
# apiVersion: v1 ################ This is the new part########### data: accounts.admin: apiKey, login ################################################ kind: ConfigMap metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"v1","kind":"ConfigMap","metadata":{"annotations":{},"labels":{"app.kubernetes.io/name":"argocd-cm","app.kubernetes.io/part-of":"argocd"},"name":"argocd-cm","namespace":"argocd"}} creationTimestamp: "2024-12-27T15:40:06Z" labels: app.kubernetes.io/name: argocd-cm app.kubernetes.io/part-of: argocd name: argocd-cm namespace: argocd resourceVersion: "807860" uid: e2d8722f-e3bc-4299-9bb6-669b2873acdd ``` 8. Restart your server ```bash kubectl rollout restart deployment argocd-server -n argocd ``` 9. Expose the port ```bash kubectl port-forward svc/argocd-server -n argocd 8000:443 ``` 10. Run this to get the initial password and copy it ```bash argocd admin initial-password -n argocd ``` 11. Go to https://localhost:8000, login with credentials Username: admin, Password: `<initial password from step 10>`. 12. Click `+ New App` > `Edit as YAML` > Paste the yaml below > Click `Save` > Click `Create`: ```yaml apiVersion: argoproj.io/v1alpha1 kind: Application metadata: name: application-1 spec: destination: name: '' namespace: default server: https://kubernetes.default.svc source: path: apps repoURL: https://github.com/argoproj/argocd-example-apps.git targetRevision: HEAD sources: [] project: default ``` 13. Find Card `application-1` and click `Sync` > Click `SYNCHRONIZE`. ### Getting Access Token 1. Go to `Settings` > `Accounts` > `Admin` > `Generate New` under tokens, this will generate an access token (copy it). ### Setting up provider 1. Provider Name: UwU 2. Access Token: `<access token generated above>` 3. 
@pydantic.dataclasses.dataclass
class ArgocdProviderAuthConfig:
    """
    Argocd authentication configuration.
    """

    argocd_access_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Argocd Access Token",
            "hint": "Argocd Access Token ",
            "sensitive": True,
        },
    )
    deployment_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "Deployment Url",
            "hint": "Example: https://localhost:8080",
            "validation": "any_http_url",
        },
    )


class ArgocdProvider(BaseTopologyProvider):
    """Pull Applications and ApplicationSets from ArgoCD as Keep topology."""

    PROVIDER_CATEGORY = ["Cloud Infrastructure"]
    PROVIDER_DISPLAY_NAME = "ArgoCD"
    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="User is Authorized",
            mandatory=True,
            mandatory_for_webhook=True,
            alias="Authenticated",
        ),
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        # Resolved lazily (and cached) by the `argocd_host` property.
        self._host = None

    def dispose(self):
        """
        Dispose the provider.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for Argocd provider.
        """
        self.logger.debug("Validating configuration for Argocd provider")
        self.authentication_config = ArgocdProviderAuthConfig(
            **self.config.authentication
        )

    @property
    def argocd_host(self):
        """
        Base URL of the ArgoCD deployment, cached after the first resolution.

        A URL that already carries a scheme is used as-is. Otherwise HTTPS is
        probed first, with a fallback to HTTP on an SSL error.
        """
        self.logger.debug("Fetching Argocd host")
        if self._host:
            self.logger.debug("Returning cached Argocd host")
            return self._host

        deployment_url = self.authentication_config.deployment_url

        # Handle host determination logic with logging
        if deployment_url.startswith(("http://", "https://")):
            self.logger.info("Using supplied Argocd host with protocol")
            self._host = deployment_url
            return self._host

        # Otherwise, attempt to use https
        try:
            self.logger.debug(f"Trying HTTPS for {deployment_url}")
            # Bounded like the other requests in this class so an unreachable
            # host cannot hang the probe.
            requests.get(
                f"https://{deployment_url}",
                verify=False,
                timeout=10,
            )
            self.logger.info("HTTPS protocol confirmed")
            self._host = f"https://{deployment_url}"
        except requests.exceptions.SSLError:
            self.logger.warning("SSL error encountered, falling back to HTTP")
            self._host = f"http://{deployment_url}"
        except Exception as e:
            self.logger.error(
                "Failed to determine Argocd host", extra={"exception": str(e)}
            )
            self._host = deployment_url.rstrip("/")

        return self._host

    @property
    def _headers(self):
        """Bearer-token headers for the ArgoCD API."""
        return {
            "Authorization": f"Bearer {self.authentication_config.argocd_access_token}",
        }

    def __get_url(
        self, paths: List[str] | None = None, query_params: dict = None, **kwargs
    ):
        """
        Helper method to build the url for Argocd api requests.

        Args:
            paths: Path segments appended to ``<host>/api/v1/``.
            query_params: Optional query-string parameters.
        """
        # None instead of a shared mutable default list.
        paths = paths or []
        host = self.argocd_host.rstrip("/").rstrip() + "/api/v1/"
        self.logger.info(f"Building URL with host: {host}")
        url = urljoin(
            host,
            "/".join(str(path) for path in paths),
        )

        # add query params
        if query_params:
            url = f"{url}?{urlencode(query_params)}"

        self.logger.debug(f"Constructed URL: {url}")
        return url

    def validate_scopes(self) -> dict[str, bool | str]:
        """Validate the token by listing applications; returns the error text on failure."""
        self.logger.info("Validating user scopes for Argocd provider")
        authenticated = True
        try:
            self.__pull_applications()
        except Exception as e:
            self.logger.error(
                "Error while validating scope for ArgoCD", extra={"exception": str(e)}
            )
            authenticated = str(e)
        return {
            "authenticated": authenticated,
        }

    def __pull_applications(self):
        """
        Fetch all applications from ArgoCD.

        Raises:
            Exception: when the request fails or returns a non-200 status.
        """
        self.logger.info("Pulling applications from Argocd...")
        try:
            response = requests.get(
                url=self.__get_url(paths=["applications"]),
                headers=self._headers,
                verify=False,
                timeout=10,
            )
            if response.status_code != 200:
                raise Exception(response.text)
            self.logger.info("Successfully pulled all ArgoCD applications")
            return response.json()["items"]
        except Exception as e:
            self.logger.error(
                "Error while getting applications from ArgoCD",
                extra={"exception": str(e)},
            )
            raise

    def __get_relation(self, name: str, namespace: str):
        """
        Fetch the resource-tree nodes of an application.

        Best effort: failures are logged and reported as None so that one
        broken application does not abort the whole topology pull.
        """
        try:
            response = requests.get(
                url=self.__get_url(
                    paths=["applications", name, "resource-tree"],
                    query_params={"appNamespace": namespace},
                ),
                headers=self._headers,
                verify=False,
                timeout=10,
            )
            if response.status_code != 200:
                raise Exception(response.text)
            return response.json()["nodes"]
        except Exception as e:
            self.logger.error(
                "Error while getting resource-tree from ArgoCD",
                extra={"exception": str(e)},
            )
            return None

    def pull_topology(self):
        """
        Build Keep topology services from ArgoCD applications.

        Returns:
            A tuple of (list of TopologyServiceInDto, empty dict).
        """
        service_topology = {}
        for application in self.__pull_applications():
            metadata = application["metadata"]
            spec = application["spec"]
            service_id = metadata["uid"]
            nodes = (
                self.__get_relation(metadata["name"], metadata["namespace"]) or []
            )

            service = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=service_id,
                display_name=metadata["name"],
                repository=self.__get_repository_urls(spec),
            )
            service_topology[service_id] = service

            # Owning ApplicationSets become Keep applications, keyed by uid.
            application_relations = {
                uuid.UUID(owner["uid"]): owner["name"] + "::" + owner["uid"]
                for owner in metadata.get("ownerReferences") or []
                if owner["kind"] == "ApplicationSet"
            }
            if application_relations:
                service.application_relations = application_relations

            # Child Application nodes are recorded as dependencies.
            for node in nodes:
                if node["kind"] != "Application":
                    continue
                uid = node.get("uid")
                if not uid:
                    self.logger.warning("Skipping node with missing 'uid': %s", node)
                    continue
                service.dependencies[uid] = "unknown"
        return list(service_topology.values()), {}

    def __get_repository_urls(self, spec: dict) -> str | None:
        """
        Extract repository URLs from application spec, handling both single and multiple sources.
        Returns a comma-separated string of repository URLs, or None when there are none.
        """
        # An application may carry an empty `sources: []` next to a populated
        # `source`; only prefer `sources` when it actually has entries.
        sources = spec.get("sources") or []
        if not sources and spec.get("source"):
            sources = [spec["source"]]

        repos = [source["repoURL"] for source in sources if source.get("repoURL")]
        return ", ".join(repos) if repos else None
@pydantic.dataclasses.dataclass
class AsanaProviderAuthConfig:
    """
    Asana Provider Auth Config.
    """

    pat_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Personal Access Token for Asana.",
            "sensitive": True,
            "documentation_url": "https://developers.asana.com/docs/personal-access-token",
        }
    )


class AsanaProvider(BaseProvider):
    """
    Asana Provider is a class that provides a way to create tasks in Asana.
    """

    PROVIDER_CATEGORY = ["Collaboration", "Organizational Tools", "Ticketing"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="User is authenticated to Asana.",
            mandatory=True,
        )
    ]
    PROVIDER_TAGS = ["ticketing"]
    PROVIDER_DISPLAY_NAME = "Asana"

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_scopes(self):
        """
        Validate the scopes of the provider by listing the projects visible
        to the configured token.

        Returns:
            {"authenticated": True} on success, otherwise the error text.
        """
        headers = self._generate_auth_headers()
        url = "https://app.asana.com/api/1.0/projects"
        try:
            response = requests.get(url, headers=headers)
            # raise_for_status() is a no-op for non-error statuses.
            response.raise_for_status()
            self.logger.info(
                "Successfully validated scopes", extra={"response": response.json()}
            )
            return {"authenticated": True}
        except Exception as e:
            self.logger.exception("Failed to validate scopes", extra={"exception": e})
            return {"authenticated": str(e)}

    def validate_config(self):
        """
        Validate the configuration of the provider.
        """
        self.authentication_config = AsanaProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        pass

    def _generate_auth_headers(self):
        """
        Generate the authentication headers for the provider.
        """
        return {
            "Authorization": f"Bearer {self.authentication_config.pat_token}",
            "Accept": "application/json",
        }

    def _create_task(self, name: str, projects: typing.List[str], **kwargs: dict):
        """
        Create a task in Asana.

        Returns:
            The "data" object of the Asana response.

        Raises:
            ProviderException: when the request fails.
        """
        headers = self._generate_auth_headers()
        url = "https://app.asana.com/api/1.0/tasks"
        payload = {"data": {"projects": projects, "name": name, **kwargs}}
        try:
            response = requests.post(url, headers=headers, json=payload)
            response.raise_for_status()
            self.logger.info(
                "Successfully created task", extra={"response": response.json()}
            )
            return response.json()["data"]
        except Exception as e:
            self.logger.exception("Failed to create task", extra={"exception": e})
            # Chain the cause so the original traceback is preserved.
            raise ProviderException(str(e)) from e

    def _update_task(self, task_id: str, **kwargs: dict):
        """
        Update a task in Asana.

        Returns:
            The "data" object of the Asana response.

        Raises:
            ProviderException: when the request fails.
        """
        headers = self._generate_auth_headers()
        url = f"https://app.asana.com/api/1.0/tasks/{task_id}"
        payload = {"data": {**kwargs}}
        try:
            response = requests.put(url, headers=headers, json=payload)
            response.raise_for_status()
            self.logger.info(
                "Successfully updated task", extra={"response": response.json()}
            )
            return response.json()["data"]
        except Exception as e:
            self.logger.exception("Failed to update task", extra={"exception": e})
            raise ProviderException(str(e)) from e

    def _notify(self, name: str, projects: typing.List[str], **kwargs: dict):
        """
        Create task in Asana.

        Args:
            name (str): Task Name.
            projects (List[str]): List of Project IDs.
            **kwargs (dict): Apart from the above parameters, you can also provide few other parameters. Refer to the [Asana API documentation](https://developers.asana.com/docs/create-a-task) for more details.
        """
        return self._create_task(name, projects, **kwargs)

    def _query(self, task_id: str, **kwargs: dict):
        """
        Update a task in Asana and return the updated task.

        Args:
            task_id (str): Task ID.
            **kwargs (dict): Apart from the above parameters, you can also provide few other parameters. Refer to the [Asana API documentation](https://developers.asana.com/docs/update-a-task) for more details.
        """
        return self._update_task(task_id, **kwargs)


if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )

    import os

    pat_token = os.getenv("ASANA_PAT_TOKEN")

    config = ProviderConfig(
        description="Asana Provider", authentication={"pat_token": pat_token}
    )

    provider = AsanaProvider(context_manager, "asana_provider", config)
    print(provider._notify("Test Task", ["1234567890"], notes="This is a test task"))
class Auth0Provider(BaseProvider):
    """Enrich alerts with data from Auth0."""

    PROVIDER_DISPLAY_NAME = "Auth0"
    PROVIDER_CATEGORY = ["Identity and Access Management"]

    provider_id: str
    config: ProviderConfig

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """
        Validates required configuration for Auth0 provider.
        """
        self.authentication_config = Auth0ProviderAuthConfig(
            **self.config.authentication
        )

    def _query(self, log_type: str, from_: str = None, **kwargs: dict):
        """
        Query Auth0 logs.

        Args:
            log_type (str): The log type: https://auth0.com/docs/deploy-monitor/logs/log-event-type-codes
            from_ (str, optional): 2023-09-10T11:43:34.213Z for example. Defaults to None.

        Raises:
            Exception: If log_type is empty.
            requests.HTTPError: If Auth0 answers with an error status.

        Returns:
            The decoded JSON body of the Auth0 logs endpoint.
        """
        # Validate before doing any other work.
        if not log_type:
            raise Exception("log_type is required")

        # Tolerate a configured domain with a trailing slash.
        domain = str(self.authentication_config.domain).rstrip("/")
        url = f"{domain}/api/v2/logs"
        headers = {
            "content-type": "application/json",
            "Authorization": f"Bearer {self.authentication_config.token}",
        }
        params = {
            "q": f"type:{log_type}",  # Lucene query syntax to search for logs with type 's' (Success Signup)
            "per_page": 100,  # specify the number of entries per page
        }
        if from_:
            # NOTE(review): the upper bound is the local naive time while the
            # documented `from_` example is UTC ("Z") - confirm the intended timezone.
            params["q"] = (
                f"({params['q']}) AND (date:[{from_} TO {datetime.datetime.now().isoformat()}])"
            )
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        return response.json()

    def dispose(self):
        pass


class Auth0LogsProvider(Auth0Provider):
    """Auth0 logs query that also reports users not seen in a previous run."""

    def _query(self, log_type: str, previous_users: list, **kargs: dict):
        """
        Query Auth0 logs and split out the entries of previously unseen users.

        Args:
            log_type (str): The Auth0 log type to query.
            previous_users (list): User ids that were already known.

        Returns:
            dict with "users" (all user ids in the logs), "new_users" (log
            entries whose user id is not in previous_users) and "new_users_count".
        """
        # Call the parent's _query directly: `query` (used on the provider in
        # this module's __main__) is the public entry point - presumably it
        # dispatches back to this override; verify against BaseProvider.
        logs = super()._query(log_type=log_type, **kargs)
        self.logger.debug(f"Previous users: {previous_users}")
        self.logger.debug(f"New users: {len(logs) - len(previous_users)}")

        new_users = []
        for log in logs:
            if log["user_id"] not in previous_users:
                self.logger.debug(f"New user: {log['user_id']}")
                new_users.append(log)
        return {
            "users": [log["user_id"] for log in logs],
            "new_users": new_users,
            "new_users_count": len(new_users),
        }


if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )
    # Credentials are read from the environment
    config = {
        "authentication": {
            "token": os.environ.get("AUTH0_TOKEN"),
            "domain": os.environ.get("AUTH0_PROVIDER_DOMAIN"),
        },
    }
    # Create the provider
    provider = Auth0Provider(
        context_manager, provider_id="auth0-provider", config=ProviderConfig(**config)
    )
    logs = provider.query(log_type="f", from_="2023-09-10T11:43:34.213Z")
    print(logs)
@pydantic.dataclasses.dataclass
class AxiomProviderAuthConfig:
    """
    Axiom authentication configuration.
    """

    api_token: str = dataclasses.field(
        metadata={"required": True, "sensitive": True, "description": "Axiom API Token"}
    )
    organization_id: Optional[str] = dataclasses.field(
        metadata={
            "required": False,
            "sensitive": False,
            "description": "Axiom Organization ID",
        },
        default=None,
    )


class AxiomProvider(BaseProvider):
    """Enrich alerts with data from Axiom."""

    webhook_documentation_here_differs_from_general_documentation = True
    webhook_description = ""
    webhook_template = ""
    webhook_markdown = """
To send alerts from Axiom to Keep, Use the following webhook url to configure Axiom send alerts to Keep:

1. In Axiom, go to the Monitors tab in the Axiom dashboard.
2. Click on Notifiers in the left sidebar and create a new webhook.
3. Give it a name and select Custom Webhook as kind of notifier with webhook url as {keep_webhook_api_url}.
4. Add 'X-API-KEY' as the request header with the value as {api_key}.
5. Save the webhook.
6. Go to Monitors tab and click on the Monitors in the left sidebar and create a new monitor.
7. Create a new monitor and select the notifier created in the previous step as per your requirement. Refer [Axiom Monitors](https://axiom.co/docs/monitor-data/monitors) to create a new monitor.
8. Save the monitor. Now, you will receive alerts in Keep.
    """

    PROVIDER_DISPLAY_NAME = "Axiom"
    PROVIDER_CATEGORY = ["Monitoring"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for Axiom provider.
        """
        self.authentication_config = AxiomProviderAuthConfig(
            **self.config.authentication
        )

    def _query(
        self,
        dataset=None,
        datasets_api_url=None,
        organization_id=None,
        startTime=None,
        endTime=None,
        **kwargs: dict,
    ):
        """
        Query an Axiom dataset.

        Args:
            dataset (str): Name of the dataset to query (required).
            datasets_api_url (str): Base URL of the datasets API. Falls back to
                kwargs["api_url"], then to https://api.axiom.co/v1/datasets.
            organization_id (str): Axiom organization id. Falls back to the
                configured organization_id (required from one of the two).
            startTime (str): Start of the queried time range.
            endTime (str): End of the queried time range.

        Raises:
            Exception: when organization_id or dataset is missing.

        Returns:
            https://axiom.co/docs/restapi/query#response-example
        """
        datasets_api_url = datasets_api_url or kwargs.get(
            "api_url", "https://api.axiom.co/v1/datasets"
        )
        organization_id = organization_id or self.authentication_config.organization_id
        if not organization_id:
            raise Exception("organization_id is required for Axiom provider")

        if not dataset:
            raise Exception("dataset is required for Axiom provider")

        nocache = kwargs.get("nocache", "true")

        headers = {
            "Authorization": f"Bearer {self.authentication_config.api_token}",
            "X-Axiom-Org-ID": organization_id,
        }

        # Todo: support easier syntax (e.g. 1d, 1h, 1m, 1s, etc)
        body = {"startTime": startTime, "endTime": endTime}

        # Todo: add support for body parameters (https://axiom.co/docs/restapi/query#request-example)
        # Let requests build the query string: the former hand-built URL used
        # "?" twice, which folded "?format=tabular" into the nocache value.
        response = requests.post(
            f"{datasets_api_url}/{dataset}/query",
            params={"nocache": nocache, "format": "tabular"},
            headers=headers,
            json=body,
        )

        # Todo: log response details for better error handling
        return response.json()

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | list[AlertDto]:
        """
        Convert an Axiom monitor webhook payload into an AlertDto.

        Timestamps of the form "YYYY-MM-DD HH:MM:SS[.fraction] +0000 UTC" are
        rewritten as ISO-8601 strings with millisecond precision.
        """

        def convert_to_iso_format(date_str):
            """Return the timestamp as ISO-8601 with a "Z" suffix, or None if missing/unparsable."""
            try:
                dt = datetime.strptime(date_str[:19], "%Y-%m-%d %H:%M:%S")

                # Keep at most three fractional digits, zero-padded.
                if len(date_str) > 19 and date_str[19] == ".":
                    milliseconds = date_str[20:23].ljust(3, "0")
                else:
                    milliseconds = "000"

                return dt.strftime(f"%Y-%m-%dT%H:%M:%S.{milliseconds}Z")
            except (TypeError, ValueError, IndexError):
                # TypeError covers a missing (None) timestamp.
                return None

        action = event.get("action", "Unable to fetch action")
        # Tolerate payloads without an "event" object.
        axiom_event = event.get("event") or {}
        monitorId = axiom_event.get("monitorID")
        body = axiom_event.get("body", "Unable to fetch body")
        description = axiom_event.get("description", "Unable to fetch description")
        title = axiom_event.get("title", "Unable to fetch title")
        value = axiom_event.get("value", "Unable to fetch value")
        matchedEvent = axiom_event.get("matchedEvent", {})

        queryEndTime = convert_to_iso_format(axiom_event.get("queryEndTime"))
        queryStartTime = convert_to_iso_format(axiom_event.get("queryStartTime"))
        timestamp = convert_to_iso_format(axiom_event.get("timestamp"))

        alert = AlertDto(
            action=action,
            id=monitorId,
            name=title,
            body=body,
            description=description,
            queryEndTime=queryEndTime,
            queryStartTime=queryStartTime,
            timestamp=timestamp,
            title=title,
            value=value,
            matchedEvent=matchedEvent,
            startedAt=queryStartTime,
            lastReceived=timestamp,
            monitorId=monitorId,
            source=["axiom"],
        )

        return alert


if __name__ == "__main__":
    # Output debug messages
    import logging

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )
    # Load environment variables
    import os

    api_token = os.environ.get("AXIOM_API_TOKEN")

    config = {
        "authentication": {"api_token": api_token, "organization_id": "keephq-rxpb"},
    }
    provider = ProvidersFactory.get_provider(
        context_manager,
        provider_id="axiom_test",
        provider_type="axiom",
        provider_config=config,
    )
    result = provider.query(dataset="test", startTime="2023-04-26T09:52:04.000Z")
    print(result)
import datetime

from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus
from keep.contextmanager.contextmanager import ContextManager
from keep.providers.base.base_provider import BaseProvider
from keep.providers.models.provider_config import ProviderConfig


class AzuremonitoringProvider(BaseProvider):
    """Get alerts from Azure Monitor into Keep."""

    webhook_description = ""
    webhook_template = ""
    webhook_markdown = """
To send alerts from Azure Monitor to Keep, Use the following webhook url to configure Azure Monitor send alerts to Keep:

1. In Azure Monitor, create a new Action Group.
2. In the Action Group, add a new action of type "Webhook".
3. In the Webhook action, configure the webhook with the following settings.
- **Name**: keep-azuremonitoring-webhook-integration
- **URL**: {keep_webhook_api_url_with_auth}
4. Save the Action Group.
5. In the Alert Rule, configure the Action Group to use the Action Group created in step 1.
6. Save the Alert Rule.
7. Test the Alert Rule to ensure that the alerts are being sent to Keep.
    """

    # Maps Azure Monitor severity to Keep's format
    SEVERITIES_MAP = {
        "Sev0": AlertSeverity.CRITICAL,
        "Sev1": AlertSeverity.HIGH,
        "Sev2": AlertSeverity.WARNING,
        "Sev3": AlertSeverity.INFO,
        "Sev4": AlertSeverity.LOW,
    }

    # Maps Azure Monitor monitor condition to Keep's format
    STATUS_MAP = {
        "Resolved": AlertStatus.RESOLVED,
        "Fired": AlertStatus.FIRING,
    }

    PROVIDER_DISPLAY_NAME = "Azure Monitor"
    PROVIDER_CATEGORY = ["Monitoring", "Cloud Infrastructure"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider.

        Nothing to release; defined because BaseProvider declares dispose()
        abstract and the class cannot be instantiated without it.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for the Azure Monitor provider.
        """
        # no config
        pass

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto:
        """
        Convert an Azure Monitor webhook payload into an AlertDto.

        Args:
            event (dict): Raw webhook payload; ``data.essentials`` and
                ``data.alertContext`` are read from it.
            provider_instance (BaseProvider): Unused.

        Returns:
            AlertDto: The formatted alert. ``url`` is None when the payload's
            alertId does not contain both a subscription and an alert id.
        """
        data = event.get("data") or {}
        essentials = data.get("essentials") or {}
        alert_context = data.get("alertContext") or {}

        # alertId is read as a slash-separated path: the last segment is used
        # as the alert id and segment 2 as the subscription.
        alert_path_parts = (essentials.get("alertId") or "").split("/")
        alert_id = alert_path_parts[-1]
        # Guard the positional lookup: a missing or short alertId used to
        # raise IndexError here.
        subscription = alert_path_parts[2] if len(alert_path_parts) > 2 else ""

        # Format the severity
        severity = AzuremonitoringProvider.SEVERITIES_MAP.get(
            essentials.get("severity"), AlertSeverity.INFO
        )

        # Format the status
        status = AzuremonitoringProvider.STATUS_MAP.get(
            essentials.get("monitorCondition"), AlertStatus.FIRING
        )

        # Parse and format the timestamp
        event_time = essentials.get("firedDateTime", essentials.get("resolvedDateTime"))
        if event_time:
            event_time = datetime.datetime.fromisoformat(event_time)

        # Extract other essential fields
        resource_ids = essentials.get("alertTargetIDs", [])
        description = essentials.get("description", "")

        url = None
        if subscription and alert_id:
            url = f"https://portal.azure.com/#view/Microsoft_Azure_Monitoring_Alerts/AlertDetails.ReactView/alertId~/%2Fsubscriptions%2F{subscription}%2Fproviders%2FMicrosoft.AlertsManagement%2Falerts%2F{alert_id}"

        # Construct the alert object
        alert = AlertDto(
            id=alert_id,
            name=essentials.get("alertRule") or "",
            status=status,
            lastReceived=str(event_time),
            source=["azuremonitoring"],
            description=description,
            groups=resource_ids,
            severity=severity,
            url=url,
            monitor_id=essentials.get("originAlertId", ""),
            alertContext=alert_context,
            essentials=essentials,
            customProperties=data.get("customProperties", {}),
        )
        return alert


if __name__ == "__main__":
    pass
""" import abc import copy import datetime import hashlib import itertools import json import logging import operator import os import re import uuid from collections import Counter from operator import attrgetter from typing import Literal, Optional import opentelemetry.trace as trace import requests from dateutil.parser import parse from keep.api.bl.enrichments_bl import EnrichmentsBl from keep.api.core.db import ( get_custom_deduplication_rule, get_enrichments, get_provider_by_name, is_linked_provider, ) from keep.api.logging import ProviderLoggerAdapter from keep.api.models.action_type import ActionType from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.api.models.db.topology import TopologyServiceInDto from keep.api.models.incident import IncidentDto from keep.api.utils.enrichment_helpers import parse_and_enrich_deleted_and_assignees from keep.contextmanager.contextmanager import ContextManager from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod tracer = trace.get_tracer(__name__) SPAMMY_ALERTS_THRESHOLD_HOURS = 1 SPAMMY_ALERTS_THRESHOLD = datetime.timedelta(hours=SPAMMY_ALERTS_THRESHOLD_HOURS) class BaseProvider(metaclass=abc.ABCMeta): OAUTH2_URL = None PROVIDER_SCOPES: list[ProviderScope] = [] PROVIDER_METHODS: list[ProviderMethod] = [] FINGERPRINT_FIELDS: list[str] = [] PROVIDER_COMING_SOON = False # tb: if the provider is coming soon, we show it in the UI but don't allow it to be added PROVIDER_CATEGORY: list[ Literal[ "AI", "Monitoring", "Incident Management", "Cloud Infrastructure", "Ticketing", "Identity", "Developer Tools", "Database", "Identity and Access Management", "Security", "Collaboration", "Organizational Tools", "CRM", "Queues", "Orchestration", "Others", ] ] = [ "Others" ] # tb: Default category for providers that don't declare a category PROVIDER_TAGS: list[ Literal[ "alert", "ticketing", "messaging", "data", "queue", 
"topology", "incident" ] ] = [] WEBHOOK_INSTALLATION_REQUIRED = False # webhook installation is required for this provider, making it required in the UI def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig, webhooke_template: Optional[str] = None, webhook_description: Optional[str] = None, webhook_markdown: Optional[str] = None, provider_description: Optional[str] = None, ): """ Initialize a provider. Args: provider_id (str): The provider id. **kwargs: Provider configuration loaded from the provider yaml file. """ self.provider_id = provider_id self.config = config self.webhooke_template = webhooke_template self.webhook_description = webhook_description self.webhook_markdown = webhook_markdown self.provider_description = provider_description self.context_manager = context_manager # Initialize the logger with our custom adapter base_logger = logging.getLogger(self.provider_id) # If logs should be stored on the DB, use the custom adapter if os.environ.get("KEEP_STORE_PROVIDER_LOGS", "false").lower() == "true": self.logger = ProviderLoggerAdapter( base_logger, self, context_manager.tenant_id, provider_id ) else: self.logger = base_logger self.logger.setLevel( os.environ.get( "KEEP_{}_PROVIDER_LOG_LEVEL".format(self.provider_id.upper()), os.environ.get("LOG_LEVEL", "INFO"), ) ) self.validate_config() self.logger.debug( "Base provider initialized", extra={"provider": self.__class__.__name__} ) self.provider_type = self._extract_type() self.results = [] # tb: we can have this overriden by customer configuration, when initializing the provider self.fingerprint_fields = self.FINGERPRINT_FIELDS self.step_id = None def _extract_type(self): """ Extract the provider type from the provider class name. Returns: str: The provider type. 
""" name = self.__class__.__name__ name_without_provider = name.replace("Provider", "") name_with_spaces = ( re.sub("([A-Z])", r" \1", name_without_provider).lower().strip() ) return name_with_spaces.replace(" ", ".") @abc.abstractmethod def dispose(self): """ Dispose of the provider. """ raise NotImplementedError("dispose() method not implemented") @abc.abstractmethod def validate_config(self): """ Validate provider configuration. """ raise NotImplementedError("validate_config() method not implemented") def validate_scopes(self) -> dict[str, bool | str]: """ Validate provider scopes. Returns: dict: where key is the scope name and value is whether the scope is valid (True boolean) or string with error message. """ return {} def get_provider_metadata(self) -> dict: """ Get provider metadata. E.g. Provider Version. Should be implemented by the provider. Returns: dict: The provider metadata. """ return {} def notify(self, **kwargs): """ Output alert message. Args: **kwargs (dict): The provider context (with statement) """ # Pop Keep-internal fields before passing kwargs to the provider enrich_alert = kwargs.pop("enrich_alert", []) enrich_incident = kwargs.pop("enrich_incident", []) audit_enabled = bool(kwargs.pop("audit_enabled", True)) # trigger the provider results = self._notify(**kwargs) self.results.append(results) # if the alert should be enriched, enrich it enrich_event = enrich_alert or enrich_incident if enrich_event: self._enrich(enrich_event, results, audit_enabled=audit_enabled) return results if results else None def _enrich(self, enrichments, results, audit_enabled=True): """ Enrich alert with provider specific data. 
""" self.logger.debug("Extracting the fingerprint from the alert") event = None entity_type: Literal["alert", "incident"] = "alert" if "fingerprint" in results: fingerprint = results["fingerprint"] elif self.context_manager.foreach_context.get("value", {}): foreach_context: dict | tuple = self.context_manager.foreach_context.get( "value", {} ) if isinstance(foreach_context, tuple): # This is when we are in a foreach context that is zipped foreach_context: dict = foreach_context[0] event = foreach_context if isinstance(foreach_context, AlertDto): fingerprint = foreach_context.fingerprint # if we are in a dict context, use the fingerprint from the dict elif isinstance(foreach_context, dict) and "fingerprint" in foreach_context: fingerprint = foreach_context.get("fingerprint") # in case the foreach itself doesn't have a fingerprint, use the event fingerprint elif self.context_manager.event_context: fingerprint = self.context_manager.event_context.fingerprint else: self.logger.warning( "No fingerprint found for alert enrichment", extra={"provider": self.provider_id}, ) fingerprint = None # else, if we are in an event context, use the event fingerprint elif self.context_manager.event_context: # TODO: map all casses event_context is dict and update them to the DTO # and remove this if statement event = self.context_manager.event_context if isinstance(self.context_manager.event_context, dict): fingerprint = self.context_manager.event_context.get("fingerprint") # Alert DTO else: fingerprint = self.context_manager.event_context.fingerprint elif self.context_manager.incident_context: entity_type = "incident" fingerprint = self.context_manager.incident_context.id else: fingerprint = None if not fingerprint: self.logger.error( "No fingerprint found for alert enrichment", extra={"provider": self.provider_id}, ) raise Exception("No fingerprint found for alert enrichment") self.logger.debug("Fingerprint extracted", extra={"fingerprint": fingerprint}) _enrichments = {} 
disposable_enrichments = {} # enrich only the requested fields for enrichment in enrichments: try: value = enrichment["value"] disposable = bool(enrichment.get("disposable", False)) if value.startswith("results."): val = enrichment["value"].replace("results.", "") parts = val.split(".") r = copy.copy(results) for part in parts: r = r[part] value = r # support smth like results[0][0].message.source # 1. first convert to results[0][0]["message"]["source"] # 2. use eval elif value.startswith("results["): self.logger.info("Trying to convert") # try convert def convert_dot_to_bracket(match): return f'["{match.group(1)}"]' converted_value = value bracket_pattern = r"\.([a-zA-Z_][a-zA-Z0-9_]*)" converted_value = re.sub( bracket_pattern, convert_dot_to_bracket, converted_value ) try: # this is secured since if we are here it means converted_value starts with results[ value = eval( converted_value, {"__builtins__": {}}, {"results": results} ) except Exception: self.logger.exception( "Could not parse results", extra={"value": value} ) if disposable: disposable_enrichments[enrichment["key"]] = value else: _enrichments[enrichment["key"]] = value if event is not None: if isinstance(event, dict): event[enrichment["key"]] = value else: setattr(event, enrichment["key"], value) except Exception: self.logger.error( f"Failed to enrich alert - enrichment: {enrichment}", extra={"fingerprint": fingerprint, "provider": self.provider_id}, ) continue self.logger.info("Enriching alert", extra={"fingerprint": fingerprint}) try: enrichments_bl = EnrichmentsBl(self.context_manager.tenant_id) enrichment_string = ", ".join( [f"{key}={value}" for key, value in _enrichments.items()] ) disposable_enrichment_string = ", ".join( [f"{key}={value}" for key, value in disposable_enrichments.items()] ) common_kwargs = { "fingerprint": fingerprint, "action_type": ActionType.WORKFLOW_ENRICH, "action_callee": "system", "audit_enabled": audit_enabled, } if _enrichments: # enrich the alert with _enrichments 
enrichments_bl.enrich_entity( enrichments=_enrichments, action_description=f"Workflow enriched the {entity_type} with {enrichment_string}", **common_kwargs, ) # todo: incidents do not have disposable enrichments if disposable_enrichments and entity_type == "alert": # enrich with disposable enrichments enrichments_bl.disposable_enrich_entity( enrichments=disposable_enrichments, action_description=f"Workflow enriched the {entity_type} with {disposable_enrichment_string}", **common_kwargs, ) should_check_incidents_resolution = ( _enrichments.get("status", None) == "resolved" or disposable_enrichments.get("status", None) == "resolved" ) if event and should_check_incidents_resolution: enrichments_bl.check_incident_resolution(event) except Exception as e: self.logger.error( f"Failed to enrich {entity_type} in db", extra={"fingerprint": fingerprint, "provider": self.provider_id}, ) raise e self.logger.info( f"{entity_type.capitalize()} enriched", extra={"fingerprint": fingerprint} ) def _notify(self, **kwargs): """ Output alert message. 
def query(self, **kwargs: dict):
    """
    Run the provider query and optionally enrich the current alert with it.

    Args:
        **kwargs (dict): Provider query arguments. The Keep-internal keys
            "enrich_alert" and "audit_enabled" are removed before the rest
            is forwarded to _query().

    Returns:
        Whatever _query() returned.
    """
    # Strip Keep-internal fields so the provider only sees its own arguments
    requested_enrichments = kwargs.pop("enrich_alert", [])
    audit = bool(kwargs.pop("audit_enabled", True))

    query_output = self._query(**kwargs)
    self.results.append(query_output)

    # Register the result type (element type for lists) in the global context
    if query_output:
        sample = query_output[0] if isinstance(query_output, list) else query_output
        self.context_manager.dependencies.add(sample.__class__)

    if requested_enrichments:
        self._enrich(requested_enrichments, query_output, audit_enabled=audit)

    return query_output
""" raise NotImplementedError("format_alert() method not implemented") @classmethod def format_alert( cls, event: dict | list[dict], tenant_id: str | None, provider_type: str | None, provider_id: str | None, ) -> AlertDto | list[AlertDto] | None: logger = logging.getLogger(__name__) provider_instance: BaseProvider | None = None if provider_id and provider_type and tenant_id: try: if is_linked_provider(tenant_id, provider_id): logger.debug( "Provider is linked, skipping loading provider instance" ) provider_instance = None else: # To prevent circular imports from keep.providers.providers_factory import ProvidersFactory provider_instance: BaseProvider = ( ProvidersFactory.get_installed_provider( tenant_id, provider_id, provider_type ) ) except Exception: logger.exception( "Failed loading provider instance although all parameters were given", extra={ "tenant_id": tenant_id, "provider_id": provider_id, "provider_type": provider_type, }, ) logger.debug("Formatting alert") formatted_alert = cls._format_alert(event, provider_instance) if formatted_alert is None: logger.debug( "Provider returned None, which means it decided not to format the alert" ) return None logger.debug("Alert formatted") # after the provider calculated the default fingerprint # check if there is a custom deduplication rule and apply custom_deduplication_rule = get_custom_deduplication_rule( tenant_id=tenant_id, provider_id=provider_id, provider_type=provider_type, ) if not isinstance(formatted_alert, list): formatted_alert.providerId = provider_id formatted_alert.providerType = provider_type formatted_alert = [formatted_alert] else: for alert in formatted_alert: alert.providerId = provider_id alert.providerType = provider_type # if there is no custom deduplication rule, return the formatted alert if not custom_deduplication_rule: return formatted_alert # if there is a custom deduplication rule, apply it # apply the custom deduplication rule to calculate the fingerprint for alert in formatted_alert: 
@staticmethod
def get_alert_fingerprint(alert: AlertDto, fingerprint_fields: list = []) -> str:
    """
    Compute the fingerprint of an alert from a list of (dotted) field paths.

    Args:
        alert (AlertDto): The alert to fingerprint.
        fingerprint_fields (list, optional): Field paths such as "labels.host".
            Each path is walked through the alert's dict form. Defaults to [].

    Returns:
        str: SHA-256 hexdigest over the resolved field values in the given
            order, or alert.name if no fingerprint_fields were given.
    """
    if not fingerprint_fields:
        return alert.name

    payload = alert.dict()
    digest = hashlib.sha256()
    for dotted_path in fingerprint_fields:
        node = payload
        for segment in dotted_path.split("."):
            # a path that runs through a non-dict value resolves to nothing
            if not isinstance(node, dict):
                node = None
                break
            node = node.get(segment, None)
        # containers are serialised so that they can be hashed as text
        if isinstance(node, (list, dict)):
            node = json.dumps(node)
        # unresolved fields contribute nothing to the digest
        if node is not None:
            digest.update(str(node).encode())
    return digest.hexdigest()
def get_alerts_by_fingerprint(self, tenant_id: str) -> dict[str, list[AlertDto]]:
    """
    Get alerts from the provider grouped by fingerprint, with stored
    enrichments applied.

    Alerts are sorted by fingerprint for grouping; within one fingerprint they
    keep the order in which get_alerts() returned them.

    Args:
        tenant_id (str): Tenant whose stored enrichments are applied.

    Returns:
        dict[str, list[AlertDto]]: Alerts keyed by fingerprint. Empty when the
            provider does not implement get_alerts() or returns no alerts.
    """
    try:
        alerts = self.get_alerts()
    except NotImplementedError:
        return {}

    if not alerts:
        return {}

    # group the alerts by fingerprint (groupby needs its input sorted by key)
    with tracer.start_as_current_span(f"{self.__class__.__name__}-get_last_alerts"):
        get_attr = operator.attrgetter("fingerprint")
        grouped_alerts = {
            fingerprint: list(group)
            for fingerprint, group in itertools.groupby(
                sorted(alerts, key=get_attr),
                get_attr,
            )
        }

    # enrich alerts
    with tracer.start_as_current_span(f"{self.__class__.__name__}-enrich_alerts"):
        pulled_alerts_enrichments = get_enrichments(
            tenant_id=tenant_id,
            fingerprints=grouped_alerts.keys(),
        )
        for alert_enrichment in pulled_alerts_enrichments:
            if not alert_enrichment:
                continue
            # An enrichment whose fingerprint is not among the pulled alerts
            # is skipped; iterating the bare .get() result raised TypeError.
            alerts_to_enrich = grouped_alerts.get(
                alert_enrichment.alert_fingerprint, []
            )
            for alert_to_enrich in alerts_to_enrich:
                parse_and_enrich_deleted_and_assignees(
                    alert_to_enrich, alert_enrichment.enrichments
                )
                for enrichment in alert_enrichment.enrichments:
                    # set the enrichment
                    setattr(
                        alert_to_enrich,
                        enrichment,
                        alert_enrichment.enrichments[enrichment],
                    )

    return grouped_alerts
None: """ Setup a webhook for the provider. Args: tenant_id (str): _description_ keep_api_url (str): _description_ api_key (str): _description_ setup_alerts (bool, optional): _description_. Defaults to True. Returns: dict | None: If some secrets needs to be saved, return them in a dict. Raises: NotImplementedError: _description_ """ raise NotImplementedError("setup_webhook() method not implemented") def clean_up(self): """ Clean up the provider. Raises:s NotImplementedError: for providers who does not implement this method. """ raise NotImplementedError("clean_up() method not implemented") @staticmethod def get_alert_schema() -> dict: """ Get the alert schema description for the provider. e.g. How to define an alert for the provider that can be pushed via the API. Returns: str: The alert format description. """ raise NotImplementedError( "get_alert_format_description() method not implemented" ) @staticmethod def oauth2_logic(**payload) -> dict: """ Logic for oauth2 authentication. For example, in Slack oauth2, we need to get the code from the payload and exchange it for a token. return: dict: The secrets to be saved as the provider configuration. (e.g. the Slack access token) """ raise NotImplementedError("oauth2_logic() method not implemented") @staticmethod def parse_event_raw_body(raw_body: bytes | dict) -> dict: """ Parse the raw body of an event and create an ingestable dict from it. For instance, in parseable, the "event" is just a string > b'Alert: Server side error triggered on teststream1\nMessage: server reporting status as 500\nFailing Condition: status column equal to abcd, 2 times' and we want to return an object > b"{'alert': 'Server side error triggered on teststream1', 'message': 'server reporting status as 500', 'failing_condition': 'status column equal to abcd, 2 times'}" If this method is not implemented for a provider, just return the raw body. 
Args: raw_body (bytes): The raw body of the incoming event (/event endpoint in alerts.py) Returns: dict: Ingestable event """ return raw_body def get_logs(self, limit: int = 5) -> list: """ Get logs from the provider. Args: limit (int): The number of logs to get. """ raise NotImplementedError("get_logs() method not implemented") def expose(self): """Expose parameters that were calculated during query time. Each provider can expose parameters that were calculated during query time. E.g. parameters that were supplied by the user and were rendered by the provider. A concrete example is the "_from" and "to" of the Datadog Provider which are calculated during execution. """ # TODO - implement dynamically using decorators and return {} def start_consume(self): """Get the consumer for the provider. should be implemented by the provider if it has a consumer. for an example, see Kafka Provider Returns: Consumer: The consumer for the provider. """ return def status(self) -> bool: """Return the status of the provider. Returns: bool: The status of the provider. """ return { "status": "should be implemented by the provider if it has a consumer", "error": "", } @property def is_consumer(self) -> bool: """Return consumer if the inherited class has a start_consume method. Returns: bool: _description_ """ return self.start_consume.__qualname__ != "BaseProvider.start_consume" def _push_alert(self, alert: dict): """ Push an alert to the provider. Args: alert (dict): The alert to push. 
""" # if this is not a dict, try to convert it to a dict if not isinstance(alert, dict): try: alert_data = json.loads(alert) except Exception: alert_data = alert_data else: alert_data = alert # if this is still not a dict, we can't push it if not isinstance(alert_data, dict): self.logger.warning( "We currently support only alert represented as a dict, dismissing alert", extra={"alert": alert}, ) return # now try to build the alert model # we will have a lot of default values here to support all providers and all cases, the # way to fine tune those would be to use the provider specific model or enforce that the event from the queue will be casted into the fields alert_model = AlertDto( id=alert_data.get("id", str(uuid.uuid4())), name=alert_data.get("name", "alert-from-event-queue"), status=alert_data.get("status", AlertStatus.FIRING), lastReceived=alert_data.get( "lastReceived", datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), ), environment=alert_data.get("environment", "alert-from-event-queue"), isDuplicate=alert_data.get("isDuplicate", False), duplicateReason=alert_data.get("duplicateReason", None), service=alert_data.get("service", "alert-from-event-queue"), source=alert_data.get("source", [self.provider_type]), message=alert_data.get("message", "alert-from-event-queue"), description=alert_data.get("description", "alert-from-event-queue"), severity=alert_data.get("severity", AlertSeverity.INFO), pushed=alert_data.get("pushed", False), event_id=alert_data.get("event_id", str(uuid.uuid4())), url=alert_data.get("url", None), fingerprint=alert_data.get("fingerprint", None), providerId=self.provider_id, ) # push the alert to the provider url = f'{os.environ["KEEP_API_URL"]}/alerts/event' headers = { "Content-Type": "application/json", "Accept": "application/json", "X-API-KEY": self.context_manager.api_key, } response = requests.post( url, json=alert_model.dict(), headers=headers, params={"provider_id": self.provider_id}, ) try: response.raise_for_status() 
self.logger.info("Alert pushed successfully") except Exception: self.logger.error( f"Failed to push alert to {self.provider_id}: {response.content}" ) @classmethod def simulate_alert(cls) -> dict: # can be overridden by the provider import importlib import random module_path = ".".join(cls.__module__.split(".")[0:-1]) + ".alerts_mock" module = importlib.import_module(module_path) ALERTS = getattr(module, "ALERTS", None) alert_type = random.choice(list(ALERTS.keys())) alert_data = ALERTS[alert_type] # Start with the base payload simulated_alert = alert_data["payload"].copy() return simulated_alert @property def is_installed(self) -> bool: """ Check if provider has been recorded in the database. """ provider = get_provider_by_name( self.context_manager.tenant_id, self.config.name ) return provider is not None @property def is_provisioned(self) -> bool: """ Check if provider exist in env provisioning. """ from keep.parser.parser import Parser parser = Parser() parser._parse_providers_from_env(self.context_manager) return self.config.name in self.context_manager.providers_context @classmethod def has_health_report(cls) -> bool: return getattr(cls, "HAS_HEALTH_CHECK", False) class BaseTopologyProvider(BaseProvider): def pull_topology(self) -> tuple[list[TopologyServiceInDto], dict]: raise NotImplementedError("get_topology() method not implemented") class BaseIncidentProvider(BaseProvider): def _get_incidents(self) -> list[IncidentDto]: raise NotImplementedError("_get_incidents() in not implemented") def get_incidents(self) -> list[IncidentDto]: return self._get_incidents() @staticmethod def _format_incident( event: dict, provider_instance: "BaseProvider" = None ) -> IncidentDto | list[IncidentDto]: raise NotImplementedError("_format_incidents() not implemented") @classmethod def format_incident( cls, event: dict, tenant_id: str | None, provider_type: str | None, provider_id: str | None, ) -> IncidentDto | list[IncidentDto]: logger = logging.getLogger(__name__) 
provider_instance: BaseProvider | None = None if provider_id and provider_type and tenant_id: try: # To prevent circular imports from keep.providers.providers_factory import ProvidersFactory provider_instance: BaseProvider = ( ProvidersFactory.get_installed_provider( tenant_id, provider_id, provider_type ) ) except Exception: logger.exception( "Failed loading provider instance although all parameters were given", extra={ "tenant_id": tenant_id, "provider_id": provider_id, "provider_type": provider_type, }, ) logger.debug("Formatting Incident") return cls._format_incident(event, provider_instance) def setup_incident_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True, ) -> dict | None: """ Setup a webhook for the provider. Args: tenant_id (str): _description_ keep_api_url (str): _description_ api_key (str): _description_ setup_alerts (bool, optional): _description_. Defaults to True. Returns: dict | None: If some secrets needs to be saved, return them in a dict. 
class ProviderHealthMixin:
    """Mixin that builds a health report from the alerts a provider returns."""

    HAS_HEALTH_CHECK = True

    def get_health_report(self):
        """
        Build the health report.

        Returns:
            dict: May contain "topology" and "rules" (when the provider
                supports them) and always "spammy".
        """
        health = {}
        alerts = self.get_alerts()

        self.check_topology_coverage(alerts, health)
        self.check_spammy_alerts(alerts, health)
        self.check_alerting_rules(alerts, health)

        return health

    def check_topology_coverage(self, alerts, health):
        """
        Split the provider topology into services that have at least one alert
        ("covered") and services that have none ("uncovered").

        Nothing is written to ``health`` when the provider has no usable
        pull_topology().
        """
        if not hasattr(self, "pull_topology"):
            return
        try:
            topology, _ = self.pull_topology()
        except NotImplementedError:
            # declared on a base class but not implemented by this provider
            return

        alerted_services = [alert.service for alert in alerts]
        covered = []
        uncovered = []
        for service in topology:
            if service.service in alerted_services:
                covered.append(service)
            else:
                uncovered.append(service)

        health["topology"] = {
            "covered": covered,
            "uncovered": uncovered,
        }

    def check_alerting_rules(self, alerts, health):
        """
        Count how many alerting rules matched at least one alert.

        Each rule's "message" is treated as a regular expression and searched
        in the alert messages. Nothing is written to ``health`` when the
        provider has no usable get_alerts_configuration().
        """
        if not hasattr(self, "get_alerts_configuration"):
            return
        try:
            rules = self.get_alerts_configuration()
        except NotImplementedError:
            # BaseProvider declares the method but raises by default
            return

        try:
            # Rules may arrive JSON-encoded or already decoded; only text is
            # parsed (json.loads on a dict raised an uncaught TypeError).
            rules = [
                json.loads(rule) if isinstance(rule, (str, bytes, bytearray)) else rule
                for rule in rules
            ]
        except json.JSONDecodeError:
            pass

        compiled_patterns = [re.compile(rule["message"]) for rule in rules]
        matched_patterns = set()

        for alert in alerts:
            message = alert.message
            if not message:
                # nothing to search in
                continue
            for idx, pattern in enumerate(compiled_patterns):
                if idx in matched_patterns:
                    continue
                if pattern.search(message):
                    matched_patterns.add(idx)

        # "used" is the number of matched rules; it was previously derived
        # from a list that was never filled, so every rule counted as used.
        used = len(matched_patterns)
        health["rules"] = {
            "total": len(rules),
            "used": used,
            "unused": len(rules) - used,
        }

    def check_spammy_alerts(self, alerts, health):
        """
        Report, per hour, the number of alerts that belong to "spammy"
        fingerprints.

        A fingerprint is spammy when more than two pairs of its alerts were
        received within SPAMMY_ALERTS_THRESHOLD of each other.
        """
        by_fingerprint = attrgetter("fingerprint")
        alerts_per_fingerprint = itertools.groupby(
            sorted(alerts, key=by_fingerprint), key=by_fingerprint
        )

        spammy_alerts = []

        for _fingerprint, group in alerts_per_fingerprint:
            close_alerts = []
            fingerprint_alerts = sorted(group, key=attrgetter("lastReceived"))

            # Iterate through alerts to check if some of them are too close
            for i in range(len(fingerprint_alerts)):
                for j in range(i + 1, len(fingerprint_alerts)):
                    gap = parse(fingerprint_alerts[j].lastReceived) - parse(
                        fingerprint_alerts[i].lastReceived
                    )
                    if gap <= SPAMMY_ALERTS_THRESHOLD:
                        close_alerts.append(
                            (fingerprint_alerts[i], fingerprint_alerts[j])
                        )
                    else:
                        # alerts are time-sorted: later ones are further away
                        break

            if len(close_alerts) > 2:
                spammy_alerts.extend(fingerprint_alerts)

        timestamps = [parse(alert.lastReceived) for alert in spammy_alerts]
        hours = [ts.strftime("%Y-%m-%d %H:00") for ts in timestamps]
        hourly_alerts = Counter(hours)

        health["spammy"] = [
            {"date": date, "value": value} for date, value in hourly_alerts.items()
        ]
class BashProvider(BaseProvider):
    """Enrich alerts with data using Bash."""

    def __init__(self, context_manager, provider_id: str, config: ProviderConfig):
        super().__init__(context_manager, provider_id, config)
        self.io_handler = IOHandler(context_manager=context_manager)

    def validate_config(self):
        """The Bash provider has no configuration to validate."""
        pass

    @staticmethod
    def _kill_pipeline(processes):
        """Kill and reap every process of a pipeline and close its pipes."""
        for process in processes:
            if process.poll() is None:
                process.kill()
        for process in processes:
            for stream in (process.stdout, process.stderr):
                if stream is not None and not stream.closed:
                    stream.close()
            process.wait()

    def _retry_with_check_output(self, parsed_command, timeout):
        """Re-run the command through the shell after a timeout.

        Returns:
            tuple: ``(stdout, stderr, return_code)``; ``(None, error, -1)``
                when the retry fails as well.
        """
        try:
            self.logger.warning("TimeoutExpired, using check_output - MacOS bug?")
            stdout = subprocess.check_output(
                parsed_command,
                stderr=subprocess.STDOUT,
                timeout=timeout,
                shell=True,
            ).decode()
            return stdout, None, 0
        except Exception as e:
            return None, str(e), -1

    def _query(
        self, timeout: int = 60, command: str = "", shell: bool = False, **kwargs
    ):
        """Run a shell command and return its output.

        Args:
            timeout (int): seconds to wait for the command to finish.
            command (str): command template; rendered by the IO handler first.
            shell (bool): when True the whole command runs through the shell;
                otherwise it is split on "|" and run as a pipeline of
                processes without a shell.

        Returns:
            dict: "stdout", "stderr" and "return_code" of the command. In the
                non-shell path stdout/stderr are always passed through
                ``str()``, so a missing value is the string "None".
        """
        parsed_command = self.io_handler.parse(command)

        if shell:
            # Use shell=True for complex commands
            try:
                result = subprocess.run(
                    parsed_command,
                    shell=True,
                    capture_output=True,
                    timeout=timeout,
                    text=True,
                )
            except subprocess.TimeoutExpired:
                stdout, stderr, return_code = self._retry_with_check_output(
                    parsed_command, timeout
                )
                return {
                    "stdout": stdout,
                    "stderr": stderr,
                    "return_code": return_code,
                }
            return {
                "stdout": result.stdout,
                "stderr": result.stderr,
                "return_code": result.returncode,
            }

        # Original logic for simple commands: chain one process per "|" stage.
        processes = []
        input_stream = None
        try:
            for cmd in parsed_command.split("|"):
                cmd_args = shlex.split(cmd.strip())
                process = subprocess.Popen(
                    cmd_args,
                    stdin=input_stream,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                if input_stream is not None:
                    # The child owns the read end now; drop the parent's copy
                    # so the upstream stage sees SIGPIPE if the child exits.
                    input_stream.close()
                input_stream = process.stdout
                processes.append(process)
        except BaseException:
            # A stage failed to start: don't leave the earlier stages running.
            self._kill_pipeline(processes)
            raise

        try:
            stdout, stderr = processes[-1].communicate(timeout=timeout)
            return_code = processes[-1].returncode
            if stdout is not None:
                stdout = stdout.decode()
            if stderr is not None:
                stderr = stderr.decode()
        except subprocess.TimeoutExpired:
            # communicate() does not kill the child on timeout; do it here so
            # the timed-out pipeline does not keep running next to the retry.
            self._kill_pipeline(processes)
            stdout, stderr, return_code = self._retry_with_check_output(
                parsed_command, timeout
            )

        return {
            "stdout": str(stdout),
            "stderr": str(stderr),
            "return_code": return_code,
        }

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass
class BigqueryProvider(BaseProvider):
    """Enrich alerts with data from BigQuery."""

    provider_id: str
    config: ProviderConfig
    PROVIDER_DISPLAY_NAME = "BigQuery"
    PROVIDER_CATEGORY = ["Cloud Infrastructure", "Database"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """
        Validates required configuration for BigQuery provider.

        Raises:
            ValueError: when no project id is configured and the
                'GOOGLE_CLOUD_PROJECT' environment variable is not set, or
                when the resulting project id is empty.
        """
        if self.config.authentication is None:
            self.config.authentication = {}

        self.authentication_config = BigqueryProviderAuthConfig(
            **self.config.authentication
        )

        # Check for project_id and handle it here.
        if "project_id" not in self.config.authentication:
            try:
                self.config.authentication["project_id"] = os.environ[
                    "GOOGLE_CLOUD_PROJECT"
                ]
            except KeyError:
                raise ValueError(
                    "GOOGLE_CLOUD_PROJECT environment variable is not set."
                )

        if (
            self.config.authentication["project_id"] is None
            or self.config.authentication["project_id"] == ""
        ):
            # If default project not found, raise error
            raise ValueError("BigQuery project id is missing.")

    def init_client(self):
        """Create ``self.client`` from the configured credentials.

        Raises:
            ValueError: when no project id is available from the configuration
                or the 'GOOGLE_CLOUD_PROJECT' environment variable.
        """
        service_account = self.authentication_config.service_account_json
        if service_account:
            # this is the content of the service account json
            if isinstance(service_account, dict):
                self.client = bigquery.Client.from_service_account_info(
                    service_account
                )
            elif isinstance(service_account, str):
                self.client = bigquery.Client.from_service_account_info(
                    json.loads(service_account)
                )
            # file? should never happen?
            else:
                self.client = bigquery.Client.from_service_account_json(
                    service_account
                )
        else:
            self.client = bigquery.Client()

        # check if the project id was set in the environment and use it if exists
        if self.authentication_config.project_id:
            self.client.project = self.authentication_config.project_id
        elif "GOOGLE_CLOUD_PROJECT" in os.environ:
            self.client.project = os.environ["GOOGLE_CLOUD_PROJECT"]
        else:
            raise ValueError(
                "Project ID must be set in either the configuration or the 'GOOGLE_CLOUD_PROJECT' environment variable."
            )

    def dispose(self):
        """Close the BigQuery client if one was created."""
        # The client only exists after init_client() ran (i.e. after a query),
        # so disposing an unused provider must not fail.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def notify(self, **kwargs):
        pass  # Define how to notify about any alerts or issues

    def _query(self, query: str):
        """Run ``query`` and return all result rows as a list."""
        self.init_client()
        query_job = self.client.query(query)
        results = list(query_job.result())
        return results

    def get_alerts_configuration(self, alert_id: Optional[str] = None):
        pass  # Define how to get alerts from BigQuery if applicable

    def deploy_alert(self, alert: dict, alert_id: Optional[str] = None):
        pass  # Define how to deploy an alert to BigQuery if applicable

    @staticmethod
    def get_alert_schema() -> dict:
        pass  # Return alert schema specific to BigQuery

    def get_logs(self, limit: int = 5) -> list:
        pass  # Define how to get logs from BigQuery if applicable

    def expose(self):
        return {}  # Define any parameters to expose
class CentreonProvider(BaseProvider):
    """Pull host and service states from the Centreon API as Keep alerts."""

    PROVIDER_DISPLAY_NAME = "Centreon"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_CATEGORY = ["Monitoring"]

    PROVIDER_SCOPES = [
        ProviderScope(name="authenticated", description="User is authenticated"),
    ]

    # Centreon only supports the following host state (UP = 0, DOWN = 2, UNREA = 3)
    # https://docs.centreon.com/docs/api/rest-api-v1/#realtime-information
    STATUS_MAP = {
        2: AlertStatus.FIRING,
        3: AlertStatus.FIRING,
        0: AlertStatus.RESOLVED,
    }

    SEVERITY_MAP = {
        "CRITICAL": AlertSeverity.CRITICAL,
        "WARNING": AlertSeverity.WARNING,
        "UNKNOWN": AlertSeverity.INFO,
        "OK": AlertSeverity.LOW,
        "PENDING": AlertSeverity.INFO,
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        pass

    def validate_config(self):
        """
        Validates the configuration of the Centreon provider.
        """
        self.authentication_config = CentreonProviderAuthConfig(
            **self.config.authentication
        )

    def __get_url(self, params: str):
        """Build the Centreon API URL for the given query string."""
        # str() + rstrip keeps the join correct whether or not the configured
        # host URL ends with a slash.
        base_url = str(self.authentication_config.host_url).rstrip("/")
        return base_url + "/centreon/api/index.php?" + params

    def __get_headers(self):
        return {
            "Content-Type": "application/json",
            "centreon-auth-token": self.authentication_config.api_token,
        }

    @staticmethod
    def _map_status(state) -> AlertStatus:
        """Translate a Centreon state code into a Keep status.

        States missing from STATUS_MAP are reported as firing.
        """
        try:
            state = int(state)
        except (TypeError, ValueError):
            pass
        return CentreonProvider.STATUS_MAP.get(state, AlertStatus.FIRING)

    @staticmethod
    def _map_severity(output) -> AlertSeverity:
        """Derive a Keep severity from the leading word of a check output.

        The first token (up to whitespace or ":") is looked up in
        SEVERITY_MAP; empty or unrecognized output maps to INFO.
        """
        tokens = (output or "").replace(":", " ").split()
        keyword = tokens[0].upper() if tokens else ""
        return CentreonProvider.SEVERITY_MAP.get(keyword, AlertSeverity.INFO)

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Validate the scopes of the provider.
        """
        try:
            response = requests.get(
                self.__get_url("object=centreon_realtime_hosts&action=list"),
                headers=self.__get_headers(),
            )
            if response.ok:
                scopes = {"authenticated": True}
            else:
                scopes = {
                    "authenticated": f"Error validating scopes: {response.status_code} {response.text}"
                }
        except Exception as e:
            scopes = {
                "authenticated": f"Error validating scopes: {e}",
            }

        return scopes

    def __get_host_status(self) -> list[AlertDto]:
        """Fetch the realtime host list and convert each host to an alert.

        Raises:
            ProviderException: on a non-OK response or any conversion error.
        """
        try:
            url = self.__get_url("object=centreon_realtime_hosts&action=list")
            response = requests.get(url, headers=self.__get_headers())

            if not response.ok:
                # Log the raw body: an error response is not guaranteed to be JSON.
                self.logger.error(
                    "Failed to get host status from Centreon: %s", response.text
                )
                raise ProviderException("Failed to get host status from Centreon")

            return [
                AlertDto(
                    id=host["id"],
                    name=host["name"],
                    address=host["address"],
                    description=host["output"],
                    status=self._map_status(host["state"]),
                    severity=self._map_severity(host["output"]),
                    instance_name=host["instance_name"],
                    acknowledged=host["acknowledged"],
                    max_check_attempts=host["max_check_attempts"],
                    lastReceived=datetime.datetime.fromtimestamp(
                        host["last_check"]
                    ).isoformat(),
                    source=["centreon"],
                )
                for host in response.json()
            ]

        except Exception as e:
            self.logger.error("Error getting host status from Centreon: %s", e)
            raise ProviderException(
                f"Error getting host status from Centreon: {e}"
            ) from e

    def __get_service_status(self) -> list[AlertDto]:
        """Fetch the realtime service list and convert each service to an alert.

        Raises:
            ProviderException: on a non-OK response or any conversion error.
        """
        try:
            url = self.__get_url("object=centreon_realtime_services&action=list")
            response = requests.get(url, headers=self.__get_headers())

            if not response.ok:
                # Log the raw body: an error response is not guaranteed to be JSON.
                self.logger.error(
                    "Failed to get service status from Centreon: %s", response.text
                )
                raise ProviderException("Failed to get service status from Centreon")

            return [
                AlertDto(
                    id=service["service_id"],
                    host_id=service["host_id"],
                    name=service["name"],
                    description=service["description"],
                    status=self._map_status(service["state"]),
                    severity=self._map_severity(service["output"]),
                    acknowledged=service["acknowledged"],
                    max_check_attempts=service["max_check_attempts"],
                    lastReceived=datetime.datetime.fromtimestamp(
                        service["last_check"]
                    ).isoformat(),
                    source=["centreon"],
                )
                for service in response.json()
            ]

        except Exception as e:
            self.logger.error("Error getting service status from Centreon: %s", e)
            raise ProviderException(
                f"Error getting service status from Centreon: {e}"
            ) from e

    def _get_alerts(self) -> list[AlertDto]:
        """Collect host and service alerts; a failing source is logged and skipped."""
        alerts = []
        try:
            self.logger.info("Collecting alerts (host status) from Centreon")
            host_status_alerts = self.__get_host_status()
            alerts.extend(host_status_alerts)
        except Exception as e:
            self.logger.error("Error getting host status from Centreon: %s", e)

        try:
            self.logger.info("Collecting alerts (service status) from Centreon")
            service_status_alerts = self.__get_service_status()
            alerts.extend(service_status_alerts)
        except Exception as e:
            self.logger.error("Error getting service status from Centreon: %s", e)

        return alerts
keep/providers/checkly_provider/__init__.py ================================================ ================================================ FILE: keep/providers/checkly_provider/alerts_mock.py ================================================ ALERTS = { "event": "API Check #1 has recovered", "alert_type": "ALERT_RECOVERY", "check_name": "API Check #1", "group_name": "", "check_id": "927a2982-1007-4b81-b383-eae8bf717e61", "check_type": "API", "check_result_id": "a34867c0-9239-421f-92f2-4408bbd05417", "check_error_message": "", "response_time": "258", "api_check_response_status_code": "200", "api_check_response_status_text": "OK", "run_location": "Singapore", "ssl_days_remaining": "", "ssl_check_domain": "", "started_at": "2025-01-26T11:19:40.544Z", "tags": "", "link": "https://app.checklyhq.com/checks/927a2982-1007-4b81-b383-eae8bf717e61/check-sessions/478cacb1-c40f-4675-89d7-a4e3ecaafb7b", "region": "", "uuid": "4583208e-0bca-48c6-8dc8-d14faf6102b3" } ================================================ FILE: keep/providers/checkly_provider/checkly_provider.py ================================================ """ ChecklyProvider is a class that allows you to receive alerts from Checkly using API endpoints as well as webhooks. """ import dataclasses import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class ChecklyProviderAuthConfig: """ ChecklyProviderAuthConfig is a class that allows you to authenticate in Checkly. 
class ChecklyProvider(BaseProvider):
    """
    Get alerts from Checkly into Keep.
    """

    webhook_documentation_here_differs_from_general_documentation = True
    webhook_description = ""
    webhook_template = ""
    webhook_markdown = """
To send alerts from Checkly to Keep, Use the following webhook url to configure Checkly send alerts to Keep:

1. In Checkly dashboard open "Alerts" tab.
2. Click on "Add more channels".
3. Select "Webhook" from the list.
4. Enter a name for the webhook, select the method as "POST" and enter the webhook URL as {keep_webhook_api_url}.
5. Copy the Body template from the [Keep documentation](https://docs.keephq.dev/providers/documentation/checkly-provider) and paste it in the Body field of the webhook.
6. Add a request header with the key "X-API-KEY" and the value as {api_key}.
7. Save the webhook.
"""

    PROVIDER_DISPLAY_NAME = "Checkly"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_CATEGORY = ["Monitoring"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="read_alerts",
            description="Read alerts from Checkly",
        ),
    ]

    # Based on the Alert states in Checkly, we map them to the AlertStatus and AlertSeverity in Keep.
    STATUS_MAP = {
        "NO_ALERT": AlertStatus.RESOLVED,
        "ALERT_DEGRADED": AlertStatus.FIRING,
        "ALERT_FAILURE": AlertStatus.FIRING,
        "ALERT_DEGRADED_REMAIN": AlertStatus.ACKNOWLEDGED,
        "ALERT_DEGRADED_RECOVERY": AlertStatus.RESOLVED,
        "ALERT_DEGRADED_FAILURE": AlertStatus.FIRING,
        "ALERT_FAILURE_REMAIN": AlertStatus.ACKNOWLEDGED,
        "ALERT_FAILURE_DEGRADED": AlertStatus.ACKNOWLEDGED,
        "ALERT_RECOVERY": AlertStatus.RESOLVED
    }

    SEVERITY_MAP = {
        "NO_ALERT": AlertSeverity.INFO,
        "ALERT_DEGRADED": AlertSeverity.WARNING,
        "ALERT_FAILURE": AlertSeverity.CRITICAL,
        "ALERT_DEGRADED_REMAIN": AlertSeverity.WARNING,
        "ALERT_DEGRADED_RECOVERY": AlertSeverity.INFO,
        "ALERT_DEGRADED_FAILURE": AlertSeverity.HIGH,
        "ALERT_FAILURE_REMAIN": AlertSeverity.CRITICAL,
        "ALERT_FAILURE_DEGRADED": AlertSeverity.WARNING,
        "ALERT_RECOVERY": AlertSeverity.INFO
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for Checkly provider.
        """
        self.authentication_config = ChecklyProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Validate scopes for the provider

        Returns:
            dict: ``{"read_alerts": True}`` on success, otherwise the error
                text under the same key.
        """
        self.logger.info("Validating Checkly provider scopes")
        try:
            response = requests.get(
                self.__get_url(),
                headers=self.__get_auth_headers(),
            )

            if response.status_code != 200:
                response.raise_for_status()

            self.logger.info("Successfully validated scopes", extra={"response": response.json()})

            return {"read_alerts": True}

        except Exception as e:
            self.logger.exception("Failed to validate scopes", extra={"error": e})
            return {"read_alerts": str(e)}

    def _get_alerts(self) -> list[AlertDto]:
        """
        Get alerts from Checkly.
        """
        self.logger.info("Getting alerts from Checkly")
        alerts = self.__get_paginated_data()

        return [
            AlertDto(
                id=alert["id"],
                name=alert["name"],
                status=ChecklyProvider.STATUS_MAP[alert["alertType"]],
                severity=ChecklyProvider.SEVERITY_MAP[alert["alertType"]],
                # Same field name the webhook path (_format_alert) uses, so
                # pulled and pushed alerts carry their timestamp consistently.
                lastReceived=alert["created_at"],
                # Kept for consumers that already read the old extra field.
                lastReceivedAt=alert["created_at"],
                alertType=alert["alertType"],
                checkId=alert["checkId"],
                checkType=alert["checkType"],
                runLocation=alert["runLocation"],
                responseTime=alert["responseTime"],
                error=alert["error"],
                statusCode=alert["statusCode"],
                created_at=alert["created_at"],
                startedAt=alert["startedAt"],
                source=["checkly"]
            ) for alert in alerts
        ]

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | list[AlertDto]:
        """Convert a Checkly webhook payload into an AlertDto."""
        alert = AlertDto(
            id=event["uuid"],
            name=event["check_name"],
            description=event["event"],
            status=ChecklyProvider.STATUS_MAP[event["alert_type"]],
            severity=ChecklyProvider.SEVERITY_MAP[event["alert_type"]],
            lastReceived=event["started_at"],
            alertType=event["alert_type"],
            groupName=event["group_name"],
            checkId=event["check_id"],
            checkType=event["check_type"],
            checkResultId=event["check_result_id"],
            checkErrorMessage=event["check_error_message"],
            responseTime=event["response_time"],
            apiCheckResponseStatus=event["api_check_response_status_code"],
            apiCheckResponseStatusText=event["api_check_response_status_text"],
            runLocation=event["run_location"],
            sslDaysRemaining=event["ssl_days_remaining"],
            sslCheckDomain=event["ssl_check_domain"],
            startedAt=event["started_at"],
            tags=event["tags"],
            url=event["link"],
            region=event["region"],
            source=["checkly"]
        )

        return alert

    def __get_auth_headers(self):
        return {
            "Authorization": f"Bearer {self.authentication_config.checklyApiKey}",
            "X-Checkly-Account": self.authentication_config.accountId,
            "accept": "application/json"
        }

    def __get_paginated_data(self, query_params: dict | None = None) -> list:
        """Fetch every page of check alerts.

        Args:
            query_params: extra query parameters; the caller's dict is not
                modified.

        Returns:
            list: items of all pages fetched so far. Fetching stops at the
                first empty page or the first error (which is logged).
        """
        # Copy so neither the caller's dict nor a shared default is mutated
        # when the page number is written into it.
        params = dict(query_params or {})
        data = []
        page = 1

        while True:
            self.logger.info(f"Getting data from page {page}")
            params["page"] = page

            try:
                url = self.__get_url(params)
                headers = self.__get_auth_headers()
                response = requests.get(url, headers=headers)
                response.raise_for_status()

                page_data = response.json()
                if not page_data:
                    break

                self.logger.info(f"Got {len(page_data)} data from page {page}")
                data.extend(page_data)
                page += 1

            except Exception as e:
                self.logger.error(f"Error getting data from page {page}: {e}")
                break

        return data

    def __get_url(self, query_params: dict | None = None):
        """Build the check-alerts URL with properly encoded query parameters."""
        from urllib.parse import urlencode

        url = "https://api.checklyhq.com/v1/check-alerts"
        if query_params:
            url += "?" + urlencode(query_params)
        return url
class CheckmkProvider(BaseProvider):
    """Get alerts from Checkmk into Keep"""

    webhook_documentation_here_differs_from_general_documentation = True
    webhook_description = ""
    webhook_template = ""
    webhook_markdown = """
1. Checkmk supports custom notification scripts.
2. Install Keep webhook script following the [Keep documentation](https://docs.keephq.dev/providers/documentation/checkmk-provider).
3. In Checkmk WebUI, go to Setup.
4. Click on Add rule.
5. In the Notifications method section, select Webhook - KeepHQ and choose "Call with the following parameters:".
6. Configure the Rule properties, Contact selections, and Conditions according to your requirements.
7. The first parameter is the Webhook URL of Keep which is {keep_webhook_api_url}.
8. The second parameter is the API Key of Keep which is {api_key}.
9. Click on Save.
10. Now Checkmk will be able to send alerts to Keep.
"""

    SEVERITIES_MAP = {
        "OK": AlertSeverity.INFO,
        "WARN": AlertSeverity.WARNING,
        "CRIT": AlertSeverity.CRITICAL,
        "UNKNOWN": AlertSeverity.INFO,
    }

    STATUS_MAP = {
        "UP": AlertStatus.RESOLVED,
        "DOWN": AlertStatus.FIRING,
        "ACKNOWLEDGED": AlertStatus.ACKNOWLEDGED,
        "UNREACH": AlertStatus.FIRING,
    }

    PROVIDER_DISPLAY_NAME = "Checkmk"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_CATEGORY = ["Monitoring"]
    FINGERPRINT_FIELDS = ["id"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """
        No validation required for Checkmk provider.
        """
        pass

    @staticmethod
    def _normalize_utc_offset(long_date_time: str) -> str:
        """Rewrite a short or space-split UTC offset into the "+HHMM" form.

        "... +07 2024" becomes "... +0700 2024" and "... +07 00 2024" becomes
        "... +0700 2024". Any other input is returned unchanged.
        """
        parts = long_date_time.split()
        if len(parts) == 6 and len(parts[4]) == 3 and parts[4][0] in "+-":
            parts[4] += "00"
            return " ".join(parts)
        if len(parts) == 7 and parts[4][0] in "+-":
            return " ".join(parts[:4] + [parts[4] + parts[5]] + parts[6:])
        return long_date_time

    @staticmethod
    def convert_to_utc_isoformat(long_date_time: str, default: str) -> str:
        """Convert Checkmk's long date-time string to a UTC ISO 8601 string.

        Args:
            long_date_time: e.g. "Sat Oct 26 23:20:39 UTC 2024"; the zone may
                be a name or a numeric offset.
            default: value returned when the input is None or cannot be parsed.

        Returns:
            str: the UTC timestamp in ISO 8601 format, or ``default``.
        """
        # Early return if long_date_time is None
        if long_date_time is None:
            logger.warning("Received None as long_date_time, returning default value")
            return default

        logger.info(f"Converting {long_date_time} to UTC ISO format")

        # Offsets are normalized to "+HHMM" first, so a plain %z covers the
        # short ("+07") and space-separated ("+07 00") variants as well.
        normalized = CheckmkProvider._normalize_utc_offset(long_date_time)
        formats = [
            "%a %b %d %H:%M:%S %Z %Y",  # For timezone names (e.g., CEST, UTC)
            "%a %b %d %H:%M:%S %z %Y",  # For timezone offsets (e.g., +0700, -0500)
        ]

        for date_format in formats:
            try:
                local_dt = datetime.strptime(normalized, date_format)
            except ValueError:
                continue

            # Convert to UTC if it has timezone info, otherwise assume UTC
            if local_dt.tzinfo is None:
                local_dt = local_dt.replace(tzinfo=timezone.utc)

            # Return the ISO 8601 format
            return local_dt.astimezone(timezone.utc).isoformat()

        # If none of the formats match
        logger.error(f"Error converting {long_date_time} to UTC ISO format")
        return default

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: BaseProvider = None
    ) -> AlertDto | list[AlertDto]:
        """
        Service alerts and Host alerts have different fields, so we are mapping the fields based on the event type.
        """

        def _check_values(value):
            # Missing keys and empty strings are both treated as "no value".
            if value not in event or event.get(value) == "":
                return None
            return event.get(value)

        # Service alerts don't have a status field, so we are mapping the status based on the severity.
        def _set_severity(status):
            if status == "UP":
                return AlertSeverity.INFO
            elif status == "DOWN":
                return AlertSeverity.CRITICAL
            elif status == "UNREACH":
                return AlertSeverity.CRITICAL

        # https://forum.checkmk.com/t/convert-notify-shortdatetime-to-utc-timezone/20158/2
        microtime = _check_values("micro_time")
        logger.info(f"Microtime: {microtime}")
        last_received = None
        if microtime:
            try:
                ts = int(int(microtime) / 1000000)
                # An epoch value is zone-independent; render it in UTC so the
                # result matches the long_date_time branch below.
                last_received = datetime.fromtimestamp(
                    ts, tz=timezone.utc
                ).isoformat()
            except (TypeError, ValueError, OverflowError, OSError):
                logger.warning(
                    f"Invalid micro_time {microtime}, falling back to long_date_time"
                )
        if last_received is None:
            last_received = CheckmkProvider.convert_to_utc_isoformat(
                _check_values("long_date_time"), _check_values("short_date_time")
            )

        alert = AlertDto(
            id=_check_values("id"),
            name=_check_values("check_command"),
            description=_check_values("summary"),
            severity=CheckmkProvider.SEVERITIES_MAP.get(
                event.get("severity"), _set_severity(event.get("status"))
            ),
            status=CheckmkProvider.STATUS_MAP.get(
                event.get("status"), AlertStatus.FIRING
            ),
            host=_check_values("host"),
            alias=_check_values("alias"),
            address=_check_values("address"),
            service=_check_values("service"),
            source=["checkmk"],
            current_event=_check_values("event"),
            output=_check_values("output"),
            long_output=_check_values("long_output"),
            path_url=_check_values("url"),
            perf_data=_check_values("perf_data"),
            site=_check_values("site"),
            what=_check_values("what"),
            notification_type=_check_values("notification_type"),
            contact_name=_check_values("contact_name"),
            contact_email=_check_values("contact_email"),
            contact_pager=_check_values("contact_pager"),
            date=_check_values("date"),
            lastReceived=last_received,
            long_date=_check_values("long_date_time"),
        )

        return alert
import os
import sys

# Seconds to wait for Keep before giving up, so a hung connection cannot
# block the Checkmk notification worker indefinitely.
KEEP_REQUEST_TIMEOUT = 30

# Host notifications carry a host state instead of a service state; translate
# it to the service-style severity names used in the payload.
HOST_STATE_TO_SEVERITY = {
    "UP": "OK",
    "DOWN": "CRIT",
    "UNREACH": "CRIT",
}


# Get keep Webhook URL and API Key from environment variables
def GetPluginParams():
    """
    Read the plugin parameters configured in the Checkmk notification rule.

    ``NOTIFY_PARAMETER_1`` is the Keep webhook URL and ``NOTIFY_PARAMETER_2``
    is the Keep API key.

    Returns:
        ``(0, webhook_url)`` when both parameters are set, otherwise
        ``(2, "")``. See
        https://docs.checkmk.com/latest/en/notifications.html#_traceable_notifications
    """
    webhook_url = os.environ.get("NOTIFY_PARAMETER_1")
    api_key = os.environ.get("NOTIFY_PARAMETER_2")

    # Treat empty values like missing ones: neither can produce a valid request.
    if not webhook_url or not api_key:
        print("keep-plugin: Missing Webhook URL or API Key")
        return 2, ""

    return 0, webhook_url


# Notification details are stored in environment variables
def GetNotificationDetails():
    """
    Build the JSON payload for Keep from the Checkmk notification context.

    Returns:
        A dict describing either a service notification (when
        ``NOTIFY_WHAT`` is ``"SERVICE"``) or a host notification (otherwise).

    See https://docs.checkmk.com/latest/en/notifications.html#environment_variables
    """
    env = os.environ.get
    # The environment is deliberately NOT printed: it contains the Keep API
    # key (NOTIFY_PARAMETER_2), which would otherwise end up in the logs.

    what = env("NOTIFY_WHAT")
    notification_type = env("NOTIFY_NOTIFICATIONTYPE")
    hostname = env("NOTIFY_HOSTNAME")

    # General information
    general = {
        "site": env("OMD_SITE"),
        "what": what,
        "notification_type": notification_type,
        "contact_name": env("NOTIFY_CONTACTNAME"),
        "contact_email": env("NOTIFY_CONTACTEMAIL"),
        "contact_pager": env("NOTIFY_CONTACTPAGER"),
        "date": env("NOTIFY_DATE"),
        "long_date_time": env("NOTIFY_LONGDATETIME"),
        "short_date_time": env("NOTIFY_SHORTDATETIME"),
        "micro_time": env("NOTIFY_MICROTIME"),
    }

    # Fields shared by host and service payloads
    host_fields = {
        "host": hostname,
        "alias": env("NOTIFY_HOSTALIAS"),
        "address": env("NOTIFY_HOSTADDRESS"),
    }

    if what == "SERVICE":
        # Service related information
        # See NOTIFY_NOTIFICATIONTYPE in
        # https://docs.checkmk.com/latest/en/notifications.html#environment_variables
        # FLAPPINGSTART, FLAPPINGSTOP, FLAPPINGDISABLED, DOWNTIMESTART,
        # DOWNTIMEEND, DOWNTIMECANCELLED, etc. are reported as "DOWN".
        status = {
            "RECOVERY": "UP",
            "PROBLEM": "DOWN",
            "ACKNOWLEDGEMENT": "ACKNOWLEDGED",
        }.get(notification_type, "DOWN")

        service = env("NOTIFY_SERVICEDESC")
        service_state = env("NOTIFY_SERVICESTATE")
        event = f"{env('NOTIFY_LASTSERVICESTATE')} -> {service_state}"
        return {
            "id": env("NOTIFY_SERVICEPROBLEMID"),
            "summary": f"CheckMK {hostname}/{service} {event}",
            **host_fields,
            "service": service,
            "event": event,
            "output": env("NOTIFY_SERVICEOUTPUT"),
            "long_output": env("NOTIFY_LONGSERVICEOUTPUT"),
            "status": status,
            "severity": service_state,
            "url": env("NOTIFY_SERVICEURL"),
            "check_command": env("NOTIFY_SERVICECHECKCOMMAND"),
            "perf_data": env("NOTIFY_SERVICEPERFDATA"),
            **general,
        }

    # Host related information
    host_state = env("NOTIFY_HOSTSTATE")
    event = f"{env('NOTIFY_LASTHOSTSHORTSTATE')} -> {host_state}"
    return {
        "id": env("NOTIFY_HOSTPROBLEMID"),
        "summary": f"CheckMK {hostname} - {event}",
        **host_fields,
        "event": event,
        "output": env("NOTIFY_HOSTOUTPUT"),
        "long_output": env("NOTIFY_LONGHOSTOUTPUT"),
        "status": host_state,
        # Derive the severity from the host state so a DOWN/UNREACH host is
        # not reported as "OK"; unknown states keep the previous "OK" value.
        "severity": HOST_STATE_TO_SEVERITY.get(host_state, "OK"),
        "url": env("NOTIFY_HOSTURL"),
        "check_command": env("NOTIFY_HOSTCHECKCOMMAND"),
        **general,
    }


# Start Keep workflow
def StartKeepWorkflow(WebHookURL, data):
    """
    POST the notification payload to the Keep webhook.

    Args:
        WebHookURL: The Keep webhook URL.
        data: JSON-serialisable payload built by ``GetNotificationDetails``.

    Returns:
        ``0`` when Keep answers with any 2xx status, ``2`` otherwise
        (including network errors, timeouts and a missing ``requests``
        package).
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "X-API-KEY": str(os.environ.get("NOTIFY_PARAMETER_2")),
    }

    try:
        # Imported here so that a missing package is reported through the
        # plugin's return code instead of an import-time traceback.
        import requests

        response = requests.post(
            WebHookURL, headers=headers, json=data, timeout=KEEP_REQUEST_TIMEOUT
        )
    except Exception as e:
        print(f"keep-plugin: An error occurred: {e}")
        return 2

    # Any 2xx answer means Keep took the event, not only a plain 200.
    if 200 <= response.status_code < 300:
        print("keep-plugin: Workflow started successfully.")
        return 0

    print(
        f"keep-plugin: Failed to start the workflow. Status code: {response.status_code}"
    )
    print(response.text)
    return 2


def main():
    """
    Entry point of the notification plugin.

    Returns:
        The process exit code: ``0`` on success, ``2`` on failure.
    """
    print("keep-plugin: Starting...")
    return_code, WebHookURL = GetPluginParams()

    if return_code != 0:
        return return_code  # Abort, if parameter for the webhook is missing

    print("keep-plugin: Getting notification details...")
    data = GetNotificationDetails()

    print("keep-plugin: Starting Keep workflow...")
    return_code = StartKeepWorkflow(WebHookURL, data)

    print("keep-plugin: Finished.")
    return return_code


if __name__ == "__main__":
    sys.exit(main())
class CiliumProvider(BaseTopologyProvider):
    """Manage Cilium provider.

    Builds a service topology from the flows reported by a Cilium Hubble
    relay: every observed flow contributes a dependency edge between the
    source and destination service.
    """

    PROVIDER_TAGS = ["topology"]
    PROVIDER_DISPLAY_NAME = "Cilium"
    PROVIDER_CATEGORY = ["Cloud Infrastructure", "Security"]

    # Label prefixes that carry a workload's application name.
    _APP_NAME_LABEL_PREFIXES = ("k8s:app=", "k8s:app.kubernetes.io/name=")
    # Label prefix that assigns a pod to a Keep application.
    _KEEP_APP_LABEL_PREFIX = "k8s:keepapp="

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_scopes(self):
        """
        Validates that the user has the required scopes to use the provider.
        """
        return {}

    def validate_config(self):
        """Parse the authentication section of the provider config."""
        self.authentication_config = CiliumProviderAuthConfig(
            **self.config.authentication
        )

    def _extract_name_from_label(self, label: str) -> str:
        """Return the app name carried by ``label``, or ``None`` if it has none."""
        if label.startswith(self._APP_NAME_LABEL_PREFIXES):
            # Split only on the first "=" so the whole value is kept.
            return label.split("=", 1)[1]
        return None

    def _get_service_name(self, endpoint) -> str:
        """
        Derive a service name for a flow endpoint.

        Tries, in order: the first workload name, an app-name label, and
        finally the pod name with its trailing generated segments removed.
        Returns ``"unknown"`` when the endpoint has no pod name either.
        """
        # 1. try to get from workload
        if endpoint.workloads:
            return endpoint.workloads[0].name

        # 2. try to get from labels
        for label in endpoint.labels:
            name = self._extract_name_from_label(label)
            if name:
                return name

        # 3. try to get from pod name
        service = endpoint.pod_name
        parts = service.split("-")
        if len(parts) > 2:
            return "-".join(parts[:-2])
        elif len(parts) == 2:
            return parts[0]

        if not service:
            return "unknown"
        return service

    @staticmethod
    def _category_for_flow(flow) -> str:
        """Classify a flow by its TCP destination port (5432 is postgres)."""
        return "postgres" if flow.l4.TCP.destination_port == 5432 else "http"

    @staticmethod
    def _record_source(service_map, source, destination, flow) -> None:
        """Register ``destination`` as a dependency of ``source`` and refresh its metadata."""
        entry = service_map[source]
        entry["dependencies"].add(destination)
        entry["namespace"] = flow.source.namespace
        entry["tags"] = (
            list(flow.source.labels)
            + [flow.source.pod_name, flow.source.cluster_name]
            + list(flow.node_labels)
        )

    @staticmethod
    def _record_destination(
        service_map, destination, source, namespace, category, flow
    ) -> None:
        """Register ``source`` on the destination entry, creating the entry if needed."""
        if destination not in service_map:
            # The category is only set when the destination is first seen.
            service_map[destination] = {
                "dependencies": set(),
                "namespace": namespace,
                "category": category,
            }
        service_map[destination]["dependencies"].add(source)
        service_map[destination]["tags"] = list(flow.destination.labels)

    def pull_topology(self) -> tuple[list[TopologyServiceInDto], dict]:
        """
        Pull the last 1000 flows from the Hubble relay and build the topology.

        Returns:
            A ``(services, applications)`` tuple. ``applications`` is always
            an empty dict because the application relations are embedded in
            each service.
        """
        import uuid

        # for some providers that depends on grpc like cilium provider, this might fail on imports not from Keep (such as the docs script)
        from keep.providers.cilium_provider.grpc.observer_pb2 import (  # noqa
            FlowFilter,
            GetFlowsRequest,
        )
        from keep.providers.cilium_provider.grpc.observer_pb2_grpc import (  # noqa
            ObserverStub,
        )

        # Create a request for the last 1000 flows
        # https://docs.cilium.io/en/stable/_api/v1/flow/README/#flow-FlowFilter
        request = GetFlowsRequest(
            number=1000, whitelist=[FlowFilter(source_pod={}, destination_pod={})]
        )

        # Query the API. The stream is fully consumed inside the "with" block
        # so the channel can be closed deterministically afterwards.
        with grpc.insecure_channel(
            self.authentication_config.cilium_base_endpoint
        ) as channel:
            responses = list(ObserverStub(channel).GetFlows(request))

        # Process the responses
        service_map = defaultdict(lambda: {"dependencies": set(), "namespace": ""})

        # Track applications and their services
        application_to_services = {}

        for response in responses:
            flow = response.flow
            if not flow.source:
                continue

            # https://docs.cilium.io/en/stable/_api/v1/flow/README/#endpoint
            if flow.source.pod_name and flow.destination.pod_name:
                source = self._get_service_name(flow.source)
                destination = self._get_service_name(flow.destination)

                # Check for application label
                application_label = [
                    label
                    for label in flow.source.labels
                    if label.startswith(self._KEEP_APP_LABEL_PREFIX)
                ]
                # If no application label, skip
                # NOTE(review): this skips the whole flow, so pod-to-pod
                # flows from unlabelled sources never reach the topology.
                # Kept as is; confirm that this is intended.
                if not application_label:
                    continue

                application_id = application_label[0].split("=", 1)[1]
                # Add service to application
                application_to_services.setdefault(application_id, set()).add(source)

                self._record_source(service_map, source, destination, flow)
                self._record_destination(
                    service_map,
                    destination,
                    source,
                    flow.destination.namespace or "internet",
                    self._category_for_flow(flow),
                    flow,
                )
            # if its outside the cluster
            elif (
                flow.destination
                and flow.destination.labels
                and "reserved:world" in flow.destination.labels
            ):
                source = self._get_service_name(flow.source)
                destination = flow.IP.destination

                self._record_source(service_map, source, destination, flow)

                # Check if this source service belongs to any applications
                for app_id, services in application_to_services.items():
                    if source in services:
                        self.logger.debug(
                            f"Adding {destination} to application {app_id}"
                        )
                        services.add(destination)

                # destination_namespace is external
                self._record_destination(
                    service_map,
                    destination,
                    source,
                    "internet",
                    self._category_for_flow(flow),
                    flow,
                )

        # Convert to TopologyServiceInDto
        topology = []
        # One stable UUID per application id for the duration of this pull.
        app_ids_to_uuids = {}
        for service, data in service_map.items():
            try:
                # Create application_relations dictionary for this service
                application_relations = {}
                for app_id, services in application_to_services.items():
                    if service in services:
                        if app_id not in app_ids_to_uuids:
                            app_ids_to_uuids[app_id] = uuid.uuid4()
                        application_relations[app_ids_to_uuids[app_id]] = app_id

                topology_service = TopologyServiceInDto(
                    source_provider_id=self.provider_id,
                    service=service,
                    display_name=service,
                    environment=data["namespace"],
                    dependencies={dep: "network" for dep in data["dependencies"]},
                    tags=list(data["tags"]),
                    category=data.get("category", "http"),
                    namespace=data["namespace"],
                    application_relations=(
                        application_relations if application_relations else None
                    ),
                )
                topology.append(topology_service)
            except Exception as e:
                # Best effort: one malformed service must not abort the pull.
                self.logger.error(
                    "Error processing service",
                    extra={
                        "service": service,
                        "data": data,
                        "error": str(e),
                    },
                )

        self.logger.info(
            "Topology pulling completed",
            extra={
                "tenant_id": self.context_manager.tenant_id,
                "len_of_topology": len(topology),
            },
        )
        # Return only the topology data as the application info is now included in each service
        return topology, {}

    def get_existing_services(self, all_services):
        """Helper function to create a set of all valid service names"""
        return set(all_services)

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass
""" # Create directories for the proto files os.makedirs("hubble_proto/google/protobuf", exist_ok=True) os.makedirs("hubble_proto/flow", exist_ok=True) os.makedirs("hubble_proto/relay", exist_ok=True) # Download the necessary proto files proto_files = [ ( "https://raw.githubusercontent.com/cilium/cilium/master/api/v1/flow/flow.proto", "hubble_proto/flow/flow.proto", ), ( "https://raw.githubusercontent.com/cilium/cilium/master/api/v1/observer/observer.proto", "hubble_proto/observer.proto", ), ( "https://raw.githubusercontent.com/cilium/cilium/master/api/v1/relay/relay.proto", "hubble_proto/relay/relay.proto", ), ( "https://raw.githubusercontent.com/protocolbuffers/protobuf/master/src/google/protobuf/timestamp.proto", "hubble_proto/google/protobuf/timestamp.proto", ), ( "https://raw.githubusercontent.com/protocolbuffers/protobuf/master/src/google/protobuf/duration.proto", "hubble_proto/google/protobuf/duration.proto", ), ( "https://raw.githubusercontent.com/protocolbuffers/protobuf/master/src/google/protobuf/wrappers.proto", "hubble_proto/google/protobuf/wrappers.proto", ), ] for proto_url, proto_path in proto_files: subprocess.run(["curl", "-o", proto_path, proto_url]) # Generate Python code from proto files subprocess.run( [ "python", "-m", "grpc_tools.protoc", "-I", "hubble_proto", "--python_out=.", "--grpc_python_out=.", "hubble_proto/flow/flow.proto", "hubble_proto/observer.proto", "hubble_proto/relay/relay.proto", ] ) print("gRPC Python client generation completed.") ================================================ FILE: keep/providers/cilium_provider/grpc/__init__.py ================================================ ================================================ FILE: keep/providers/cilium_provider/grpc/flow/__init__.py ================================================ ================================================ FILE: keep/providers/cilium_provider/grpc/flow/flow.proto ================================================ // SPDX-License-Identifier: Apache-2.0 
// Copyright Authors of Hubble syntax = "proto3"; import "google/protobuf/any.proto"; import "google/protobuf/wrappers.proto"; import "google/protobuf/timestamp.proto"; package flow; option go_package = "github.com/cilium/cilium/api/v1/flow"; message Flow { google.protobuf.Timestamp time = 1; // uuid is a universally unique identifier for this flow. string uuid = 34; Verdict verdict = 2; // only applicable to Verdict = DROPPED. // deprecated in favor of drop_reason_desc. uint32 drop_reason = 3 [deprecated=true]; // auth_type is the authentication type specified for the flow in Cilium Network Policy. // Only set on policy verdict events. AuthType auth_type = 35; // l2 Ethernet ethernet = 4; // l3 IP IP = 5; // l4 Layer4 l4 = 6; reserved 7; // removed, do not use Endpoint source = 8; Endpoint destination = 9; FlowType Type = 10; // NodeName is the name of the node from which this Flow was captured. string node_name = 11; // node labels in `foo=bar` format. repeated string node_labels = 37; reserved 12; // removed, do not use // all names the source IP can have. repeated string source_names = 13; // all names the destination IP can have. repeated string destination_names = 14; // L7 information. This field is set if and only if FlowType is L7. Layer7 l7 = 15; // Deprecated. This suffers from false negatives due to protobuf not being // able to distinguish between the value being false or it being absent. // Please use is_reply instead. bool reply = 16 [deprecated=true]; reserved 17, 18; // removed, do not use // EventType of the originating Cilium event CiliumEventType event_type = 19; // source_service contains the service name of the source Service source_service = 20; // destination_service contains the service name of the destination Service destination_service = 21; // traffic_direction of the connection, e.g. 
ingress or egress TrafficDirection traffic_direction = 22; // policy_match_type is only applicable to the cilium event type PolicyVerdict // https://github.com/cilium/cilium/blob/e831859b5cc336c6d964a6d35bbd34d1840e21b9/pkg/monitor/datapath_policy.go#L50 uint32 policy_match_type = 23; // Only applicable to cilium trace notifications, blank for other types. TraceObservationPoint trace_observation_point = 24; // Cilium datapath trace reason info. TraceReason trace_reason = 36; // Cilium datapath filename and line number. Currently only applicable when // Verdict = DROPPED. FileInfo file = 38; // only applicable to Verdict = DROPPED. DropReason drop_reason_desc = 25; // is_reply indicates that this was a packet (L4) or message (L7) in the // reply direction. May be absent (in which case it is unknown whether it // is a reply or not). google.protobuf.BoolValue is_reply = 26; // Only applicable to cilium debug capture events, blank for other types DebugCapturePoint debug_capture_point = 27; // interface is the network interface on which this flow was observed NetworkInterface interface = 28; // proxy_port indicates the port of the proxy to which the flow was forwarded uint32 proxy_port = 29; // trace_context contains information about a trace related to the flow, if // any. TraceContext trace_context = 30; // sock_xlate_point is the socket translation point. // Only applicable to TraceSock notifications, blank for other types SocketTranslationPoint sock_xlate_point = 31; // socket_cookie is the Linux kernel socket cookie for this flow. // Only applicable to TraceSock notifications, zero for other types uint64 socket_cookie = 32; // cgroup_id of the process which emitted this event. // Only applicable to TraceSock notifications, zero for other types uint64 cgroup_id = 33; // This is a temporary workaround to support summary field for pb.Flow without // duplicating logic from the old parser. This field will be removed once we // fully migrate to the new parser. 
string Summary = 100000 [deprecated=true]; // extensions can be used to add arbitrary additional metadata to flows. // This can be used to extend functionality for other Hubble compatible // APIs, or experiment with new functionality without needing to change the public API. google.protobuf.Any extensions = 150000; // The CiliumNetworkPolicies allowing the egress of the flow. repeated Policy egress_allowed_by = 21001; // The CiliumNetworkPolicies allowing the ingress of the flow. repeated Policy ingress_allowed_by = 21002; // The CiliumNetworkPolicies denying the egress of the flow. repeated Policy egress_denied_by = 21004; // The CiliumNetworkPolicies denying the ingress of the flow. repeated Policy ingress_denied_by = 21005; } enum FlowType { UNKNOWN_TYPE = 0; L3_L4 = 1; // not sure about the underscore here, but `L34` also reads strange L7 = 2; SOCK = 3; } // These types correspond to definitions in pkg/policy/l4.go. enum AuthType { DISABLED = 0; SPIRE = 1; TEST_ALWAYS_FAIL = 2; } enum TraceObservationPoint { // Cilium treats 0 as TO_LXC, but its's something we should work to remove. // This is intentionally set as unknown, so proto API can guarantee the // observation point is always going to be present on trace events. UNKNOWN_POINT = 0; // TO_PROXY indicates network packets are transmitted towards the l7 proxy. TO_PROXY = 1; // TO_HOST indicates network packets are transmitted towards the host // namespace. TO_HOST = 2; // TO_STACK indicates network packets are transmitted towards the Linux // kernel network stack on host machine. TO_STACK = 3; // TO_OVERLAY indicates network packets are transmitted towards the tunnel // device. TO_OVERLAY = 4; // TO_ENDPOINT indicates network packets are transmitted towards endpoints // (containers). TO_ENDPOINT = 101; // FROM_ENDPOINT indicates network packets were received from endpoints // (containers). FROM_ENDPOINT = 5; // FROM_PROXY indicates network packets were received from the l7 proxy. 
FROM_PROXY = 6; // FROM_HOST indicates network packets were received from the host // namespace. FROM_HOST = 7; // FROM_STACK indicates network packets were received from the Linux kernel // network stack on host machine. FROM_STACK = 8; // FROM_OVERLAY indicates network packets were received from the tunnel // device. FROM_OVERLAY = 9; // FROM_NETWORK indicates network packets were received from native // devices. FROM_NETWORK = 10; // TO_NETWORK indicates network packets are transmitted towards native // devices. TO_NETWORK = 11; } enum TraceReason { TRACE_REASON_UNKNOWN = 0; NEW = 1; ESTABLISHED = 2; REPLY = 3; RELATED = 4; REOPENED = 5 [deprecated=true]; SRV6_ENCAP = 6; SRV6_DECAP = 7; ENCRYPT_OVERLAY = 8; } message FileInfo { string name = 1; uint32 line = 2; } message Layer4 { oneof protocol { TCP TCP = 1; UDP UDP = 2; // ICMP is technically not L4, but mutually exclusive with the above ICMPv4 ICMPv4 = 3; ICMPv6 ICMPv6 = 4; SCTP SCTP = 5; } } // This enum corresponds to Cilium's L7 accesslog [FlowType](https://github.com/cilium/cilium/blob/728c79e427438ab6f8d9375b62fccd6fed4ace3a/pkg/proxy/accesslog/record.go#L26): enum L7FlowType { UNKNOWN_L7_TYPE = 0; REQUEST = 1; RESPONSE = 2; SAMPLE = 3; } // Message for L7 flow, which roughly corresponds to Cilium's accesslog [LogRecord](https://github.com/cilium/cilium/blob/728c79e427438ab6f8d9375b62fccd6fed4ace3a/pkg/proxy/accesslog/record.go#L141): message Layer7 { L7FlowType type = 1; // Latency of the response uint64 latency_ns = 2; // L7 field. This field is set if and only if FlowType is L7. oneof record { DNS dns = 100; HTTP http = 101; Kafka kafka = 102; } } // TraceContext contains trace context propagation data, i.e. information about a // distributed trace. // For more information about trace context, check the [W3C Trace Context specification](https://www.w3.org/TR/trace-context/). message TraceContext { // parent identifies the incoming request in a tracing system. 
TraceParent parent = 1; } // TraceParent identifies the incoming request in a tracing system. message TraceParent { // trace_id is a unique value that identifies a trace. It is a byte array // represented as a hex string. string trace_id = 1; } message Endpoint { uint32 ID = 1; uint32 identity = 2; string cluster_name = 7; string namespace = 3; // labels in `foo=bar` format. repeated string labels = 4; string pod_name = 5; repeated Workload workloads = 6; } message Workload { string name = 1; string kind = 2; } message TCP { uint32 source_port = 1; uint32 destination_port = 2; TCPFlags flags = 3; } message IP { string source = 1; // source_xlated is the post translation source IP when the flow was SNATed // (and in that case source is the the original source IP). string source_xlated = 5; string destination = 2; IPVersion ipVersion = 3; // This field indicates whether the TraceReasonEncryptMask is set or not. // https://github.com/cilium/cilium/blob/ba0ed147bd5bb342f67b1794c2ad13c6e99d5236/pkg/monitor/datapath_trace.go#L27 bool encrypted = 4; } message Ethernet { string source = 1; string destination = 2; } message TCPFlags { bool FIN = 1; bool SYN = 2; bool RST = 3; bool PSH = 4; bool ACK = 5; bool URG = 6; bool ECE = 7; bool CWR = 8; bool NS = 9; } message UDP { uint32 source_port = 1; uint32 destination_port = 2; } message SCTP { uint32 source_port = 1; uint32 destination_port = 2; } message ICMPv4 { uint32 type = 1; uint32 code = 2; } message ICMPv6 { uint32 type = 1; uint32 code = 2; } enum IPVersion { IP_NOT_USED = 0; IPv4 = 1; IPv6 = 2; } enum Verdict { // UNKNOWN is used if there is no verdict for this flow event VERDICT_UNKNOWN = 0; // FORWARDED is used for flow events where the trace point has forwarded // this packet or connection to the next processing entity. FORWARDED = 1; // DROPPED is used for flow events where the connection or packet has // been dropped (e.g. due to a malformed packet, it being rejected by a // network policy etc). 
The exact drop reason may be found in drop_reason_desc. DROPPED = 2; // ERROR is used for flow events where an error occurred during processing ERROR = 3; // AUDIT is used on policy verdict events in policy audit mode, to // denominate flows that would have been dropped by policy if audit mode // was turned off AUDIT = 4; // REDIRECTED is used for flow events which have been redirected to the proxy REDIRECTED = 5; // TRACED is used for flow events which have been observed at a trace point, // but no particular verdict has been reached yet TRACED = 6; // TRANSLATED is used for flow events where an address has been translated TRANSLATED = 7; } // These values are shared with pkg/monitor/api/drop.go and bpf/lib/common.h. // Note that non-drop reasons (i.e. values less than api.DropMin) are not used // here. enum DropReason { // non-drop reasons DROP_REASON_UNKNOWN = 0; // drop reasons INVALID_SOURCE_MAC = 130 [deprecated = true]; INVALID_DESTINATION_MAC = 131 [deprecated = true]; INVALID_SOURCE_IP = 132; POLICY_DENIED = 133; INVALID_PACKET_DROPPED = 134; CT_TRUNCATED_OR_INVALID_HEADER = 135; CT_MISSING_TCP_ACK_FLAG = 136; CT_UNKNOWN_L4_PROTOCOL = 137; CT_CANNOT_CREATE_ENTRY_FROM_PACKET = 138 [deprecated = true]; UNSUPPORTED_L3_PROTOCOL = 139; MISSED_TAIL_CALL = 140; ERROR_WRITING_TO_PACKET = 141; UNKNOWN_L4_PROTOCOL = 142; UNKNOWN_ICMPV4_CODE = 143; UNKNOWN_ICMPV4_TYPE = 144; UNKNOWN_ICMPV6_CODE = 145; UNKNOWN_ICMPV6_TYPE = 146; ERROR_RETRIEVING_TUNNEL_KEY = 147; ERROR_RETRIEVING_TUNNEL_OPTIONS = 148 [deprecated = true]; INVALID_GENEVE_OPTION = 149 [deprecated = true]; UNKNOWN_L3_TARGET_ADDRESS = 150; STALE_OR_UNROUTABLE_IP = 151; NO_MATCHING_LOCAL_CONTAINER_FOUND = 152 [deprecated = true]; ERROR_WHILE_CORRECTING_L3_CHECKSUM = 153; ERROR_WHILE_CORRECTING_L4_CHECKSUM = 154; CT_MAP_INSERTION_FAILED = 155; INVALID_IPV6_EXTENSION_HEADER = 156; IP_FRAGMENTATION_NOT_SUPPORTED = 157; SERVICE_BACKEND_NOT_FOUND = 158; NO_TUNNEL_OR_ENCAPSULATION_ENDPOINT = 160; 
FAILED_TO_INSERT_INTO_PROXYMAP = 161; REACHED_EDT_RATE_LIMITING_DROP_HORIZON = 162; UNKNOWN_CONNECTION_TRACKING_STATE = 163; LOCAL_HOST_IS_UNREACHABLE = 164; NO_CONFIGURATION_AVAILABLE_TO_PERFORM_POLICY_DECISION = 165; UNSUPPORTED_L2_PROTOCOL = 166; NO_MAPPING_FOR_NAT_MASQUERADE = 167; UNSUPPORTED_PROTOCOL_FOR_NAT_MASQUERADE = 168; FIB_LOOKUP_FAILED = 169; ENCAPSULATION_TRAFFIC_IS_PROHIBITED = 170; INVALID_IDENTITY = 171; UNKNOWN_SENDER = 172; NAT_NOT_NEEDED = 173; IS_A_CLUSTERIP = 174; FIRST_LOGICAL_DATAGRAM_FRAGMENT_NOT_FOUND = 175; FORBIDDEN_ICMPV6_MESSAGE = 176; DENIED_BY_LB_SRC_RANGE_CHECK = 177; SOCKET_LOOKUP_FAILED = 178; SOCKET_ASSIGN_FAILED = 179; PROXY_REDIRECTION_NOT_SUPPORTED_FOR_PROTOCOL = 180; POLICY_DENY = 181; VLAN_FILTERED = 182; INVALID_VNI = 183; INVALID_TC_BUFFER = 184; NO_SID = 185; MISSING_SRV6_STATE = 186 [deprecated = true]; NAT46 = 187; NAT64 = 188; AUTH_REQUIRED = 189; CT_NO_MAP_FOUND = 190; SNAT_NO_MAP_FOUND = 191; INVALID_CLUSTER_ID = 192; UNSUPPORTED_PROTOCOL_FOR_DSR_ENCAP = 193; NO_EGRESS_GATEWAY = 194; UNENCRYPTED_TRAFFIC = 195; TTL_EXCEEDED = 196; NO_NODE_ID = 197; DROP_RATE_LIMITED = 198; IGMP_HANDLED = 199; IGMP_SUBSCRIBED = 200; MULTICAST_HANDLED = 201; // A BPF program wants to tail call into bpf_host, but the host datapath // hasn't been loaded yet. DROP_HOST_NOT_READY = 202; // A BPF program wants to tail call some endpoint's policy program in // cilium_call_policy, but the program is not available. DROP_EP_NOT_READY = 203; // An Egress Gateway node matched a packet against an Egress Gateway policy // that didn't select a valid Egress IP. DROP_NO_EGRESS_IP = 204; } enum TrafficDirection { TRAFFIC_DIRECTION_UNKNOWN = 0; INGRESS = 1; EGRESS = 2; } // These values are shared with pkg/monitor/api/datapath_debug.go and bpf/lib/dbg.h. 
enum DebugCapturePoint {
    DBG_CAPTURE_POINT_UNKNOWN = 0;
    reserved 1 to 3;
    DBG_CAPTURE_DELIVERY = 4;
    DBG_CAPTURE_FROM_LB = 5;
    DBG_CAPTURE_AFTER_V46 = 6;
    DBG_CAPTURE_AFTER_V64 = 7;
    DBG_CAPTURE_PROXY_PRE = 8;
    DBG_CAPTURE_PROXY_POST = 9;
    DBG_CAPTURE_SNAT_PRE = 10;
    DBG_CAPTURE_SNAT_POST = 11;
}

// Policy identifies a policy by name, namespace, labels, revision and kind.
message Policy {
    string name = 1;
    string namespace = 2;
    repeated string labels = 3;
    uint64 revision = 4;
    string kind = 5;
}

// EventTypeFilter is a filter describing a particular event type.
message EventTypeFilter {
    // type is the primary flow type as defined by:
    // github.com/cilium/cilium/pkg/monitor/api.MessageType*
    int32 type = 1;

    // match_sub_type is set to true when matching on the sub_type should
    // be done. This flag is required as 0 is a valid sub_type.
    bool match_sub_type = 2;

    // sub_type is the secondary type, e.g.
    // - github.com/cilium/cilium/pkg/monitor/api.Trace*
    int32 sub_type = 3;
}

// CiliumEventType from which the flow originated.
message CiliumEventType {
    // type of event the flow originated from, i.e.
    // github.com/cilium/cilium/pkg/monitor/api.MessageType*
    int32 type = 1;
    // sub_type may indicate more details depending on type, e.g.
    // - github.com/cilium/cilium/pkg/monitor/api.Trace*
    // - github.com/cilium/cilium/pkg/monitor/api.Drop*
    // - github.com/cilium/cilium/pkg/monitor/api.DbgCapture*
    int32 sub_type = 2;
}

// FlowFilter represents an individual flow filter. All fields are optional. If
// multiple fields are set, then all fields must match for the filter to match.
message FlowFilter {
    // uuid filters by a list of flow uuids.
    repeated string uuid = 29;
    // source_ip filters by a list of source ips. Each of the source ips can be
    // specified as an exact match (e.g. "1.1.1.1") or as a CIDR range (e.g.
    // "1.1.1.0/24").
    repeated string source_ip = 1;
    // source_ip_xlated filters by a list of IPs. Each of the IPs can be
    // specified as an exact match (e.g. "1.1.1.1") or as a CIDR range (e.g.
    // "1.1.1.0/24").
    repeated string source_ip_xlated = 34;
    // source_pod filters by a list of source pod name prefixes, optionally
    // within a given namespace (e.g. "xwing", "kube-system/coredns-").
    // The pod name can be omitted to only filter by namespace
    // (e.g. "kube-system/") or the namespace can be omitted to filter for
    // pods in any namespace (e.g. "/xwing")
    repeated string source_pod = 2;
    // source_fqdn filters by a list of source fully qualified domain names
    repeated string source_fqdn = 7;
    // source_label filters on a list of source label selectors. Selectors
    // support the full Kubernetes label selector syntax.
    repeated string source_label = 10;
    // source_service filters on a list of source service names. This field
    // supports the same syntax as the source_pod field.
    repeated string source_service = 16;
    // source_workload filters by a list of source workloads.
    repeated Workload source_workload = 26;

    // destination_ip filters by a list of destination ips. Each of the
    // destination ips can be specified as an exact match (e.g. "1.1.1.1") or
    // as a CIDR range (e.g. "1.1.1.0/24").
    repeated string destination_ip = 3;
    // destination_pod filters by a list of destination pod names
    repeated string destination_pod = 4;
    // destination_fqdn filters by a list of destination fully qualified domain names
    repeated string destination_fqdn = 8;
    // destination_label filters on a list of destination label selectors
    repeated string destination_label = 11;
    // destination_service filters on a list of destination service names
    repeated string destination_service = 17;
    // destination_workload filters by a list of destination workloads.
    repeated Workload destination_workload = 27;

    // traffic_direction filters flow by direction of the connection, e.g.
    // ingress or egress.
    repeated TrafficDirection traffic_direction = 30;
    // only return Flows that were classified with a particular verdict.
    repeated Verdict verdict = 5;
    // only applicable to Verdict = DROPPED (e.g.
    // "POLICY_DENIED", "UNSUPPORTED_L3_PROTOCOL")
    repeated DropReason drop_reason_desc = 33;
    // interface is the network interface on which this flow was observed.
    repeated NetworkInterface interface = 35;
    // event_type is the list of event types to filter on
    repeated EventTypeFilter event_type = 6;
    // http_status_code is a list of string prefixes (e.g. "4+", "404", "5+")
    // to filter on the HTTP status code
    repeated string http_status_code = 9;

    // protocol filters flows by L4 or L7 protocol (e.g. "tcp", "http")
    repeated string protocol = 12;

    // source_port filters flows by L4 source port
    repeated string source_port = 13;
    // destination_port filters flows by L4 destination port
    repeated string destination_port = 14;
    // reply filters flows based on the direction of the flow.
    repeated bool reply = 15;
    // dns_query filters L7 DNS flows by query patterns (RE2 regex), e.g. 'kube.*local'.
    repeated string dns_query = 18;
    // source_identity filters by the security identity of the source endpoint.
    repeated uint32 source_identity = 19;
    // destination_identity filters by the security identity of the destination endpoint.
    repeated uint32 destination_identity = 20;

    // GET, POST, PUT, etc. methods. This type of field is well suited for an
    // enum but every single existing place is using a string already.
    repeated string http_method = 21;
    // http_path is a list of regular expressions to filter on the HTTP path.
    repeated string http_path = 22;
    // http_url is a list of regular expressions to filter on the HTTP URL.
    repeated string http_url = 31;
    // http_header is a list of key:value pairs to filter on the HTTP headers.
    repeated HTTPHeader http_header = 32;

    // tcp_flags filters flows based on TCP header flags
    repeated TCPFlags tcp_flags = 23;

    // node_name is a list of patterns to filter on the node name, e.g. "k8s*",
    // "test-cluster/*.domain.com", "cluster-name/" etc.
    repeated string node_name = 24;
    // node_labels filters on a list of node label selectors. Selectors support
    // the full Kubernetes label selector syntax.
    repeated string node_labels = 36;

    // filter based on IP version (ipv4 or ipv6)
    repeated IPVersion ip_version = 25;

    // trace_id filters flows by trace ID
    repeated string trace_id = 28;

    // Experimental contains filters that are not stable yet. Support for
    // experimental features is always optional and subject to change.
    message Experimental {
        // cel_expression takes a common expression language (CEL) expression
        // returning a boolean to determine if the filter matched or not.
        // You can use the `_flow` variable to access fields on the flow using
        // the flow.Flow protobuf field names.
        // See https://github.com/google/cel-spec/blob/v0.14.0/doc/intro.md#introduction
        // for more details on CEL and accessing the protobuf fields in CEL.
        // Using CEL has performance cost compared to other filters, so prefer
        // using non-CEL filters when possible, and try to specify CEL filters
        // last in the list of FlowFilters.
        repeated string cel_expression = 1;
    }
    // experimental contains filters that are not stable yet. Support for
    // experimental features is always optional and subject to change.
    Experimental experimental = 999;
}

// EventType constants are based on the ones from <linux/perf_event.h>.
enum EventType {
    UNKNOWN = 0;
    // EventSample is equivalent to PERF_RECORD_SAMPLE.
    EventSample = 9;
    // RecordLost is equivalent to PERF_RECORD_LOST.
    RecordLost = 2;
}

// DNS flow. This is basically directly mapped from Cilium's [LogRecordDNS](https://github.com/cilium/cilium/blob/04f3889d627774f79e56d14ddbc165b3169e2d01/pkg/proxy/accesslog/record.go#L264):
message DNS {
    // DNS name that's being looked up: e.g. "isovalent.com."
    string query = 1;
    // List of IP addresses in the DNS response.
    repeated string ips = 2;
    // TTL in the DNS response.
    uint32 ttl = 3;
    // List of CNames in the DNS response.
    repeated string cnames = 4;
    // Corresponds to DNSDataSource defined in:
    // https://github.com/cilium/cilium/blob/04f3889d627774f79e56d14ddbc165b3169e2d01/pkg/proxy/accesslog/record.go#L253
    string observation_source = 5;
    // Return code of the DNS request defined in:
    // https://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml#dns-parameters-6
    uint32 rcode = 6;
    // String representation of qtypes defined in:
    // https://tools.ietf.org/html/rfc1035#section-3.2.3
    repeated string qtypes = 7;
    // String representation of rrtypes defined in:
    // https://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml#dns-parameters-4
    repeated string rrtypes = 8;
}

// HTTPHeader is a single HTTP header expressed as a key/value pair.
message HTTPHeader {
    string key = 1;
    string value = 2;
}

// L7 information for HTTP flows. It corresponds to Cilium's [accesslog.LogRecordHTTP](https://github.com/cilium/cilium/blob/728c79e427438ab6f8d9375b62fccd6fed4ace3a/pkg/proxy/accesslog/record.go#L206) type.
message HTTP {
    uint32 code = 1;
    string method = 2;
    string url = 3;
    string protocol = 4;
    repeated HTTPHeader headers = 5;
}

// L7 information for Kafka flows. It corresponds to Cilium's [accesslog.LogRecordKafka](https://github.com/cilium/cilium/blob/728c79e427438ab6f8d9375b62fccd6fed4ace3a/pkg/proxy/accesslog/record.go#L229) type.
message Kafka {
    int32 error_code = 1;
    int32 api_version = 2;
    string api_key = 3;
    int32 correlation_id = 4;
    string topic = 5;
}

// Service identifies a service by name and namespace.
message Service {
    string name = 1;
    string namespace = 2;
}

// LostEventSource names the location at which events were lost; it is the
// type of LostEvent.source.
enum LostEventSource {
    UNKNOWN_LOST_EVENT_SOURCE = 0;
    // PERF_EVENT_RING_BUFFER indicates that events were dropped in the BPF
    // perf event ring buffer, indicating that userspace agent did not keep up
    // with the events produced by the datapath.
    PERF_EVENT_RING_BUFFER = 1;
    // OBSERVER_EVENTS_QUEUE indicates that events were dropped because the
    // Hubble events queue was full, indicating that the Hubble observer did
    // not keep up.
    OBSERVER_EVENTS_QUEUE = 2;
    // HUBBLE_RING_BUFFER indicates that the event was dropped because it could
    // not be read from Hubble's ring buffer in time before being overwritten.
    HUBBLE_RING_BUFFER = 3;
}

// LostEvent is a message which notifies consumers about a loss of events
// that happened before the events were captured by Hubble.
message LostEvent {
    // source is the location where events got lost.
    LostEventSource source = 1;
    // num_events_lost is the number of events that have been lost at source.
    uint64 num_events_lost = 2;
    // cpu on which the event was lost if the source of lost events is
    // PERF_EVENT_RING_BUFFER.
    google.protobuf.Int32Value cpu = 3;
}

// AgentEventType is the type of agent event. These values are shared with type
// AgentNotification in pkg/monitor/api/types.go.
enum AgentEventType {
    AGENT_EVENT_UNKNOWN = 0;
    // used for AGENT_EVENT_GENERIC in monitor API, but there are currently no
    // such events;
    reserved 1;
    AGENT_STARTED = 2;
    POLICY_UPDATED = 3;
    POLICY_DELETED = 4;
    ENDPOINT_REGENERATE_SUCCESS = 5;
    ENDPOINT_REGENERATE_FAILURE = 6;
    ENDPOINT_CREATED = 7;
    ENDPOINT_DELETED = 8;
    IPCACHE_UPSERTED = 9;
    IPCACHE_DELETED = 10;
    SERVICE_UPSERTED = 11;
    SERVICE_DELETED = 12;
}

// AgentEvent is an agent event of the given type; the notification oneof
// holds the payload, of which at most one member is set.
message AgentEvent {
    AgentEventType type = 1;
    oneof notification {
        AgentEventUnknown unknown = 100;
        // presumably used for AGENT_STARTED -- TODO confirm
        TimeNotification agent_start = 101;
        // used for POLICY_UPDATED and POLICY_DELETED
        PolicyUpdateNotification policy_update = 102;
        // used for ENDPOINT_REGENERATE_SUCCESS and ENDPOINT_REGENERATE_FAILURE
        EndpointRegenNotification endpoint_regenerate = 103;
        // used for ENDPOINT_CREATED and ENDPOINT_DELETED
        EndpointUpdateNotification endpoint_update = 104;
        // used for IPCACHE_UPSERTED and IPCACHE_DELETED
        IPCacheNotification ipcache_update = 105;
        // presumably used for SERVICE_UPSERTED -- TODO confirm
        ServiceUpsertNotification service_upsert = 106;
        // presumably used for SERVICE_DELETED -- TODO confirm
        ServiceDeleteNotification service_delete = 107;
    }
}

// AgentEventUnknown is the payload of AgentEvent.unknown; both the type and
// the notification are carried as strings.
message AgentEventUnknown {
    string type = 1;
    string notification = 2;
}

// TimeNotification is a payload that carries only a timestamp.
message TimeNotification {
    google.protobuf.Timestamp time = 1;
}

// PolicyUpdateNotification is the payload of AgentEvent.policy_update.
message PolicyUpdateNotification {
    repeated string labels = 1;
    uint64 revision = 2;
    int64 rule_count = 3;
}

// EndpointRegenNotification is the payload of AgentEvent.endpoint_regenerate.
message EndpointRegenNotification {
    uint64 id = 1;
    repeated string labels = 2;
    string error = 3;
}

// EndpointUpdateNotification is the payload of AgentEvent.endpoint_update.
message EndpointUpdateNotification {
    uint64 id = 1;
    repeated string labels = 2;
    string error = 3;
    string pod_name = 4;
    string namespace = 5;
}

// IPCacheNotification is the payload of AgentEvent.ipcache_update.
message IPCacheNotification {
    string cidr = 1;
    uint32 identity = 2;
    // wrapper type, so an unset old_identity is distinguishable from 0
    google.protobuf.UInt32Value old_identity = 3;
    string host_ip = 4;
    string old_host_ip = 5;
    uint32 encrypt_key = 6;
    string namespace = 7;
    string pod_name = 8;
}

// ServiceUpsertNotificationAddr is an ip/port pair used for the frontend and
// backend addresses of ServiceUpsertNotification.
message ServiceUpsertNotificationAddr {
    string ip = 1;
    uint32 port = 2;
}

// ServiceUpsertNotification is the payload of AgentEvent.service_upsert.
message ServiceUpsertNotification {
    uint32 id = 1;
    ServiceUpsertNotificationAddr frontend_address = 2;
    repeated ServiceUpsertNotificationAddr backend_addresses = 3;
    string type = 4;
    // deprecated; NOTE(review): ext_traffic_policy / int_traffic_policy look
    // like the replacements -- confirm
    string traffic_policy = 5 [deprecated = true];
    string name = 6;
    string namespace = 7;
    string ext_traffic_policy = 8;
    string int_traffic_policy = 9;
}

// ServiceDeleteNotification is the payload of AgentEvent.service_delete.
message ServiceDeleteNotification {
    uint32 id = 1;
}

// NetworkInterface identifies a network interface by index and name.
message NetworkInterface {
    uint32 index = 1;
    string name = 2;
}

// This mirrors enum xlate_point in bpf/lib/trace_sock.h
enum SocketTranslationPoint {
    SOCK_XLATE_POINT_UNKNOWN = 0;
    SOCK_XLATE_POINT_PRE_DIRECTION_FWD = 1;  // Pre service translation
    SOCK_XLATE_POINT_POST_DIRECTION_FWD = 2; // Post service translation
    SOCK_XLATE_POINT_PRE_DIRECTION_REV = 3;  // Pre reverse service translation
    SOCK_XLATE_POINT_POST_DIRECTION_REV = 4; // Post reverse service translation
}

// DebugEvent is a debug event of the given DebugEventType. hash, arg1..arg3
// and cpu use wrapper types, so unset values are distinguishable from 0.
message DebugEvent {
    DebugEventType type = 1;
    Endpoint source = 2;
    google.protobuf.UInt32Value hash = 3;
    google.protobuf.UInt32Value arg1 = 4;
    google.protobuf.UInt32Value arg2 = 5;
    google.protobuf.UInt32Value arg3 = 6;
    string message = 7;
    google.protobuf.Int32Value cpu = 8;
}

// These values are shared with pkg/monitor/api/datapath_debug.go and bpf/lib/dbg.h.
// DebugEventType is the type carried in DebugEvent.type.
enum DebugEventType {
    DBG_EVENT_UNKNOWN = 0;
    DBG_GENERIC = 1;
    DBG_LOCAL_DELIVERY = 2;
    DBG_ENCAP = 3;
    DBG_LXC_FOUND = 4;
    DBG_POLICY_DENIED = 5;
    DBG_CT_LOOKUP = 6;
    DBG_CT_LOOKUP_REV = 7;
    DBG_CT_MATCH = 8;
    DBG_CT_CREATED = 9;
    DBG_CT_CREATED2 = 10;
    DBG_ICMP6_HANDLE = 11;
    DBG_ICMP6_REQUEST = 12;
    DBG_ICMP6_NS = 13;
    DBG_ICMP6_TIME_EXCEEDED = 14;
    DBG_CT_VERDICT = 15;
    DBG_DECAP = 16;
    DBG_PORT_MAP = 17;
    DBG_ERROR_RET = 18;
    DBG_TO_HOST = 19;
    DBG_TO_STACK = 20;
    DBG_PKT_HASH = 21;
    DBG_LB6_LOOKUP_FRONTEND = 22;
    DBG_LB6_LOOKUP_FRONTEND_FAIL = 23;
    DBG_LB6_LOOKUP_BACKEND_SLOT = 24;
    DBG_LB6_LOOKUP_BACKEND_SLOT_SUCCESS = 25;
    DBG_LB6_LOOKUP_BACKEND_SLOT_V2_FAIL = 26;
    DBG_LB6_LOOKUP_BACKEND_FAIL = 27;
    DBG_LB6_REVERSE_NAT_LOOKUP = 28;
    DBG_LB6_REVERSE_NAT = 29;
    DBG_LB4_LOOKUP_FRONTEND = 30;
    DBG_LB4_LOOKUP_FRONTEND_FAIL = 31;
    DBG_LB4_LOOKUP_BACKEND_SLOT = 32;
    DBG_LB4_LOOKUP_BACKEND_SLOT_SUCCESS = 33;
    DBG_LB4_LOOKUP_BACKEND_SLOT_V2_FAIL = 34;
    DBG_LB4_LOOKUP_BACKEND_FAIL = 35;
    DBG_LB4_REVERSE_NAT_LOOKUP = 36;
    DBG_LB4_REVERSE_NAT = 37;
    DBG_LB4_LOOPBACK_SNAT = 38;
    DBG_LB4_LOOPBACK_SNAT_REV = 39;
    DBG_CT_LOOKUP4 = 40;
    DBG_RR_BACKEND_SLOT_SEL = 41;
    DBG_REV_PROXY_LOOKUP = 42;
    DBG_REV_PROXY_FOUND = 43;
    DBG_REV_PROXY_UPDATE = 44;
    DBG_L4_POLICY = 45;
    DBG_NETDEV_IN_CLUSTER = 46;
    DBG_NETDEV_ENCAP4 = 47;
    DBG_CT_LOOKUP4_1 = 48;
    DBG_CT_LOOKUP4_2 = 49;
    DBG_CT_CREATED4 = 50;
    DBG_CT_LOOKUP6_1 = 51;
    DBG_CT_LOOKUP6_2 = 52;
    DBG_CT_CREATED6 = 53;
    DBG_SKIP_PROXY = 54;
    DBG_L4_CREATE = 55;
    DBG_IP_ID_MAP_FAILED4 = 56;
    DBG_IP_ID_MAP_FAILED6 = 57;
    DBG_IP_ID_MAP_SUCCEED4 = 58;
    DBG_IP_ID_MAP_SUCCEED6 = 59;
    DBG_LB_STALE_CT = 60;
    DBG_INHERIT_IDENTITY = 61;
    DBG_SK_LOOKUP4 = 62;
    DBG_SK_LOOKUP6 = 63;
    DBG_SK_ASSIGN = 64;
    DBG_L7_LB = 65;
    DBG_SKIP_POLICY = 66;
}

================================================
FILE: keep/providers/cilium_provider/grpc/flow/flow_pb2.py
================================================
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE # source: flow/flow.proto # Protobuf Python Version: 5.27.2 """Generated protocol buffer code.""" # from google.protobuf import runtime_version as _runtime_version from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder """ _runtime_version.ValidateProtobufRuntimeVersion( _runtime_version.Domain.PUBLIC, 5, 27, 2, '', 'flow/flow.proto' ) """ # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( b'\n\x0f\x66low/flow.proto\x12\x04\x66low\x1a\x19google/protobuf/any.proto\x1a\x1egoogle/protobuf/wrappers.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x89\x0b\n\x04\x46low\x12(\n\x04time\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04uuid\x18" \x01(\t\x12\x1e\n\x07verdict\x18\x02 \x01(\x0e\x32\r.flow.Verdict\x12\x17\n\x0b\x64rop_reason\x18\x03 \x01(\rB\x02\x18\x01\x12!\n\tauth_type\x18# \x01(\x0e\x32\x0e.flow.AuthType\x12 \n\x08\x65thernet\x18\x04 \x01(\x0b\x32\x0e.flow.Ethernet\x12\x14\n\x02IP\x18\x05 \x01(\x0b\x32\x08.flow.IP\x12\x18\n\x02l4\x18\x06 \x01(\x0b\x32\x0c.flow.Layer4\x12\x1e\n\x06source\x18\x08 \x01(\x0b\x32\x0e.flow.Endpoint\x12#\n\x0b\x64\x65stination\x18\t \x01(\x0b\x32\x0e.flow.Endpoint\x12\x1c\n\x04Type\x18\n \x01(\x0e\x32\x0e.flow.FlowType\x12\x11\n\tnode_name\x18\x0b \x01(\t\x12\x13\n\x0bnode_labels\x18% \x03(\t\x12\x14\n\x0csource_names\x18\r \x03(\t\x12\x19\n\x11\x64\x65stination_names\x18\x0e \x03(\t\x12\x18\n\x02l7\x18\x0f \x01(\x0b\x32\x0c.flow.Layer7\x12\x11\n\x05reply\x18\x10 \x01(\x08\x42\x02\x18\x01\x12)\n\nevent_type\x18\x13 \x01(\x0b\x32\x15.flow.CiliumEventType\x12%\n\x0esource_service\x18\x14 \x01(\x0b\x32\r.flow.Service\x12*\n\x13\x64\x65stination_service\x18\x15 \x01(\x0b\x32\r.flow.Service\x12\x31\n\x11traffic_direction\x18\x16 
\x01(\x0e\x32\x16.flow.TrafficDirection\x12\x19\n\x11policy_match_type\x18\x17 \x01(\r\x12<\n\x17trace_observation_point\x18\x18 \x01(\x0e\x32\x1b.flow.TraceObservationPoint\x12\'\n\x0ctrace_reason\x18$ \x01(\x0e\x32\x11.flow.TraceReason\x12\x1c\n\x04\x66ile\x18& \x01(\x0b\x32\x0e.flow.FileInfo\x12*\n\x10\x64rop_reason_desc\x18\x19 \x01(\x0e\x32\x10.flow.DropReason\x12,\n\x08is_reply\x18\x1a \x01(\x0b\x32\x1a.google.protobuf.BoolValue\x12\x34\n\x13\x64\x65\x62ug_capture_point\x18\x1b \x01(\x0e\x32\x17.flow.DebugCapturePoint\x12)\n\tinterface\x18\x1c \x01(\x0b\x32\x16.flow.NetworkInterface\x12\x12\n\nproxy_port\x18\x1d \x01(\r\x12)\n\rtrace_context\x18\x1e \x01(\x0b\x32\x12.flow.TraceContext\x12\x36\n\x10sock_xlate_point\x18\x1f \x01(\x0e\x32\x1c.flow.SocketTranslationPoint\x12\x15\n\rsocket_cookie\x18 \x01(\x04\x12\x11\n\tcgroup_id\x18! \x01(\x04\x12\x15\n\x07Summary\x18\xa0\x8d\x06 \x01(\tB\x02\x18\x01\x12*\n\nextensions\x18\xf0\x93\t \x01(\x0b\x32\x14.google.protobuf.Any\x12)\n\x11\x65gress_allowed_by\x18\x89\xa4\x01 \x03(\x0b\x32\x0c.flow.Policy\x12*\n\x12ingress_allowed_by\x18\x8a\xa4\x01 \x03(\x0b\x32\x0c.flow.Policy\x12(\n\x10\x65gress_denied_by\x18\x8c\xa4\x01 \x03(\x0b\x32\x0c.flow.Policy\x12)\n\x11ingress_denied_by\x18\x8d\xa4\x01 \x03(\x0b\x32\x0c.flow.PolicyJ\x04\x08\x07\x10\x08J\x04\x08\x0c\x10\rJ\x04\x08\x11\x10\x12J\x04\x08\x12\x10\x13"&\n\x08\x46ileInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04line\x18\x02 \x01(\r"\xa4\x01\n\x06Layer4\x12\x18\n\x03TCP\x18\x01 \x01(\x0b\x32\t.flow.TCPH\x00\x12\x18\n\x03UDP\x18\x02 \x01(\x0b\x32\t.flow.UDPH\x00\x12\x1e\n\x06ICMPv4\x18\x03 \x01(\x0b\x32\x0c.flow.ICMPv4H\x00\x12\x1e\n\x06ICMPv6\x18\x04 \x01(\x0b\x32\x0c.flow.ICMPv6H\x00\x12\x1a\n\x04SCTP\x18\x05 \x01(\x0b\x32\n.flow.SCTPH\x00\x42\n\n\x08protocol"\x9a\x01\n\x06Layer7\x12\x1e\n\x04type\x18\x01 \x01(\x0e\x32\x10.flow.L7FlowType\x12\x12\n\nlatency_ns\x18\x02 \x01(\x04\x12\x18\n\x03\x64ns\x18\x64 \x01(\x0b\x32\t.flow.DNSH\x00\x12\x1a\n\x04http\x18\x65 
\x01(\x0b\x32\n.flow.HTTPH\x00\x12\x1c\n\x05kafka\x18\x66 \x01(\x0b\x32\x0b.flow.KafkaH\x00\x42\x08\n\x06record"1\n\x0cTraceContext\x12!\n\x06parent\x18\x01 \x01(\x0b\x32\x11.flow.TraceParent"\x1f\n\x0bTraceParent\x12\x10\n\x08trace_id\x18\x01 \x01(\t"\x96\x01\n\x08\x45ndpoint\x12\n\n\x02ID\x18\x01 \x01(\r\x12\x10\n\x08identity\x18\x02 \x01(\r\x12\x14\n\x0c\x63luster_name\x18\x07 \x01(\t\x12\x11\n\tnamespace\x18\x03 \x01(\t\x12\x0e\n\x06labels\x18\x04 \x03(\t\x12\x10\n\x08pod_name\x18\x05 \x01(\t\x12!\n\tworkloads\x18\x06 \x03(\x0b\x32\x0e.flow.Workload"&\n\x08Workload\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04kind\x18\x02 \x01(\t"S\n\x03TCP\x12\x13\n\x0bsource_port\x18\x01 \x01(\r\x12\x18\n\x10\x64\x65stination_port\x18\x02 \x01(\r\x12\x1d\n\x05\x66lags\x18\x03 \x01(\x0b\x32\x0e.flow.TCPFlags"w\n\x02IP\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\rsource_xlated\x18\x05 \x01(\t\x12\x13\n\x0b\x64\x65stination\x18\x02 \x01(\t\x12"\n\tipVersion\x18\x03 \x01(\x0e\x32\x0f.flow.IPVersion\x12\x11\n\tencrypted\x18\x04 \x01(\x08"/\n\x08\x45thernet\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65stination\x18\x02 \x01(\t"~\n\x08TCPFlags\x12\x0b\n\x03\x46IN\x18\x01 \x01(\x08\x12\x0b\n\x03SYN\x18\x02 \x01(\x08\x12\x0b\n\x03RST\x18\x03 \x01(\x08\x12\x0b\n\x03PSH\x18\x04 \x01(\x08\x12\x0b\n\x03\x41\x43K\x18\x05 \x01(\x08\x12\x0b\n\x03URG\x18\x06 \x01(\x08\x12\x0b\n\x03\x45\x43\x45\x18\x07 \x01(\x08\x12\x0b\n\x03\x43WR\x18\x08 \x01(\x08\x12\n\n\x02NS\x18\t \x01(\x08"4\n\x03UDP\x12\x13\n\x0bsource_port\x18\x01 \x01(\r\x12\x18\n\x10\x64\x65stination_port\x18\x02 \x01(\r"5\n\x04SCTP\x12\x13\n\x0bsource_port\x18\x01 \x01(\r\x12\x18\n\x10\x64\x65stination_port\x18\x02 \x01(\r"$\n\x06ICMPv4\x12\x0c\n\x04type\x18\x01 \x01(\r\x12\x0c\n\x04\x63ode\x18\x02 \x01(\r"$\n\x06ICMPv6\x12\x0c\n\x04type\x18\x01 \x01(\r\x12\x0c\n\x04\x63ode\x18\x02 \x01(\r"Y\n\x06Policy\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\tnamespace\x18\x02 \x01(\t\x12\x0e\n\x06labels\x18\x03 
\x03(\t\x12\x10\n\x08revision\x18\x04 \x01(\x04\x12\x0c\n\x04kind\x18\x05 \x01(\t"I\n\x0f\x45ventTypeFilter\x12\x0c\n\x04type\x18\x01 \x01(\x05\x12\x16\n\x0ematch_sub_type\x18\x02 \x01(\x08\x12\x10\n\x08sub_type\x18\x03 \x01(\x05"1\n\x0f\x43iliumEventType\x12\x0c\n\x04type\x18\x01 \x01(\x05\x12\x10\n\x08sub_type\x18\x02 \x01(\x05"\xc2\x08\n\nFlowFilter\x12\x0c\n\x04uuid\x18\x1d \x03(\t\x12\x11\n\tsource_ip\x18\x01 \x03(\t\x12\x18\n\x10source_ip_xlated\x18" \x03(\t\x12\x12\n\nsource_pod\x18\x02 \x03(\t\x12\x13\n\x0bsource_fqdn\x18\x07 \x03(\t\x12\x14\n\x0csource_label\x18\n \x03(\t\x12\x16\n\x0esource_service\x18\x10 \x03(\t\x12\'\n\x0fsource_workload\x18\x1a \x03(\x0b\x32\x0e.flow.Workload\x12\x16\n\x0e\x64\x65stination_ip\x18\x03 \x03(\t\x12\x17\n\x0f\x64\x65stination_pod\x18\x04 \x03(\t\x12\x18\n\x10\x64\x65stination_fqdn\x18\x08 \x03(\t\x12\x19\n\x11\x64\x65stination_label\x18\x0b \x03(\t\x12\x1b\n\x13\x64\x65stination_service\x18\x11 \x03(\t\x12,\n\x14\x64\x65stination_workload\x18\x1b \x03(\x0b\x32\x0e.flow.Workload\x12\x31\n\x11traffic_direction\x18\x1e \x03(\x0e\x32\x16.flow.TrafficDirection\x12\x1e\n\x07verdict\x18\x05 \x03(\x0e\x32\r.flow.Verdict\x12*\n\x10\x64rop_reason_desc\x18! 
\x03(\x0e\x32\x10.flow.DropReason\x12)\n\tinterface\x18# \x03(\x0b\x32\x16.flow.NetworkInterface\x12)\n\nevent_type\x18\x06 \x03(\x0b\x32\x15.flow.EventTypeFilter\x12\x18\n\x10http_status_code\x18\t \x03(\t\x12\x10\n\x08protocol\x18\x0c \x03(\t\x12\x13\n\x0bsource_port\x18\r \x03(\t\x12\x18\n\x10\x64\x65stination_port\x18\x0e \x03(\t\x12\r\n\x05reply\x18\x0f \x03(\x08\x12\x11\n\tdns_query\x18\x12 \x03(\t\x12\x17\n\x0fsource_identity\x18\x13 \x03(\r\x12\x1c\n\x14\x64\x65stination_identity\x18\x14 \x03(\r\x12\x13\n\x0bhttp_method\x18\x15 \x03(\t\x12\x11\n\thttp_path\x18\x16 \x03(\t\x12\x10\n\x08http_url\x18\x1f \x03(\t\x12%\n\x0bhttp_header\x18 \x03(\x0b\x32\x10.flow.HTTPHeader\x12!\n\ttcp_flags\x18\x17 \x03(\x0b\x32\x0e.flow.TCPFlags\x12\x11\n\tnode_name\x18\x18 \x03(\t\x12\x13\n\x0bnode_labels\x18$ \x03(\t\x12#\n\nip_version\x18\x19 \x03(\x0e\x32\x0f.flow.IPVersion\x12\x10\n\x08trace_id\x18\x1c \x03(\t\x12\x34\n\x0c\x65xperimental\x18\xe7\x07 \x01(\x0b\x32\x1d.flow.FlowFilter.Experimental\x1a&\n\x0c\x45xperimental\x12\x16\n\x0e\x63\x65l_expression\x18\x01 \x03(\t"\x8a\x01\n\x03\x44NS\x12\r\n\x05query\x18\x01 \x01(\t\x12\x0b\n\x03ips\x18\x02 \x03(\t\x12\x0b\n\x03ttl\x18\x03 \x01(\r\x12\x0e\n\x06\x63names\x18\x04 \x03(\t\x12\x1a\n\x12observation_source\x18\x05 \x01(\t\x12\r\n\x05rcode\x18\x06 \x01(\r\x12\x0e\n\x06qtypes\x18\x07 \x03(\t\x12\x0f\n\x07rrtypes\x18\x08 \x03(\t"(\n\nHTTPHeader\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t"f\n\x04HTTP\x12\x0c\n\x04\x63ode\x18\x01 \x01(\r\x12\x0e\n\x06method\x18\x02 \x01(\t\x12\x0b\n\x03url\x18\x03 \x01(\t\x12\x10\n\x08protocol\x18\x04 \x01(\t\x12!\n\x07headers\x18\x05 \x03(\x0b\x32\x10.flow.HTTPHeader"h\n\x05Kafka\x12\x12\n\nerror_code\x18\x01 \x01(\x05\x12\x13\n\x0b\x61pi_version\x18\x02 \x01(\x05\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0e\x63orrelation_id\x18\x04 \x01(\x05\x12\r\n\x05topic\x18\x05 \x01(\t"*\n\x07Service\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\tnamespace\x18\x02 
\x01(\t"u\n\tLostEvent\x12%\n\x06source\x18\x01 \x01(\x0e\x32\x15.flow.LostEventSource\x12\x17\n\x0fnum_events_lost\x18\x02 \x01(\x04\x12(\n\x03\x63pu\x18\x03 \x01(\x0b\x32\x1b.google.protobuf.Int32Value"\xfc\x03\n\nAgentEvent\x12"\n\x04type\x18\x01 \x01(\x0e\x32\x14.flow.AgentEventType\x12*\n\x07unknown\x18\x64 \x01(\x0b\x32\x17.flow.AgentEventUnknownH\x00\x12-\n\x0b\x61gent_start\x18\x65 \x01(\x0b\x32\x16.flow.TimeNotificationH\x00\x12\x37\n\rpolicy_update\x18\x66 \x01(\x0b\x32\x1e.flow.PolicyUpdateNotificationH\x00\x12>\n\x13\x65ndpoint_regenerate\x18g \x01(\x0b\x32\x1f.flow.EndpointRegenNotificationH\x00\x12;\n\x0f\x65ndpoint_update\x18h \x01(\x0b\x32 .flow.EndpointUpdateNotificationH\x00\x12\x33\n\x0eipcache_update\x18i \x01(\x0b\x32\x19.flow.IPCacheNotificationH\x00\x12\x39\n\x0eservice_upsert\x18j \x01(\x0b\x32\x1f.flow.ServiceUpsertNotificationH\x00\x12\x39\n\x0eservice_delete\x18k \x01(\x0b\x32\x1f.flow.ServiceDeleteNotificationH\x00\x42\x0e\n\x0cnotification"7\n\x11\x41gentEventUnknown\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x14\n\x0cnotification\x18\x02 \x01(\t"<\n\x10TimeNotification\x12(\n\x04time\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"P\n\x18PolicyUpdateNotification\x12\x0e\n\x06labels\x18\x01 \x03(\t\x12\x10\n\x08revision\x18\x02 \x01(\x04\x12\x12\n\nrule_count\x18\x03 \x01(\x03"F\n\x19\x45ndpointRegenNotification\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06labels\x18\x02 \x03(\t\x12\r\n\x05\x65rror\x18\x03 \x01(\t"l\n\x1a\x45ndpointUpdateNotification\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0e\n\x06labels\x18\x02 \x03(\t\x12\r\n\x05\x65rror\x18\x03 \x01(\t\x12\x10\n\x08pod_name\x18\x04 \x01(\t\x12\x11\n\tnamespace\x18\x05 \x01(\t"\xc9\x01\n\x13IPCacheNotification\x12\x0c\n\x04\x63idr\x18\x01 \x01(\t\x12\x10\n\x08identity\x18\x02 \x01(\r\x12\x32\n\x0cold_identity\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12\x0f\n\x07host_ip\x18\x04 \x01(\t\x12\x13\n\x0bold_host_ip\x18\x05 \x01(\t\x12\x13\n\x0b\x65ncrypt_key\x18\x06 
\x01(\r\x12\x11\n\tnamespace\x18\x07 \x01(\t\x12\x10\n\x08pod_name\x18\x08 \x01(\t"9\n\x1dServiceUpsertNotificationAddr\x12\n\n\x02ip\x18\x01 \x01(\t\x12\x0c\n\x04port\x18\x02 \x01(\r"\xa9\x02\n\x19ServiceUpsertNotification\x12\n\n\x02id\x18\x01 \x01(\r\x12=\n\x10\x66rontend_address\x18\x02 \x01(\x0b\x32#.flow.ServiceUpsertNotificationAddr\x12>\n\x11\x62\x61\x63kend_addresses\x18\x03 \x03(\x0b\x32#.flow.ServiceUpsertNotificationAddr\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x1a\n\x0etraffic_policy\x18\x05 \x01(\tB\x02\x18\x01\x12\x0c\n\x04name\x18\x06 \x01(\t\x12\x11\n\tnamespace\x18\x07 \x01(\t\x12\x1a\n\x12\x65xt_traffic_policy\x18\x08 \x01(\t\x12\x1a\n\x12int_traffic_policy\x18\t \x01(\t"\'\n\x19ServiceDeleteNotification\x12\n\n\x02id\x18\x01 \x01(\r"/\n\x10NetworkInterface\x12\r\n\x05index\x18\x01 \x01(\r\x12\x0c\n\x04name\x18\x02 \x01(\t"\xbb\x02\n\nDebugEvent\x12"\n\x04type\x18\x01 \x01(\x0e\x32\x14.flow.DebugEventType\x12\x1e\n\x06source\x18\x02 \x01(\x0b\x32\x0e.flow.Endpoint\x12*\n\x04hash\x18\x03 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12*\n\x04\x61rg1\x18\x04 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12*\n\x04\x61rg2\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12*\n\x04\x61rg3\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12\x0f\n\x07message\x18\x07 \x01(\t\x12(\n\x03\x63pu\x18\x08 
\x01(\x0b\x32\x1b.google.protobuf.Int32Value*9\n\x08\x46lowType\x12\x10\n\x0cUNKNOWN_TYPE\x10\x00\x12\t\n\x05L3_L4\x10\x01\x12\x06\n\x02L7\x10\x02\x12\x08\n\x04SOCK\x10\x03*9\n\x08\x41uthType\x12\x0c\n\x08\x44ISABLED\x10\x00\x12\t\n\x05SPIRE\x10\x01\x12\x14\n\x10TEST_ALWAYS_FAIL\x10\x02*\xea\x01\n\x15TraceObservationPoint\x12\x11\n\rUNKNOWN_POINT\x10\x00\x12\x0c\n\x08TO_PROXY\x10\x01\x12\x0b\n\x07TO_HOST\x10\x02\x12\x0c\n\x08TO_STACK\x10\x03\x12\x0e\n\nTO_OVERLAY\x10\x04\x12\x0f\n\x0bTO_ENDPOINT\x10\x65\x12\x11\n\rFROM_ENDPOINT\x10\x05\x12\x0e\n\nFROM_PROXY\x10\x06\x12\r\n\tFROM_HOST\x10\x07\x12\x0e\n\nFROM_STACK\x10\x08\x12\x10\n\x0c\x46ROM_OVERLAY\x10\t\x12\x10\n\x0c\x46ROM_NETWORK\x10\n\x12\x0e\n\nTO_NETWORK\x10\x0b*\xa0\x01\n\x0bTraceReason\x12\x18\n\x14TRACE_REASON_UNKNOWN\x10\x00\x12\x07\n\x03NEW\x10\x01\x12\x0f\n\x0b\x45STABLISHED\x10\x02\x12\t\n\x05REPLY\x10\x03\x12\x0b\n\x07RELATED\x10\x04\x12\x10\n\x08REOPENED\x10\x05\x1a\x02\x08\x01\x12\x0e\n\nSRV6_ENCAP\x10\x06\x12\x0e\n\nSRV6_DECAP\x10\x07\x12\x13\n\x0f\x45NCRYPT_OVERLAY\x10\x08*H\n\nL7FlowType\x12\x13\n\x0fUNKNOWN_L7_TYPE\x10\x00\x12\x0b\n\x07REQUEST\x10\x01\x12\x0c\n\x08RESPONSE\x10\x02\x12\n\n\x06SAMPLE\x10\x03*0\n\tIPVersion\x12\x0f\n\x0bIP_NOT_USED\x10\x00\x12\x08\n\x04IPv4\x10\x01\x12\x08\n\x04IPv6\x10\x02*|\n\x07Verdict\x12\x13\n\x0fVERDICT_UNKNOWN\x10\x00\x12\r\n\tFORWARDED\x10\x01\x12\x0b\n\x07\x44ROPPED\x10\x02\x12\t\n\x05\x45RROR\x10\x03\x12\t\n\x05\x41UDIT\x10\x04\x12\x0e\n\nREDIRECTED\x10\x05\x12\n\n\x06TRACED\x10\x06\x12\x0e\n\nTRANSLATED\x10\x07*\xaf\x11\n\nDropReason\x12\x17\n\x13\x44ROP_REASON_UNKNOWN\x10\x00\x12\x1b\n\x12INVALID_SOURCE_MAC\x10\x82\x01\x1a\x02\x08\x01\x12 
\n\x17INVALID_DESTINATION_MAC\x10\x83\x01\x1a\x02\x08\x01\x12\x16\n\x11INVALID_SOURCE_IP\x10\x84\x01\x12\x12\n\rPOLICY_DENIED\x10\x85\x01\x12\x1b\n\x16INVALID_PACKET_DROPPED\x10\x86\x01\x12#\n\x1e\x43T_TRUNCATED_OR_INVALID_HEADER\x10\x87\x01\x12\x1c\n\x17\x43T_MISSING_TCP_ACK_FLAG\x10\x88\x01\x12\x1b\n\x16\x43T_UNKNOWN_L4_PROTOCOL\x10\x89\x01\x12+\n"CT_CANNOT_CREATE_ENTRY_FROM_PACKET\x10\x8a\x01\x1a\x02\x08\x01\x12\x1c\n\x17UNSUPPORTED_L3_PROTOCOL\x10\x8b\x01\x12\x15\n\x10MISSED_TAIL_CALL\x10\x8c\x01\x12\x1c\n\x17\x45RROR_WRITING_TO_PACKET\x10\x8d\x01\x12\x18\n\x13UNKNOWN_L4_PROTOCOL\x10\x8e\x01\x12\x18\n\x13UNKNOWN_ICMPV4_CODE\x10\x8f\x01\x12\x18\n\x13UNKNOWN_ICMPV4_TYPE\x10\x90\x01\x12\x18\n\x13UNKNOWN_ICMPV6_CODE\x10\x91\x01\x12\x18\n\x13UNKNOWN_ICMPV6_TYPE\x10\x92\x01\x12 \n\x1b\x45RROR_RETRIEVING_TUNNEL_KEY\x10\x93\x01\x12(\n\x1f\x45RROR_RETRIEVING_TUNNEL_OPTIONS\x10\x94\x01\x1a\x02\x08\x01\x12\x1e\n\x15INVALID_GENEVE_OPTION\x10\x95\x01\x1a\x02\x08\x01\x12\x1e\n\x19UNKNOWN_L3_TARGET_ADDRESS\x10\x96\x01\x12\x1b\n\x16STALE_OR_UNROUTABLE_IP\x10\x97\x01\x12*\n!NO_MATCHING_LOCAL_CONTAINER_FOUND\x10\x98\x01\x1a\x02\x08\x01\x12\'\n"ERROR_WHILE_CORRECTING_L3_CHECKSUM\x10\x99\x01\x12\'\n"ERROR_WHILE_CORRECTING_L4_CHECKSUM\x10\x9a\x01\x12\x1c\n\x17\x43T_MAP_INSERTION_FAILED\x10\x9b\x01\x12"\n\x1dINVALID_IPV6_EXTENSION_HEADER\x10\x9c\x01\x12#\n\x1eIP_FRAGMENTATION_NOT_SUPPORTED\x10\x9d\x01\x12\x1e\n\x19SERVICE_BACKEND_NOT_FOUND\x10\x9e\x01\x12(\n#NO_TUNNEL_OR_ENCAPSULATION_ENDPOINT\x10\xa0\x01\x12#\n\x1e\x46\x41ILED_TO_INSERT_INTO_PROXYMAP\x10\xa1\x01\x12+\n&REACHED_EDT_RATE_LIMITING_DROP_HORIZON\x10\xa2\x01\x12&\n!UNKNOWN_CONNECTION_TRACKING_STATE\x10\xa3\x01\x12\x1e\n\x19LOCAL_HOST_IS_UNREACHABLE\x10\xa4\x01\x12:\n5NO_CONFIGURATION_AVAILABLE_TO_PERFORM_POLICY_DECISION\x10\xa5\x01\x12\x1c\n\x17UNSUPPORTED_L2_PROTOCOL\x10\xa6\x01\x12"\n\x1dNO_MAPPING_FOR_NAT_MASQUERADE\x10\xa7\x01\x12,\n\'UNSUPPORTED_PROTOCOL_FOR_NAT_MASQUERADE\x10\xa8\x01\x12\x16\n\x11\x46IB_LOOKUP_FAILE
D\x10\xa9\x01\x12(\n#ENCAPSULATION_TRAFFIC_IS_PROHIBITED\x10\xaa\x01\x12\x15\n\x10INVALID_IDENTITY\x10\xab\x01\x12\x13\n\x0eUNKNOWN_SENDER\x10\xac\x01\x12\x13\n\x0eNAT_NOT_NEEDED\x10\xad\x01\x12\x13\n\x0eIS_A_CLUSTERIP\x10\xae\x01\x12.\n)FIRST_LOGICAL_DATAGRAM_FRAGMENT_NOT_FOUND\x10\xaf\x01\x12\x1d\n\x18\x46ORBIDDEN_ICMPV6_MESSAGE\x10\xb0\x01\x12!\n\x1c\x44\x45NIED_BY_LB_SRC_RANGE_CHECK\x10\xb1\x01\x12\x19\n\x14SOCKET_LOOKUP_FAILED\x10\xb2\x01\x12\x19\n\x14SOCKET_ASSIGN_FAILED\x10\xb3\x01\x12\x31\n,PROXY_REDIRECTION_NOT_SUPPORTED_FOR_PROTOCOL\x10\xb4\x01\x12\x10\n\x0bPOLICY_DENY\x10\xb5\x01\x12\x12\n\rVLAN_FILTERED\x10\xb6\x01\x12\x10\n\x0bINVALID_VNI\x10\xb7\x01\x12\x16\n\x11INVALID_TC_BUFFER\x10\xb8\x01\x12\x0b\n\x06NO_SID\x10\xb9\x01\x12\x1b\n\x12MISSING_SRV6_STATE\x10\xba\x01\x1a\x02\x08\x01\x12\n\n\x05NAT46\x10\xbb\x01\x12\n\n\x05NAT64\x10\xbc\x01\x12\x12\n\rAUTH_REQUIRED\x10\xbd\x01\x12\x14\n\x0f\x43T_NO_MAP_FOUND\x10\xbe\x01\x12\x16\n\x11SNAT_NO_MAP_FOUND\x10\xbf\x01\x12\x17\n\x12INVALID_CLUSTER_ID\x10\xc0\x01\x12\'\n"UNSUPPORTED_PROTOCOL_FOR_DSR_ENCAP\x10\xc1\x01\x12\x16\n\x11NO_EGRESS_GATEWAY\x10\xc2\x01\x12\x18\n\x13UNENCRYPTED_TRAFFIC\x10\xc3\x01\x12\x11\n\x0cTTL_EXCEEDED\x10\xc4\x01\x12\x0f\n\nNO_NODE_ID\x10\xc5\x01\x12\x16\n\x11\x44ROP_RATE_LIMITED\x10\xc6\x01\x12\x11\n\x0cIGMP_HANDLED\x10\xc7\x01\x12\x14\n\x0fIGMP_SUBSCRIBED\x10\xc8\x01\x12\x16\n\x11MULTICAST_HANDLED\x10\xc9\x01\x12\x18\n\x13\x44ROP_HOST_NOT_READY\x10\xca\x01\x12\x16\n\x11\x44ROP_EP_NOT_READY\x10\xcb\x01\x12\x16\n\x11\x44ROP_NO_EGRESS_IP\x10\xcc\x01*J\n\x10TrafficDirection\x12\x1d\n\x19TRAFFIC_DIRECTION_UNKNOWN\x10\x00\x12\x0b\n\x07INGRESS\x10\x01\x12\n\n\x06\x45GRESS\x10\x02*\x8d\x02\n\x11\x44\x65\x62ugCapturePoint\x12\x1d\n\x19\x44\x42G_CAPTURE_POINT_UNKNOWN\x10\x00\x12\x18\n\x14\x44\x42G_CAPTURE_DELIVERY\x10\x04\x12\x17\n\x13\x44\x42G_CAPTURE_FROM_LB\x10\x05\x12\x19\n\x15\x44\x42G_CAPTURE_AFTER_V46\x10\x06\x12\x19\n\x15\x44\x42G_CAPTURE_AFTER_V64\x10\x07\x12\x19\n\x15\x44\x42G_CAPTU
RE_PROXY_PRE\x10\x08\x12\x1a\n\x16\x44\x42G_CAPTURE_PROXY_POST\x10\t\x12\x18\n\x14\x44\x42G_CAPTURE_SNAT_PRE\x10\n\x12\x19\n\x15\x44\x42G_CAPTURE_SNAT_POST\x10\x0b"\x04\x08\x01\x10\x03*9\n\tEventType\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x0f\n\x0b\x45ventSample\x10\t\x12\x0e\n\nRecordLost\x10\x02*\x7f\n\x0fLostEventSource\x12\x1d\n\x19UNKNOWN_LOST_EVENT_SOURCE\x10\x00\x12\x1a\n\x16PERF_EVENT_RING_BUFFER\x10\x01\x12\x19\n\x15OBSERVER_EVENTS_QUEUE\x10\x02\x12\x16\n\x12HUBBLE_RING_BUFFER\x10\x03*\xae\x02\n\x0e\x41gentEventType\x12\x17\n\x13\x41GENT_EVENT_UNKNOWN\x10\x00\x12\x11\n\rAGENT_STARTED\x10\x02\x12\x12\n\x0ePOLICY_UPDATED\x10\x03\x12\x12\n\x0ePOLICY_DELETED\x10\x04\x12\x1f\n\x1b\x45NDPOINT_REGENERATE_SUCCESS\x10\x05\x12\x1f\n\x1b\x45NDPOINT_REGENERATE_FAILURE\x10\x06\x12\x14\n\x10\x45NDPOINT_CREATED\x10\x07\x12\x14\n\x10\x45NDPOINT_DELETED\x10\x08\x12\x14\n\x10IPCACHE_UPSERTED\x10\t\x12\x13\n\x0fIPCACHE_DELETED\x10\n\x12\x14\n\x10SERVICE_UPSERTED\x10\x0b\x12\x13\n\x0fSERVICE_DELETED\x10\x0c"\x04\x08\x01\x10\x01*\xd8\x01\n\x16SocketTranslationPoint\x12\x1c\n\x18SOCK_XLATE_POINT_UNKNOWN\x10\x00\x12&\n"SOCK_XLATE_POINT_PRE_DIRECTION_FWD\x10\x01\x12\'\n#SOCK_XLATE_POINT_POST_DIRECTION_FWD\x10\x02\x12&\n"SOCK_XLATE_POINT_PRE_DIRECTION_REV\x10\x03\x12\'\n#SOCK_XLATE_POINT_POST_DIRECTION_REV\x10\x04*\x81\r\n\x0e\x44\x65\x62ugEventType\x12\x15\n\x11\x44\x42G_EVENT_UNKNOWN\x10\x00\x12\x0f\n\x0b\x44\x42G_GENERIC\x10\x01\x12\x16\n\x12\x44\x42G_LOCAL_DELIVERY\x10\x02\x12\r\n\tDBG_ENCAP\x10\x03\x12\x11\n\rDBG_LXC_FOUND\x10\x04\x12\x15\n\x11\x44\x42G_POLICY_DENIED\x10\x05\x12\x11\n\rDBG_CT_LOOKUP\x10\x06\x12\x15\n\x11\x44\x42G_CT_LOOKUP_REV\x10\x07\x12\x10\n\x0c\x44\x42G_CT_MATCH\x10\x08\x12\x12\n\x0e\x44\x42G_CT_CREATED\x10\t\x12\x13\n\x0f\x44\x42G_CT_CREATED2\x10\n\x12\x14\n\x10\x44\x42G_ICMP6_HANDLE\x10\x0b\x12\x15\n\x11\x44\x42G_ICMP6_REQUEST\x10\x0c\x12\x10\n\x0c\x44\x42G_ICMP6_NS\x10\r\x12\x1b\n\x17\x44\x42G_ICMP6_TIME_EXCEEDED\x10\x0e\x12\x12\n\x0e\x44\x42G_CT_VERDICT\x10\
x0f\x12\r\n\tDBG_DECAP\x10\x10\x12\x10\n\x0c\x44\x42G_PORT_MAP\x10\x11\x12\x11\n\rDBG_ERROR_RET\x10\x12\x12\x0f\n\x0b\x44\x42G_TO_HOST\x10\x13\x12\x10\n\x0c\x44\x42G_TO_STACK\x10\x14\x12\x10\n\x0c\x44\x42G_PKT_HASH\x10\x15\x12\x1b\n\x17\x44\x42G_LB6_LOOKUP_FRONTEND\x10\x16\x12 \n\x1c\x44\x42G_LB6_LOOKUP_FRONTEND_FAIL\x10\x17\x12\x1f\n\x1b\x44\x42G_LB6_LOOKUP_BACKEND_SLOT\x10\x18\x12\'\n#DBG_LB6_LOOKUP_BACKEND_SLOT_SUCCESS\x10\x19\x12\'\n#DBG_LB6_LOOKUP_BACKEND_SLOT_V2_FAIL\x10\x1a\x12\x1f\n\x1b\x44\x42G_LB6_LOOKUP_BACKEND_FAIL\x10\x1b\x12\x1e\n\x1a\x44\x42G_LB6_REVERSE_NAT_LOOKUP\x10\x1c\x12\x17\n\x13\x44\x42G_LB6_REVERSE_NAT\x10\x1d\x12\x1b\n\x17\x44\x42G_LB4_LOOKUP_FRONTEND\x10\x1e\x12 \n\x1c\x44\x42G_LB4_LOOKUP_FRONTEND_FAIL\x10\x1f\x12\x1f\n\x1b\x44\x42G_LB4_LOOKUP_BACKEND_SLOT\x10 \x12\'\n#DBG_LB4_LOOKUP_BACKEND_SLOT_SUCCESS\x10!\x12\'\n#DBG_LB4_LOOKUP_BACKEND_SLOT_V2_FAIL\x10"\x12\x1f\n\x1b\x44\x42G_LB4_LOOKUP_BACKEND_FAIL\x10#\x12\x1e\n\x1a\x44\x42G_LB4_REVERSE_NAT_LOOKUP\x10$\x12\x17\n\x13\x44\x42G_LB4_REVERSE_NAT\x10%\x12\x19\n\x15\x44\x42G_LB4_LOOPBACK_SNAT\x10&\x12\x1d\n\x19\x44\x42G_LB4_LOOPBACK_SNAT_REV\x10\'\x12\x12\n\x0e\x44\x42G_CT_LOOKUP4\x10(\x12\x1b\n\x17\x44\x42G_RR_BACKEND_SLOT_SEL\x10)\x12\x18\n\x14\x44\x42G_REV_PROXY_LOOKUP\x10*\x12\x17\n\x13\x44\x42G_REV_PROXY_FOUND\x10+\x12\x18\n\x14\x44\x42G_REV_PROXY_UPDATE\x10,\x12\x11\n\rDBG_L4_POLICY\x10-\x12\x19\n\x15\x44\x42G_NETDEV_IN_CLUSTER\x10.\x12\x15\n\x11\x44\x42G_NETDEV_ENCAP4\x10/\x12\x14\n\x10\x44\x42G_CT_LOOKUP4_1\x10\x30\x12\x14\n\x10\x44\x42G_CT_LOOKUP4_2\x10\x31\x12\x13\n\x0f\x44\x42G_CT_CREATED4\x10\x32\x12\x14\n\x10\x44\x42G_CT_LOOKUP6_1\x10\x33\x12\x14\n\x10\x44\x42G_CT_LOOKUP6_2\x10\x34\x12\x13\n\x0f\x44\x42G_CT_CREATED6\x10\x35\x12\x12\n\x0e\x44\x42G_SKIP_PROXY\x10\x36\x12\x11\n\rDBG_L4_CREATE\x10\x37\x12\x19\n\x15\x44\x42G_IP_ID_MAP_FAILED4\x10\x38\x12\x19\n\x15\x44\x42G_IP_ID_MAP_FAILED6\x10\x39\x12\x1a\n\x16\x44\x42G_IP_ID_MAP_SUCCEED4\x10:\x12\x1a\n\x16\x44\x42G_IP_ID_MAP_SUCCEE
D6\x10;\x12\x13\n\x0f\x44\x42G_LB_STALE_CT\x10<\x12\x18\n\x14\x44\x42G_INHERIT_IDENTITY\x10=\x12\x12\n\x0e\x44\x42G_SK_LOOKUP4\x10>\x12\x12\n\x0e\x44\x42G_SK_LOOKUP6\x10?\x12\x11\n\rDBG_SK_ASSIGN\x10@\x12\r\n\tDBG_L7_LB\x10\x41\x12\x13\n\x0f\x44\x42G_SKIP_POLICY\x10\x42\x42&Z$github.com/cilium/cilium/api/v1/flowb\x06proto3' ) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "flow.flow_pb2", _globals) if not _descriptor._USE_C_DESCRIPTORS: _globals["DESCRIPTOR"]._loaded_options = None _globals["DESCRIPTOR"]._serialized_options = ( b"Z$github.com/cilium/cilium/api/v1/flow" ) _globals["_TRACEREASON"].values_by_name["REOPENED"]._loaded_options = None _globals["_TRACEREASON"].values_by_name[ "REOPENED" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name["INVALID_SOURCE_MAC"]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "INVALID_SOURCE_MAC" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name[ "INVALID_DESTINATION_MAC" ]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "INVALID_DESTINATION_MAC" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name[ "CT_CANNOT_CREATE_ENTRY_FROM_PACKET" ]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "CT_CANNOT_CREATE_ENTRY_FROM_PACKET" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name[ "ERROR_RETRIEVING_TUNNEL_OPTIONS" ]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "ERROR_RETRIEVING_TUNNEL_OPTIONS" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name[ "INVALID_GENEVE_OPTION" ]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "INVALID_GENEVE_OPTION" ]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name[ "NO_MATCHING_LOCAL_CONTAINER_FOUND" ]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "NO_MATCHING_LOCAL_CONTAINER_FOUND" 
]._serialized_options = b"\010\001" _globals["_DROPREASON"].values_by_name["MISSING_SRV6_STATE"]._loaded_options = None _globals["_DROPREASON"].values_by_name[ "MISSING_SRV6_STATE" ]._serialized_options = b"\010\001" _globals["_FLOW"].fields_by_name["drop_reason"]._loaded_options = None _globals["_FLOW"].fields_by_name["drop_reason"]._serialized_options = b"\030\001" _globals["_FLOW"].fields_by_name["reply"]._loaded_options = None _globals["_FLOW"].fields_by_name["reply"]._serialized_options = b"\030\001" _globals["_FLOW"].fields_by_name["Summary"]._loaded_options = None _globals["_FLOW"].fields_by_name["Summary"]._serialized_options = b"\030\001" _globals["_SERVICEUPSERTNOTIFICATION"].fields_by_name[ "traffic_policy" ]._loaded_options = None _globals["_SERVICEUPSERTNOTIFICATION"].fields_by_name[ "traffic_policy" ]._serialized_options = b"\030\001" _globals["_FLOWTYPE"]._serialized_start = 6477 _globals["_FLOWTYPE"]._serialized_end = 6534 _globals["_AUTHTYPE"]._serialized_start = 6536 _globals["_AUTHTYPE"]._serialized_end = 6593 _globals["_TRACEOBSERVATIONPOINT"]._serialized_start = 6596 _globals["_TRACEOBSERVATIONPOINT"]._serialized_end = 6830 _globals["_TRACEREASON"]._serialized_start = 6833 _globals["_TRACEREASON"]._serialized_end = 6993 _globals["_L7FLOWTYPE"]._serialized_start = 6995 _globals["_L7FLOWTYPE"]._serialized_end = 7067 _globals["_IPVERSION"]._serialized_start = 7069 _globals["_IPVERSION"]._serialized_end = 7117 _globals["_VERDICT"]._serialized_start = 7119 _globals["_VERDICT"]._serialized_end = 7243 _globals["_DROPREASON"]._serialized_start = 7246 _globals["_DROPREASON"]._serialized_end = 9469 _globals["_TRAFFICDIRECTION"]._serialized_start = 9471 _globals["_TRAFFICDIRECTION"]._serialized_end = 9545 _globals["_DEBUGCAPTUREPOINT"]._serialized_start = 9548 _globals["_DEBUGCAPTUREPOINT"]._serialized_end = 9817 _globals["_EVENTTYPE"]._serialized_start = 9819 _globals["_EVENTTYPE"]._serialized_end = 9876 _globals["_LOSTEVENTSOURCE"]._serialized_start = 
9878 _globals["_LOSTEVENTSOURCE"]._serialized_end = 10005 _globals["_AGENTEVENTTYPE"]._serialized_start = 10008 _globals["_AGENTEVENTTYPE"]._serialized_end = 10310 _globals["_SOCKETTRANSLATIONPOINT"]._serialized_start = 10313 _globals["_SOCKETTRANSLATIONPOINT"]._serialized_end = 10529 _globals["_DEBUGEVENTTYPE"]._serialized_start = 10532 _globals["_DEBUGEVENTTYPE"]._serialized_end = 12197 _globals["_FLOW"]._serialized_start = 118 _globals["_FLOW"]._serialized_end = 1535 _globals["_FILEINFO"]._serialized_start = 1537 _globals["_FILEINFO"]._serialized_end = 1575 _globals["_LAYER4"]._serialized_start = 1578 _globals["_LAYER4"]._serialized_end = 1742 _globals["_LAYER7"]._serialized_start = 1745 _globals["_LAYER7"]._serialized_end = 1899 _globals["_TRACECONTEXT"]._serialized_start = 1901 _globals["_TRACECONTEXT"]._serialized_end = 1950 _globals["_TRACEPARENT"]._serialized_start = 1952 _globals["_TRACEPARENT"]._serialized_end = 1983 _globals["_ENDPOINT"]._serialized_start = 1986 _globals["_ENDPOINT"]._serialized_end = 2136 _globals["_WORKLOAD"]._serialized_start = 2138 _globals["_WORKLOAD"]._serialized_end = 2176 _globals["_TCP"]._serialized_start = 2178 _globals["_TCP"]._serialized_end = 2261 _globals["_IP"]._serialized_start = 2263 _globals["_IP"]._serialized_end = 2382 _globals["_ETHERNET"]._serialized_start = 2384 _globals["_ETHERNET"]._serialized_end = 2431 _globals["_TCPFLAGS"]._serialized_start = 2433 _globals["_TCPFLAGS"]._serialized_end = 2559 _globals["_UDP"]._serialized_start = 2561 _globals["_UDP"]._serialized_end = 2613 _globals["_SCTP"]._serialized_start = 2615 _globals["_SCTP"]._serialized_end = 2668 _globals["_ICMPV4"]._serialized_start = 2670 _globals["_ICMPV4"]._serialized_end = 2706 _globals["_ICMPV6"]._serialized_start = 2708 _globals["_ICMPV6"]._serialized_end = 2744 _globals["_POLICY"]._serialized_start = 2746 _globals["_POLICY"]._serialized_end = 2835 _globals["_EVENTTYPEFILTER"]._serialized_start = 2837 _globals["_EVENTTYPEFILTER"]._serialized_end 
= 2910 _globals["_CILIUMEVENTTYPE"]._serialized_start = 2912 _globals["_CILIUMEVENTTYPE"]._serialized_end = 2961 _globals["_FLOWFILTER"]._serialized_start = 2964 _globals["_FLOWFILTER"]._serialized_end = 4054 _globals["_FLOWFILTER_EXPERIMENTAL"]._serialized_start = 4016 _globals["_FLOWFILTER_EXPERIMENTAL"]._serialized_end = 4054 _globals["_DNS"]._serialized_start = 4057 _globals["_DNS"]._serialized_end = 4195 _globals["_HTTPHEADER"]._serialized_start = 4197 _globals["_HTTPHEADER"]._serialized_end = 4237 _globals["_HTTP"]._serialized_start = 4239 _globals["_HTTP"]._serialized_end = 4341 _globals["_KAFKA"]._serialized_start = 4343 _globals["_KAFKA"]._serialized_end = 4447 _globals["_SERVICE"]._serialized_start = 4449 _globals["_SERVICE"]._serialized_end = 4491 _globals["_LOSTEVENT"]._serialized_start = 4493 _globals["_LOSTEVENT"]._serialized_end = 4610 _globals["_AGENTEVENT"]._serialized_start = 4613 _globals["_AGENTEVENT"]._serialized_end = 5121 _globals["_AGENTEVENTUNKNOWN"]._serialized_start = 5123 _globals["_AGENTEVENTUNKNOWN"]._serialized_end = 5178 _globals["_TIMENOTIFICATION"]._serialized_start = 5180 _globals["_TIMENOTIFICATION"]._serialized_end = 5240 _globals["_POLICYUPDATENOTIFICATION"]._serialized_start = 5242 _globals["_POLICYUPDATENOTIFICATION"]._serialized_end = 5322 _globals["_ENDPOINTREGENNOTIFICATION"]._serialized_start = 5324 _globals["_ENDPOINTREGENNOTIFICATION"]._serialized_end = 5394 _globals["_ENDPOINTUPDATENOTIFICATION"]._serialized_start = 5396 _globals["_ENDPOINTUPDATENOTIFICATION"]._serialized_end = 5504 _globals["_IPCACHENOTIFICATION"]._serialized_start = 5507 _globals["_IPCACHENOTIFICATION"]._serialized_end = 5708 _globals["_SERVICEUPSERTNOTIFICATIONADDR"]._serialized_start = 5710 _globals["_SERVICEUPSERTNOTIFICATIONADDR"]._serialized_end = 5767 _globals["_SERVICEUPSERTNOTIFICATION"]._serialized_start = 5770 _globals["_SERVICEUPSERTNOTIFICATION"]._serialized_end = 6067 _globals["_SERVICEDELETENOTIFICATION"]._serialized_start = 6069 
_globals["_SERVICEDELETENOTIFICATION"]._serialized_end = 6108 _globals["_NETWORKINTERFACE"]._serialized_start = 6110 _globals["_NETWORKINTERFACE"]._serialized_end = 6157 _globals["_DEBUGEVENT"]._serialized_start = 6160 _globals["_DEBUGEVENT"]._serialized_end = 6475 # @@protoc_insertion_point(module_scope) ================================================ FILE: keep/providers/cilium_provider/grpc/flow/flow_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc GRPC_GENERATED_VERSION = "1.67.1" GRPC_VERSION = grpc.__version__ _version_not_supported = False try: from grpc._utilities import first_version_is_lower _version_not_supported = first_version_is_lower( GRPC_VERSION, GRPC_GENERATED_VERSION ) except ImportError: _version_not_supported = True # Shahar: commented out the following code """ if _version_not_supported: raise RuntimeError( f"The grpc package installed is at version {GRPC_VERSION}," + " but the generated code in flow/flow_pb2_grpc.py depends on" + f" grpcio>={GRPC_GENERATED_VERSION}." + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." ) """ ================================================ FILE: keep/providers/cilium_provider/grpc/google/protobuf/duration.proto ================================================ // Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. syntax = "proto3"; package google.protobuf; option cc_enable_arenas = true; option go_package = "google.golang.org/protobuf/types/known/durationpb"; option java_package = "com.google.protobuf"; option java_outer_classname = "DurationProto"; option java_multiple_files = true; option objc_class_prefix = "GPB"; option csharp_namespace = "Google.Protobuf.WellKnownTypes"; // A Duration represents a signed, fixed-length span of time represented // as a count of seconds and fractions of seconds at nanosecond // resolution. It is independent of any calendar and concepts like "day" // or "month". It is related to Timestamp in that the difference between // two Timestamp values is a Duration and it can be added or subtracted // from a Timestamp. Range is approximately +-10,000 years. 
// // # Examples // // Example 1: Compute Duration from two Timestamps in pseudo code. // // Timestamp start = ...; // Timestamp end = ...; // Duration duration = ...; // // duration.seconds = end.seconds - start.seconds; // duration.nanos = end.nanos - start.nanos; // // if (duration.seconds < 0 && duration.nanos > 0) { // duration.seconds += 1; // duration.nanos -= 1000000000; // } else if (duration.seconds > 0 && duration.nanos < 0) { // duration.seconds -= 1; // duration.nanos += 1000000000; // } // // Example 2: Compute Timestamp from Timestamp + Duration in pseudo code. // // Timestamp start = ...; // Duration duration = ...; // Timestamp end = ...; // // end.seconds = start.seconds + duration.seconds; // end.nanos = start.nanos + duration.nanos; // // if (end.nanos < 0) { // end.seconds -= 1; // end.nanos += 1000000000; // } else if (end.nanos >= 1000000000) { // end.seconds += 1; // end.nanos -= 1000000000; // } // // Example 3: Compute Duration from datetime.timedelta in Python. // // td = datetime.timedelta(days=3, minutes=10) // duration = Duration() // duration.FromTimedelta(td) // // # JSON Mapping // // In JSON format, the Duration type is encoded as a string rather than an // object, where the string ends in the suffix "s" (indicating seconds) and // is preceded by the number of seconds, with nanoseconds expressed as // fractional seconds. For example, 3 seconds with 0 nanoseconds should be // encoded in JSON format as "3s", while 3 seconds and 1 nanosecond should // be expressed in JSON format as "3.000000001s", and 3 seconds and 1 // microsecond should be expressed in JSON format as "3.000001s". // message Duration { // Signed seconds of the span of time. Must be from -315,576,000,000 // to +315,576,000,000 inclusive. Note: these bounds are computed from: // 60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years int64 seconds = 1; // Signed fractions of a second at nanosecond resolution of the span // of time. 
Durations less than one second are represented with a 0 // `seconds` field and a positive or negative `nanos` field. For durations // of one second or more, a non-zero value for the `nanos` field must be // of the same sign as the `seconds` field. Must be from -999,999,999 // to +999,999,999 inclusive. int32 nanos = 2; } ================================================ FILE: keep/providers/cilium_provider/grpc/google/protobuf/timestamp.proto ================================================ // Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. syntax = "proto3"; package google.protobuf; option cc_enable_arenas = true; option go_package = "google.golang.org/protobuf/types/known/timestamppb"; option java_package = "com.google.protobuf"; option java_outer_classname = "TimestampProto"; option java_multiple_files = true; option objc_class_prefix = "GPB"; option csharp_namespace = "Google.Protobuf.WellKnownTypes"; // A Timestamp represents a point in time independent of any time zone or local // calendar, encoded as a count of seconds and fractions of seconds at // nanosecond resolution. The count is relative to an epoch at UTC midnight on // January 1, 1970, in the proleptic Gregorian calendar which extends the // Gregorian calendar backwards to year one. // // All minutes are 60 seconds long. Leap seconds are "smeared" so that no leap // second table is needed for interpretation, using a [24-hour linear // smear](https://developers.google.com/time/smear). // // The range is from 0001-01-01T00:00:00Z to 9999-12-31T23:59:59.999999999Z. By // restricting to that range, we ensure that we can convert to and from [RFC // 3339](https://www.ietf.org/rfc/rfc3339.txt) date strings. // // # Examples // // Example 1: Compute Timestamp from POSIX `time()`. // // Timestamp timestamp; // timestamp.set_seconds(time(NULL)); // timestamp.set_nanos(0); // // Example 2: Compute Timestamp from POSIX `gettimeofday()`. 
// // struct timeval tv; // gettimeofday(&tv, NULL); // // Timestamp timestamp; // timestamp.set_seconds(tv.tv_sec); // timestamp.set_nanos(tv.tv_usec * 1000); // // Example 3: Compute Timestamp from Win32 `GetSystemTimeAsFileTime()`. // // FILETIME ft; // GetSystemTimeAsFileTime(&ft); // UINT64 ticks = (((UINT64)ft.dwHighDateTime) << 32) | ft.dwLowDateTime; // // // A Windows tick is 100 nanoseconds. Windows epoch 1601-01-01T00:00:00Z // // is 11644473600 seconds before Unix epoch 1970-01-01T00:00:00Z. // Timestamp timestamp; // timestamp.set_seconds((INT64) ((ticks / 10000000) - 11644473600LL)); // timestamp.set_nanos((INT32) ((ticks % 10000000) * 100)); // // Example 4: Compute Timestamp from Java `System.currentTimeMillis()`. // // long millis = System.currentTimeMillis(); // // Timestamp timestamp = Timestamp.newBuilder().setSeconds(millis / 1000) // .setNanos((int) ((millis % 1000) * 1000000)).build(); // // Example 5: Compute Timestamp from Java `Instant.now()`. // // Instant now = Instant.now(); // // Timestamp timestamp = // Timestamp.newBuilder().setSeconds(now.getEpochSecond()) // .setNanos(now.getNano()).build(); // // Example 6: Compute Timestamp from current time in Python. // // timestamp = Timestamp() // timestamp.GetCurrentTime() // // # JSON Mapping // // In JSON format, the Timestamp type is encoded as a string in the // [RFC 3339](https://www.ietf.org/rfc/rfc3339.txt) format. That is, the // format is "{year}-{month}-{day}T{hour}:{min}:{sec}[.{frac_sec}]Z" // where {year} is always expressed using four digits while {month}, {day}, // {hour}, {min}, and {sec} are zero-padded to two digits each. The fractional // seconds, which can go up to 9 digits (i.e. up to 1 nanosecond resolution), // are optional. The "Z" suffix indicates the timezone ("UTC"); the timezone // is required. 
A proto3 JSON serializer should always use UTC (as indicated by // "Z") when printing the Timestamp type and a proto3 JSON parser should be // able to accept both UTC and other timezones (as indicated by an offset). // // For example, "2017-01-15T01:30:15.01Z" encodes 15.01 seconds past // 01:30 UTC on January 15, 2017. // // In JavaScript, one can convert a Date object to this format using the // standard // [toISOString()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Date/toISOString) // method. In Python, a standard `datetime.datetime` object can be converted // to this format using // [`strftime`](https://docs.python.org/2/library/time.html#time.strftime) with // the time format spec '%Y-%m-%dT%H:%M:%S.%fZ'. Likewise, in Java, one can use // the Joda Time's [`ISODateTimeFormat.dateTime()`]( // http://joda-time.sourceforge.net/apidocs/org/joda/time/format/ISODateTimeFormat.html#dateTime() // ) to obtain a formatter capable of generating timestamps in this format. // message Timestamp { // Represents seconds of UTC time since Unix epoch // 1970-01-01T00:00:00Z. Must be from 0001-01-01T00:00:00Z to // 9999-12-31T23:59:59Z inclusive. int64 seconds = 1; // Non-negative fractions of a second at nanosecond resolution. Negative // second values with fractions must still have non-negative nanos values // that count forward in time. Must be from 0 to 999,999,999 // inclusive. int32 nanos = 2; } ================================================ FILE: keep/providers/cilium_provider/grpc/google/protobuf/wrappers.proto ================================================ // Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. 
// https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Wrappers for primitive (non-message) types. These types are useful // for embedding primitives in the `google.protobuf.Any` type and for places // where we need to distinguish between the absence of a primitive // typed field and its default value. // // These wrappers have no meaningful use within repeated fields as they lack // the ability to detect presence on individual elements. 
// These wrappers have no meaningful use within a map or a oneof since // individual entries of a map or fields of a oneof can already detect presence. syntax = "proto3"; package google.protobuf; option cc_enable_arenas = true; option go_package = "google.golang.org/protobuf/types/known/wrapperspb"; option java_package = "com.google.protobuf"; option java_outer_classname = "WrappersProto"; option java_multiple_files = true; option objc_class_prefix = "GPB"; option csharp_namespace = "Google.Protobuf.WellKnownTypes"; // Wrapper message for `double`. // // The JSON representation for `DoubleValue` is JSON number. message DoubleValue { // The double value. double value = 1; } // Wrapper message for `float`. // // The JSON representation for `FloatValue` is JSON number. message FloatValue { // The float value. float value = 1; } // Wrapper message for `int64`. // // The JSON representation for `Int64Value` is JSON string. message Int64Value { // The int64 value. int64 value = 1; } // Wrapper message for `uint64`. // // The JSON representation for `UInt64Value` is JSON string. message UInt64Value { // The uint64 value. uint64 value = 1; } // Wrapper message for `int32`. // // The JSON representation for `Int32Value` is JSON number. message Int32Value { // The int32 value. int32 value = 1; } // Wrapper message for `uint32`. // // The JSON representation for `UInt32Value` is JSON number. message UInt32Value { // The uint32 value. uint32 value = 1; } // Wrapper message for `bool`. // // The JSON representation for `BoolValue` is JSON `true` and `false`. message BoolValue { // The bool value. bool value = 1; } // Wrapper message for `string`. // // The JSON representation for `StringValue` is JSON string. message StringValue { // The string value. string value = 1; } // Wrapper message for `bytes`. // // The JSON representation for `BytesValue` is JSON string. message BytesValue { // The bytes value. 
bytes value = 1; } ================================================ FILE: keep/providers/cilium_provider/grpc/observer.proto ================================================ // SPDX-License-Identifier: Apache-2.0 // Copyright Authors of Hubble syntax = "proto3"; import "google/protobuf/any.proto"; import "google/protobuf/wrappers.proto"; import "google/protobuf/timestamp.proto"; import "google/protobuf/field_mask.proto"; import public "flow/flow.proto"; import "relay/relay.proto"; package observer; option go_package = "github.com/cilium/cilium/api/v1/observer"; // Observer returns a stream of Flows depending on which filter the user wants // to observe. service Observer { // GetFlows returning structured data, meant to eventually obsolete GetLastNFlows. rpc GetFlows(GetFlowsRequest) returns (stream GetFlowsResponse) {} // GetAgentEvents returns Cilium agent events. rpc GetAgentEvents(GetAgentEventsRequest) returns (stream GetAgentEventsResponse) {} // GetDebugEvents returns Cilium datapath debug events. rpc GetDebugEvents(GetDebugEventsRequest) returns (stream GetDebugEventsResponse) {} // GetNodes returns information about nodes in a cluster. rpc GetNodes(GetNodesRequest) returns (GetNodesResponse) {} // GetNamespaces returns information about namespaces in a cluster. // The namespaces returned are namespaces which have had network flows in // the last hour. The namespaces are returned sorted by cluster name and // namespace in ascending order. rpc GetNamespaces(GetNamespacesRequest) returns (GetNamespacesResponse) {} // ServerStatus returns some details about the running hubble server. rpc ServerStatus(ServerStatusRequest) returns (ServerStatusResponse) {} } message ServerStatusRequest {} message ServerStatusResponse { // number of currently captured flows // In a multi-node context, this is the cumulative count of all captured // flows.
uint64 num_flows = 1; // maximum capacity of the ring buffer // In a multi-node context, this is the aggregation of all ring buffer // capacities. uint64 max_flows = 2; // total amount of flows observed since the observer was started // In a multi-node context, this is the aggregation of all flows that have // been seen. uint64 seen_flows = 3; // uptime of this observer instance in nanoseconds // In a multi-node context, this field corresponds to the uptime of the // longest living instance. uint64 uptime_ns = 4; // number of nodes for which a connection is established google.protobuf.UInt32Value num_connected_nodes = 5; // number of nodes for which a connection cannot be established google.protobuf.UInt32Value num_unavailable_nodes = 6; // list of nodes that are unavailable // This list may not be exhaustive. repeated string unavailable_nodes = 7; // Version is the version of Cilium/Hubble. string version = 8; // Approximate rate of flows seen by Hubble per second over the last minute. // In a multi-node context, this is the sum of all flow rates. double flows_rate = 9; } message GetFlowsRequest { // Number of flows that should be returned. Incompatible with `since/until`. // Defaults to the most recent (last) `number` flows, unless `first` is // true, then it will return the earliest `number` flows. uint64 number = 1; // first specifies if we should look at the first `number` flows or the // last `number` of flows. Incompatible with `follow`. bool first = 9; reserved 2; // removed, do not use // follow sets when the server should continue to stream flows after // printing the last N flows. bool follow = 3; // blacklist defines a list of filters which have to match for a flow to be // excluded from the result. // If multiple blacklist filters are specified, only one of them has to // match for a flow to be excluded. repeated flow.FlowFilter blacklist = 5; // whitelist defines a list of filters which have to match for a flow to be // included in the result.
// If multiple whitelist filters are specified, only one of them has to // match for a flow to be included. // The whitelist and blacklist can both be specified. In such cases, the // set of the returned flows is the set difference `whitelist - blacklist`. // In other words, the result will contain all flows matched by the // whitelist that are not also simultaneously matched by the blacklist. repeated flow.FlowFilter whitelist = 6; // Since this time for returned flows. Incompatible with `number`. google.protobuf.Timestamp since = 7; // Until this time for returned flows. Incompatible with `number`. google.protobuf.Timestamp until = 8; // FieldMask allows clients to limit flow's fields that will be returned. // For example, {paths: ["source.id", "destination.id"]} will return flows // with only these two fields set. google.protobuf.FieldMask field_mask = 10; // Experimental contains fields that are not stable yet. Support for // experimental features is always optional and subject to change. message Experimental { // FieldMask allows clients to limit flow's fields that will be returned. // For example, {paths: ["source.id", "destination.id"]} will return flows // with only these two fields set. // Deprecated in favor of top-level field_mask. This field will be // removed in v1.17. google.protobuf.FieldMask field_mask = 1 [deprecated=true]; } Experimental experimental = 999; // extensions can be used to add arbitrary additional metadata to GetFlowsRequest. // This can be used to extend functionality for other Hubble compatible // APIs, or experiment with new functionality without needing to change the public API. google.protobuf.Any extensions = 150000; } // GetFlowsResponse contains either a flow or a protocol message. message GetFlowsResponse { oneof response_types{ flow.Flow flow = 1; // node_status informs clients about the state of the nodes // participating in this particular GetFlows request. 
relay.NodeStatusEvent node_status = 2; // lost_events informs clients about events which got dropped due to // a Hubble component being unavailable flow.LostEvent lost_events = 3; } // Name of the node where this event was observed. string node_name = 1000; // Timestamp at which this event was observed. google.protobuf.Timestamp time = 1001; } message GetAgentEventsRequest { // Number of events that should be returned. Incompatible with `since/until`. // Defaults to the most recent (last) `number` events, unless `first` is // true, then it will return the earliest `number` events. uint64 number = 1; // first specifies if we should look at the first `number` events or the // last `number` of events. Incompatible with `follow`. bool first = 9; // follow sets when the server should continue to stream agent events after // printing the last N agent events. bool follow = 2; // TODO: do we want to be able to specify blocklist/allowlist (previously // known as blacklist/whitelist)? // Since this time for returned agent events. Incompatible with `number`. google.protobuf.Timestamp since = 7; // Until this time for returned agent events. Incompatible with `number`. google.protobuf.Timestamp until = 8; } // GetAgentEventsResponse contains an event received from the Cilium agent. message GetAgentEventsResponse { flow.AgentEvent agent_event = 1; // Name of the node where this event was observed. string node_name = 1000; // Timestamp at which this event was observed. google.protobuf.Timestamp time = 1001; } message GetDebugEventsRequest { // Number of events that should be returned. Incompatible with `since/until`. // Defaults to the most recent (last) `number` events, unless `first` is // true, then it will return the earliest `number` events. uint64 number = 1; // first specifies if we should look at the first `number` events or the // last `number` of events. Incompatible with `follow`.
bool first = 9; // follow sets when the server should continue to stream debug events after // printing the last N debug events. bool follow = 2; // TODO: do we want to be able to specify blocklist/allowlist (previously // known as blacklist/whitelist)? // Since this time for returned debug events. Incompatible with `number`. google.protobuf.Timestamp since = 7; // Until this time for returned debug events. Incompatible with `number`. google.protobuf.Timestamp until = 8; } // GetDebugEventsResponse contains a Cilium datapath debug event. message GetDebugEventsResponse { flow.DebugEvent debug_event = 1; // Name of the node where this event was observed. string node_name = 1000; // Timestamp at which this event was observed. google.protobuf.Timestamp time = 1001; } message GetNodesRequest {} // GetNodesResponse contains the list of nodes. message GetNodesResponse { // Nodes is an exhaustive list of nodes. repeated Node nodes = 1; } // Node represents a cluster node. message Node { // Name is the name of the node. string name = 1; // Version is the version of Cilium/Hubble as reported by the node. string version = 2; // Address is the network address of the API endpoint. string address = 3; // State represents the known state of the node. relay.NodeState state = 4; // TLS reports TLS related information. TLS tls = 5; // UptimeNS is the uptime of this instance in nanoseconds uint64 uptime_ns = 6; // number of currently captured flows uint64 num_flows = 7; // maximum capacity of the ring buffer uint64 max_flows = 8; // total amount of flows observed since the observer was started uint64 seen_flows = 9; } // TLS represents TLS information. message TLS { // Enabled reports whether TLS is enabled or not. bool enabled = 1; // ServerName is the TLS server name that can be used as part of the TLS // cert validation process. string server_name = 2; } message GetNamespacesRequest {} // GetNamespacesResponse contains the list of namespaces.
message GetNamespacesResponse { // Namespaces is a list of namespaces with flows repeated Namespace namespaces = 1; } message Namespace { string cluster = 1; string namespace = 2; } // ExportEvent contains an event to be exported. Not to be used outside of the // exporter feature. message ExportEvent { oneof response_types{ flow.Flow flow = 1; // node_status informs clients about the state of the nodes // participating in this particular GetFlows request. relay.NodeStatusEvent node_status = 2; // lost_events informs clients about events which got dropped due to // a Hubble component being unavailable flow.LostEvent lost_events = 3; // agent_event informs clients about an event received from the Cilium // agent. flow.AgentEvent agent_event = 4; // debug_event contains Cilium datapath debug events flow.DebugEvent debug_event = 5; } // Name of the node where this event was observed. string node_name = 1000; // Timestamp at which this event was observed. google.protobuf.Timestamp time = 1001; } ================================================ FILE: keep/providers/cilium_provider/grpc/observer_pb2.py ================================================ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# NO CHECKED-IN PROTOBUF GENCODE # source: observer.proto # Protobuf Python Version: 5.27.2 """Generated protocol buffer code.""" # from google.protobuf import runtime_version as _runtime_version from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder """ _runtime_version.ValidateProtobufRuntimeVersion( _runtime_version.Domain.PUBLIC, 5, 27, 2, '', 'observer.proto' ) # @@protoc_insertion_point(imports) """ _sym_db = _symbol_database.Default() from keep.providers.cilium_provider.grpc.flow.flow_pb2 import * # noqa from keep.providers.cilium_provider.grpc.relay.relay_pb2 import * # noqa DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( b'\n\x0eobserver.proto\x12\x08observer\x1a\x19google/protobuf/any.proto\x1a\x1egoogle/protobuf/wrappers.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a google/protobuf/field_mask.proto\x1a\x0f\x66low/flow.proto\x1a\x11relay/relay.proto"\x15\n\x13ServerStatusRequest"\x9b\x02\n\x14ServerStatusResponse\x12\x11\n\tnum_flows\x18\x01 \x01(\x04\x12\x11\n\tmax_flows\x18\x02 \x01(\x04\x12\x12\n\nseen_flows\x18\x03 \x01(\x04\x12\x11\n\tuptime_ns\x18\x04 \x01(\x04\x12\x39\n\x13num_connected_nodes\x18\x05 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12;\n\x15num_unavailable_nodes\x18\x06 \x01(\x0b\x32\x1c.google.protobuf.UInt32Value\x12\x19\n\x11unavailable_nodes\x18\x07 \x03(\t\x12\x0f\n\x07version\x18\x08 \x01(\t\x12\x12\n\nflows_rate\x18\t \x01(\x01"\xc5\x03\n\x0fGetFlowsRequest\x12\x0e\n\x06number\x18\x01 \x01(\x04\x12\r\n\x05\x66irst\x18\t \x01(\x08\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12#\n\tblacklist\x18\x05 \x03(\x0b\x32\x10.flow.FlowFilter\x12#\n\twhitelist\x18\x06 \x03(\x0b\x32\x10.flow.FlowFilter\x12)\n\x05since\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12)\n\x05until\x18\x08 
\x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12.\n\nfield_mask\x18\n \x01(\x0b\x32\x1a.google.protobuf.FieldMask\x12=\n\x0c\x65xperimental\x18\xe7\x07 \x01(\x0b\x32&.observer.GetFlowsRequest.Experimental\x12*\n\nextensions\x18\xf0\x93\t \x01(\x0b\x32\x14.google.protobuf.Any\x1a\x42\n\x0c\x45xperimental\x12\x32\n\nfield_mask\x18\x01 \x01(\x0b\x32\x1a.google.protobuf.FieldMaskB\x02\x18\x01J\x04\x08\x02\x10\x03"\xd6\x01\n\x10GetFlowsResponse\x12\x1a\n\x04\x66low\x18\x01 \x01(\x0b\x32\n.flow.FlowH\x00\x12-\n\x0bnode_status\x18\x02 \x01(\x0b\x32\x16.relay.NodeStatusEventH\x00\x12&\n\x0blost_events\x18\x03 \x01(\x0b\x32\x0f.flow.LostEventH\x00\x12\x12\n\tnode_name\x18\xe8\x07 \x01(\t\x12)\n\x04time\x18\xe9\x07 \x01(\x0b\x32\x1a.google.protobuf.TimestampB\x10\n\x0eresponse_types"\x9c\x01\n\x15GetAgentEventsRequest\x12\x0e\n\x06number\x18\x01 \x01(\x04\x12\r\n\x05\x66irst\x18\t \x01(\x08\x12\x0e\n\x06\x66ollow\x18\x02 \x01(\x08\x12)\n\x05since\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12)\n\x05until\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"~\n\x16GetAgentEventsResponse\x12%\n\x0b\x61gent_event\x18\x01 \x01(\x0b\x32\x10.flow.AgentEvent\x12\x12\n\tnode_name\x18\xe8\x07 \x01(\t\x12)\n\x04time\x18\xe9\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x9c\x01\n\x15GetDebugEventsRequest\x12\x0e\n\x06number\x18\x01 \x01(\x04\x12\r\n\x05\x66irst\x18\t \x01(\x08\x12\x0e\n\x06\x66ollow\x18\x02 \x01(\x08\x12)\n\x05since\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12)\n\x05until\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"~\n\x16GetDebugEventsResponse\x12%\n\x0b\x64\x65\x62ug_event\x18\x01 \x01(\x0b\x32\x10.flow.DebugEvent\x12\x12\n\tnode_name\x18\xe8\x07 \x01(\t\x12)\n\x04time\x18\xe9\x07 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x11\n\x0fGetNodesRequest"1\n\x10GetNodesResponse\x12\x1d\n\x05nodes\x18\x01 \x03(\x0b\x32\x0e.observer.Node"\xc0\x01\n\x04Node\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 
\x01(\t\x12\x0f\n\x07\x61\x64\x64ress\x18\x03 \x01(\t\x12\x1f\n\x05state\x18\x04 \x01(\x0e\x32\x10.relay.NodeState\x12\x1a\n\x03tls\x18\x05 \x01(\x0b\x32\r.observer.TLS\x12\x11\n\tuptime_ns\x18\x06 \x01(\x04\x12\x11\n\tnum_flows\x18\x07 \x01(\x04\x12\x11\n\tmax_flows\x18\x08 \x01(\x04\x12\x12\n\nseen_flows\x18\t \x01(\x04"+\n\x03TLS\x12\x0f\n\x07\x65nabled\x18\x01 \x01(\x08\x12\x13\n\x0bserver_name\x18\x02 \x01(\t"\x16\n\x14GetNamespacesRequest"@\n\x15GetNamespacesResponse\x12\'\n\nnamespaces\x18\x01 \x03(\x0b\x32\x13.observer.Namespace"/\n\tNamespace\x12\x0f\n\x07\x63luster\x18\x01 \x01(\t\x12\x11\n\tnamespace\x18\x02 \x01(\t"\xa3\x02\n\x0b\x45xportEvent\x12\x1a\n\x04\x66low\x18\x01 \x01(\x0b\x32\n.flow.FlowH\x00\x12-\n\x0bnode_status\x18\x02 \x01(\x0b\x32\x16.relay.NodeStatusEventH\x00\x12&\n\x0blost_events\x18\x03 \x01(\x0b\x32\x0f.flow.LostEventH\x00\x12\'\n\x0b\x61gent_event\x18\x04 \x01(\x0b\x32\x10.flow.AgentEventH\x00\x12\'\n\x0b\x64\x65\x62ug_event\x18\x05 \x01(\x0b\x32\x10.flow.DebugEventH\x00\x12\x12\n\tnode_name\x18\xe8\x07 \x01(\t\x12)\n\x04time\x18\xe9\x07 \x01(\x0b\x32\x1a.google.protobuf.TimestampB\x10\n\x0eresponse_types2\xed\x03\n\x08Observer\x12\x45\n\x08GetFlows\x12\x19.observer.GetFlowsRequest\x1a\x1a.observer.GetFlowsResponse"\x00\x30\x01\x12W\n\x0eGetAgentEvents\x12\x1f.observer.GetAgentEventsRequest\x1a .observer.GetAgentEventsResponse"\x00\x30\x01\x12W\n\x0eGetDebugEvents\x12\x1f.observer.GetDebugEventsRequest\x1a .observer.GetDebugEventsResponse"\x00\x30\x01\x12\x43\n\x08GetNodes\x12\x19.observer.GetNodesRequest\x1a\x1a.observer.GetNodesResponse"\x00\x12R\n\rGetNamespaces\x12\x1e.observer.GetNamespacesRequest\x1a\x1f.observer.GetNamespacesResponse"\x00\x12O\n\x0cServerStatus\x12\x1d.observer.ServerStatusRequest\x1a\x1e.observer.ServerStatusResponse"\x00\x42*Z(github.com/cilium/cilium/api/v1/observerP\x04\x62\x06proto3' ) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "observer_pb2", _globals) if not _descriptor._USE_C_DESCRIPTORS: _globals["DESCRIPTOR"]._loaded_options = None _globals["DESCRIPTOR"]._serialized_options = ( b"Z(github.com/cilium/cilium/api/v1/observer" ) _globals["_GETFLOWSREQUEST_EXPERIMENTAL"].fields_by_name[ "field_mask" ]._loaded_options = None _globals["_GETFLOWSREQUEST_EXPERIMENTAL"].fields_by_name[ "field_mask" ]._serialized_options = b"\030\001" _globals["_SERVERSTATUSREQUEST"]._serialized_start = 190 _globals["_SERVERSTATUSREQUEST"]._serialized_end = 211 _globals["_SERVERSTATUSRESPONSE"]._serialized_start = 214 _globals["_SERVERSTATUSRESPONSE"]._serialized_end = 497 _globals["_GETFLOWSREQUEST"]._serialized_start = 500 _globals["_GETFLOWSREQUEST"]._serialized_end = 953 _globals["_GETFLOWSREQUEST_EXPERIMENTAL"]._serialized_start = 881 _globals["_GETFLOWSREQUEST_EXPERIMENTAL"]._serialized_end = 947 _globals["_GETFLOWSRESPONSE"]._serialized_start = 956 _globals["_GETFLOWSRESPONSE"]._serialized_end = 1170 _globals["_GETAGENTEVENTSREQUEST"]._serialized_start = 1173 _globals["_GETAGENTEVENTSREQUEST"]._serialized_end = 1329 _globals["_GETAGENTEVENTSRESPONSE"]._serialized_start = 1331 _globals["_GETAGENTEVENTSRESPONSE"]._serialized_end = 1457 _globals["_GETDEBUGEVENTSREQUEST"]._serialized_start = 1460 _globals["_GETDEBUGEVENTSREQUEST"]._serialized_end = 1616 _globals["_GETDEBUGEVENTSRESPONSE"]._serialized_start = 1618 _globals["_GETDEBUGEVENTSRESPONSE"]._serialized_end = 1744 _globals["_GETNODESREQUEST"]._serialized_start = 1746 _globals["_GETNODESREQUEST"]._serialized_end = 1763 _globals["_GETNODESRESPONSE"]._serialized_start = 1765 _globals["_GETNODESRESPONSE"]._serialized_end = 1814 _globals["_NODE"]._serialized_start = 1817 _globals["_NODE"]._serialized_end = 2009 _globals["_TLS"]._serialized_start = 2011 _globals["_TLS"]._serialized_end = 2054 _globals["_GETNAMESPACESREQUEST"]._serialized_start = 2056 _globals["_GETNAMESPACESREQUEST"]._serialized_end = 2078 
_globals["_GETNAMESPACESRESPONSE"]._serialized_start = 2080 _globals["_GETNAMESPACESRESPONSE"]._serialized_end = 2144 _globals["_NAMESPACE"]._serialized_start = 2146 _globals["_NAMESPACE"]._serialized_end = 2193 _globals["_EXPORTEVENT"]._serialized_start = 2196 _globals["_EXPORTEVENT"]._serialized_end = 2487 _globals["_OBSERVER"]._serialized_start = 2490 _globals["_OBSERVER"]._serialized_end = 2983 # @@protoc_insertion_point(module_scope) ================================================ FILE: keep/providers/cilium_provider/grpc/observer_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc import keep.providers.cilium_provider.grpc.observer_pb2 as observer__pb2 GRPC_GENERATED_VERSION = "1.67.1" GRPC_VERSION = grpc.__version__ _version_not_supported = False try: from grpc._utilities import first_version_is_lower _version_not_supported = first_version_is_lower( GRPC_VERSION, GRPC_GENERATED_VERSION ) except ImportError: _version_not_supported = True # Shahar: commented out the following code """ if _version_not_supported: raise RuntimeError( f"The grpc package installed is at version {GRPC_VERSION}," + " but the generated code in observer_pb2_grpc.py depends on" + f" grpcio>={GRPC_GENERATED_VERSION}." + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." ) """ class ObserverStub(object): """Observer returns a stream of Flows depending on which filter the user want to observe. """ def __init__(self, channel): """Constructor. Args: channel: A grpc.Channel. 
""" self.GetFlows = channel.unary_stream( "/observer.Observer/GetFlows", request_serializer=observer__pb2.GetFlowsRequest.SerializeToString, response_deserializer=observer__pb2.GetFlowsResponse.FromString, _registered_method=True, ) self.GetAgentEvents = channel.unary_stream( "/observer.Observer/GetAgentEvents", request_serializer=observer__pb2.GetAgentEventsRequest.SerializeToString, response_deserializer=observer__pb2.GetAgentEventsResponse.FromString, _registered_method=True, ) self.GetDebugEvents = channel.unary_stream( "/observer.Observer/GetDebugEvents", request_serializer=observer__pb2.GetDebugEventsRequest.SerializeToString, response_deserializer=observer__pb2.GetDebugEventsResponse.FromString, _registered_method=True, ) self.GetNodes = channel.unary_unary( "/observer.Observer/GetNodes", request_serializer=observer__pb2.GetNodesRequest.SerializeToString, response_deserializer=observer__pb2.GetNodesResponse.FromString, _registered_method=True, ) self.GetNamespaces = channel.unary_unary( "/observer.Observer/GetNamespaces", request_serializer=observer__pb2.GetNamespacesRequest.SerializeToString, response_deserializer=observer__pb2.GetNamespacesResponse.FromString, _registered_method=True, ) self.ServerStatus = channel.unary_unary( "/observer.Observer/ServerStatus", request_serializer=observer__pb2.ServerStatusRequest.SerializeToString, response_deserializer=observer__pb2.ServerStatusResponse.FromString, _registered_method=True, ) class ObserverServicer(object): """Observer returns a stream of Flows depending on which filter the user want to observe. 
""" def GetFlows(self, request, context): """GetFlows returning structured data, meant to eventually obsolete GetLastNFlows.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def GetAgentEvents(self, request, context): """GetAgentEvents returns Cilium agent events.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def GetDebugEvents(self, request, context): """GetDebugEvents returns Cilium datapath debug events.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def GetNodes(self, request, context): """GetNodes returns information about nodes in a cluster.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def GetNamespaces(self, request, context): """GetNamespaces returns information about namespaces in a cluster. The namespaces returned are namespaces which have had network flows in the last hour. The namespaces are returned sorted by cluster name and namespace in ascending order. 
""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def ServerStatus(self, request, context): """ServerStatus returns some details about the running hubble server.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details("Method not implemented!") raise NotImplementedError("Method not implemented!") def add_ObserverServicer_to_server(servicer, server): rpc_method_handlers = { "GetFlows": grpc.unary_stream_rpc_method_handler( servicer.GetFlows, request_deserializer=observer__pb2.GetFlowsRequest.FromString, response_serializer=observer__pb2.GetFlowsResponse.SerializeToString, ), "GetAgentEvents": grpc.unary_stream_rpc_method_handler( servicer.GetAgentEvents, request_deserializer=observer__pb2.GetAgentEventsRequest.FromString, response_serializer=observer__pb2.GetAgentEventsResponse.SerializeToString, ), "GetDebugEvents": grpc.unary_stream_rpc_method_handler( servicer.GetDebugEvents, request_deserializer=observer__pb2.GetDebugEventsRequest.FromString, response_serializer=observer__pb2.GetDebugEventsResponse.SerializeToString, ), "GetNodes": grpc.unary_unary_rpc_method_handler( servicer.GetNodes, request_deserializer=observer__pb2.GetNodesRequest.FromString, response_serializer=observer__pb2.GetNodesResponse.SerializeToString, ), "GetNamespaces": grpc.unary_unary_rpc_method_handler( servicer.GetNamespaces, request_deserializer=observer__pb2.GetNamespacesRequest.FromString, response_serializer=observer__pb2.GetNamespacesResponse.SerializeToString, ), "ServerStatus": grpc.unary_unary_rpc_method_handler( servicer.ServerStatus, request_deserializer=observer__pb2.ServerStatusRequest.FromString, response_serializer=observer__pb2.ServerStatusResponse.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( "observer.Observer", rpc_method_handlers ) server.add_generic_rpc_handlers((generic_handler,)) 
server.add_registered_method_handlers("observer.Observer", rpc_method_handlers) # This class is part of an EXPERIMENTAL API. class Observer(object): """Observer returns a stream of Flows depending on which filter the user want to observe. """ @staticmethod def GetFlows( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_stream( request, target, "/observer.Observer/GetFlows", observer__pb2.GetFlowsRequest.SerializeToString, observer__pb2.GetFlowsResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) @staticmethod def GetAgentEvents( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_stream( request, target, "/observer.Observer/GetAgentEvents", observer__pb2.GetAgentEventsRequest.SerializeToString, observer__pb2.GetAgentEventsResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) @staticmethod def GetDebugEvents( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_stream( request, target, "/observer.Observer/GetDebugEvents", observer__pb2.GetDebugEventsRequest.SerializeToString, observer__pb2.GetDebugEventsResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) @staticmethod def GetNodes( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): 
return grpc.experimental.unary_unary( request, target, "/observer.Observer/GetNodes", observer__pb2.GetNodesRequest.SerializeToString, observer__pb2.GetNodesResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) @staticmethod def GetNamespaces( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/observer.Observer/GetNamespaces", observer__pb2.GetNamespacesRequest.SerializeToString, observer__pb2.GetNamespacesResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) @staticmethod def ServerStatus( request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None, ): return grpc.experimental.unary_unary( request, target, "/observer.Observer/ServerStatus", observer__pb2.ServerStatusRequest.SerializeToString, observer__pb2.ServerStatusResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata, _registered_method=True, ) ================================================ FILE: keep/providers/cilium_provider/grpc/relay/__init__.py ================================================ ================================================ FILE: keep/providers/cilium_provider/grpc/relay/relay.proto ================================================ // SPDX-License-Identifier: Apache-2.0 // Copyright Authors of Cilium syntax = "proto3"; package relay; option go_package = "github.com/cilium/cilium/api/v1/relay"; // NodeStatusEvent is a message sent by hubble-relay to inform clients about // the state of a particular node. 
message NodeStatusEvent { // state_change contains the new node state NodeState state_change = 1; // node_names is the list of nodes for which the above state change applies repeated string node_names = 2; // message is an optional message attached to the state change (e.g. an // error message). The message applies to all nodes in node_names. string message = 3; } enum NodeState { // UNKNOWN_NODE_STATE indicates that the state of this node is unknown. UNKNOWN_NODE_STATE = 0; // NODE_CONNECTED indicates that we have established a connection // to this node. The client can expect to observe flows from this node. NODE_CONNECTED = 1; // NODE_UNAVAILABLE indicates that the connection to this // node is currently unavailable. The client can expect to not see any // flows from this node until either the connection is re-established or // the node is gone. NODE_UNAVAILABLE = 2; // NODE_GONE indicates that a node has been removed from the // cluster. No reconnection attempts will be made. NODE_GONE = 3; // NODE_ERROR indicates that a node has reported an error while processing // the request. No reconnection attempts will be made. NODE_ERROR = 4; } ================================================ FILE: keep/providers/cilium_provider/grpc/relay/relay_pb2.py ================================================ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE # source: relay/relay.proto # Protobuf Python Version: 5.27.2 """Generated protocol buffer code.""" # from google.protobuf import runtime_version as _runtime_version from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder """ _runtime_version.ValidateProtobufRuntimeVersion( _runtime_version.Domain.PUBLIC, 5, 27, 2, '', 'relay/relay.proto' ) """ # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( b"\n\x11relay/relay.proto\x12\x05relay\"^\n\x0fNodeStatusEvent\x12&\n\x0cstate_change\x18\x01 \x01(\x0e\x32\x10.relay.NodeState\x12\x12\n\nnode_names\x18\x02 \x03(\t\x12\x0f\n\x07message\x18\x03 \x01(\t*l\n\tNodeState\x12\x16\n\x12UNKNOWN_NODE_STATE\x10\x00\x12\x12\n\x0eNODE_CONNECTED\x10\x01\x12\x14\n\x10NODE_UNAVAILABLE\x10\x02\x12\r\n\tNODE_GONE\x10\x03\x12\x0e\n\nNODE_ERROR\x10\x04\x42'Z%github.com/cilium/cilium/api/v1/relayb\x06proto3" ) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "relay.relay_pb2", _globals) if not _descriptor._USE_C_DESCRIPTORS: _globals["DESCRIPTOR"]._loaded_options = None _globals["DESCRIPTOR"]._serialized_options = ( b"Z%github.com/cilium/cilium/api/v1/relay" ) _globals["_NODESTATE"]._serialized_start = 124 _globals["_NODESTATE"]._serialized_end = 232 _globals["_NODESTATUSEVENT"]._serialized_start = 28 _globals["_NODESTATUSEVENT"]._serialized_end = 122 # @@protoc_insertion_point(module_scope) ================================================ FILE: keep/providers/cilium_provider/grpc/relay/relay_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
"""Client and server classes corresponding to protobuf-defined services.""" import grpc GRPC_GENERATED_VERSION = "1.67.1" GRPC_VERSION = grpc.__version__ _version_not_supported = False try: from grpc._utilities import first_version_is_lower _version_not_supported = first_version_is_lower( GRPC_VERSION, GRPC_GENERATED_VERSION ) except ImportError: _version_not_supported = True # Shahar: commented out the following code """ if _version_not_supported: raise RuntimeError( f"The grpc package installed is at version {GRPC_VERSION}," + " but the generated code in relay/relay_pb2_grpc.py depends on" + f" grpcio>={GRPC_GENERATED_VERSION}." + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." ) """ ================================================ FILE: keep/providers/cilium_provider/runtime_version.py ================================================ # Protocol Buffers - Google's data interchange format # Copyright 2008 Google Inc. All rights reserved. # # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file or at # https://developers.google.com/open-source/licenses/bsd """Protobuf Runtime versions and validators. It should only be accessed by Protobuf gencodes and tests. DO NOT USE it elsewhere. """ __author__ = "shaod@google.com (Dennis Shao)" import os import warnings from enum import Enum class Domain(Enum): GOOGLE_INTERNAL = 1 PUBLIC = 2 # The versions of this Python Protobuf runtime to be changed automatically by # the Protobuf release process. Do not edit them manually. # These OSS versions are not stripped to avoid merging conflicts. OSS_DOMAIN = Domain.PUBLIC OSS_MAJOR = 5 OSS_MINOR = 30 OSS_PATCH = 0 OSS_SUFFIX = "-dev" DOMAIN = OSS_DOMAIN MAJOR = OSS_MAJOR MINOR = OSS_MINOR PATCH = OSS_PATCH SUFFIX = OSS_SUFFIX # Avoid flooding of warnings. 
_MAX_WARNING_COUNT = 20
_warning_count = 0


class VersionError(Exception):
    """Exception class for version violation."""


def _ReportVersionError(msg):
    """Raise a VersionError carrying ``msg``."""
    raise VersionError(msg)


def ValidateProtobufRuntimeVersion(
    gen_domain, gen_major, gen_minor, gen_patch, gen_suffix, location
):
    """Function to validate versions.

    Compares the version the gencode was generated with against this module's
    runtime constants (DOMAIN, MAJOR, MINOR, PATCH, SUFFIX).

    The whole check is skipped when the environment variable
    TEMPORARILY_DISABLE_PROTOBUF_VERSION_CHECK is set to "true"
    (case-insensitive).

    Args:
      gen_domain: The domain where the code was generated from.
      gen_major: The major version number of the gencode.
      gen_minor: The minor version number of the gencode.
      gen_patch: The patch version number of the gencode.
      gen_suffix: The version suffix e.g. '-dev', '-rc1' of the gencode.
      location: The proto location that causes the version violation.

    Returns:
      None. A gencode exactly one major version older than the runtime only
      triggers a warning (at most _MAX_WARNING_COUNT times per process).

    Raises:
      VersionError: if gencode version is invalid or incompatible with the
        runtime.
    """
    disable_flag = os.getenv("TEMPORARILY_DISABLE_PROTOBUF_VERSION_CHECK")
    if disable_flag is not None and disable_flag.lower() == "true":
        return

    global _warning_count

    version = f"{MAJOR}.{MINOR}.{PATCH}{SUFFIX}"
    gen_version = f"{gen_major}.{gen_minor}.{gen_patch}{gen_suffix}"

    if gen_major < 0 or gen_minor < 0 or gen_patch < 0:
        raise VersionError(f"Invalid gencode version: {gen_version}")

    error_prompt = (
        "See Protobuf version guarantees at"
        " https://protobuf.dev/support/cross-version-runtime-guarantee."
    )

    if gen_domain != DOMAIN:
        _ReportVersionError(
            "Detected mismatched Protobuf Gencode/Runtime domains when loading"
            f" {location}: gencode {gen_domain.name} runtime {DOMAIN.name}."
            " Cross-domain usage of Protobuf is not supported."
        )

    if gen_major != MAJOR:
        if gen_major == MAJOR - 1:
            # Gencode one major version behind is tolerated with a warning;
            # the counter keeps repeated imports from flooding the output.
            if _warning_count < _MAX_WARNING_COUNT:
                warnings.warn(
                    "Protobuf gencode version %s is exactly one major version older"
                    " than the runtime version %s at %s. Please update the gencode to"
                    " avoid compatibility violations in the next runtime release."
                    % (gen_version, version, location)
                )
                _warning_count += 1
        else:
            _ReportVersionError(
                "Detected mismatched Protobuf Gencode/Runtime major versions when"
                f" loading {location}: gencode {gen_version} runtime {version}."
                f" Same major version is required. {error_prompt}"
            )
    elif MINOR < gen_minor or (MINOR == gen_minor and PATCH < gen_patch):
        # Minor/patch are only comparable within the same major version.
        # Without the `elif`, gencode 4.31.0 on runtime 5.30.0 would be
        # rejected as "runtime older than gencode" even though it is newer.
        _ReportVersionError(
            "Detected incompatible Protobuf Gencode/Runtime versions when loading"
            f" {location}: gencode {gen_version} runtime {version}. Runtime version"
            f" cannot be older than the linked gencode version. {error_prompt}"
        )

    if gen_suffix != SUFFIX:
        _ReportVersionError(
            "Detected mismatched Protobuf Gencode/Runtime version suffixes when"
            f" loading {location}: gencode {gen_version} runtime {version}."
            f" Version suffixes must be the same. {error_prompt}"
        )
Create logs_table and insert data into it ```sql CREATE TABLE logs_table ( timestamp DateTime DEFAULT now(), level String, message String, source String, user_id UInt32 ) ENGINE = MergeTree ORDER BY timestamp; ``` ```sql INSERT INTO logs_table (level, message, source, user_id) VALUES ('INFO', 'User login successful', 'auth_service', 1), ('ERROR', 'Failed to connect to database', 'db_service', 0), ('DEBUG', 'Processing payment request', 'payment_service', 5), ('INFO', 'User logged out', 'auth_service', 1), ('WARN', 'High memory usage detected', 'monitoring_service', 0), ('ERROR', 'Timeout while sending email', 'email_service', 2), ('INFO', 'File uploaded successfully', 'file_service', 3), ('DEBUG', 'Starting batch process', 'batch_service', 0), ('INFO', 'New user registered', 'auth_service', 4), ('ERROR', 'Failed to process payment', 'payment_service', 5); ``` 7. Some SQL queries to test Retrieve the latest log entry ```sql SELECT * FROM logs_table ORDER BY timestamp DESC LIMIT 1; ``` Retrieve Logs with a Specific User ID and Level ```sql SELECT * FROM logs_table WHERE user_id = 5 AND level = 'DEBUG'; ``` ## ClickHouse Setup with Self-Signed Certificate This guide will help you set up a ClickHouse server with a self-signed SSL certificate using Docker. ### Prerequisites - Docker and Docker Compose installed on your machine. ### Steps 1. **Clone the Repository** Clone the repository containing the ClickHouse setup files. ```bash git clone <repository-url> cd <repository-directory>/keep/providers/clickhouse_provider/clickhouse-secure ``` 2. **Review Configuration Files** Ensure the following files are correctly configured: - `config.xml`: Contains ClickHouse server configuration, including SSL settings. - `users.xml`: Defines users and their permissions. - `certs/server.crt` and `certs/server.key`: Your self-signed certificate and private key. 3. **Start ClickHouse with Docker Compose** Use Docker Compose to start the ClickHouse server.
```bash docker-compose up -d ``` This command will start the ClickHouse server with SSL enabled on ports 8123 (HTTPS) and 9440 (Native SSL). 4. **Connect to ClickHouse** You can connect to the ClickHouse server using the ClickHouse client or any compatible client library. Ensure you specify the SSL port and provide the necessary credentials. Example connection string for Python using `clickhouse-driver`: ```python from clickhouse_driver import connect connection = connect( 'clickhouses://secure_user:strong_password@localhost:9440/default', verify='/path/to/your/ca-cert.pem' # Optional: Path to CA certificate if needed ) ``` If you encounter SSL verification issues, you can disable verification (not recommended for production) by setting `verify=False`. 5. **Stop ClickHouse** To stop the ClickHouse server, run: ```bash docker-compose down ``` ### Notes - The provided setup uses a self-signed certificate. For production environments, consider using a certificate from a trusted Certificate Authority (CA). - Ensure that the certificate and key files are correctly mounted in the Docker container as specified in the `docker-compose.yml` file. 
================================================ FILE: keep/providers/clickhouse_provider/__init__.py ================================================ ================================================ FILE: keep/providers/clickhouse_provider/clickhouse-secure/certs/server.crt ================================================ -----BEGIN CERTIFICATE----- MIIDfTCCAmWgAwIBAgIUH2I41CG75eMKCuXoLIza75/eX4swDQYJKoZIhvcNAQEL BQAwTjELMAkGA1UEBhMCVVMxDTALBgNVBAgMBFRlc3QxDTALBgNVBAcMBFRlc3Qx DTALBgNVBAoMBFRlc3QxEjAQBgNVBAMMCWxvY2FsaG9zdDAeFw0yNTAxMjgwNzQ4 MjVaFw0yNjAxMjgwNzQ4MjVaME4xCzAJBgNVBAYTAlVTMQ0wCwYDVQQIDARUZXN0 MQ0wCwYDVQQHDARUZXN0MQ0wCwYDVQQKDARUZXN0MRIwEAYDVQQDDAlsb2NhbGhv c3QwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCrGl0k3J93ug+iJxZz JwCLFt+QyJfEVAod/jb5coio9fDdODGOJFD2aiX9v+B1hSiHHSakKXYtNnCYJtKK HEur760qkWEDdg8PplmlaXXD14n6EUbcqEKfZNaeD4WQa/+cbCg6eOQRvM+YaBp9 ebQFbZL74H3YrQlExF3c9ImkTP7XzoPXSKpfb2HYPIxKBacbr2TsCHPKd5mFze3t +k/ttC4WVH4OAPkVZdJnR+lSSE0uTfK21+ZWpIcFlTi6zkNFjk4zuntpMcaTWo/L xPJG0MIb5RitFTR0U00Ukq5ah4IrTQNxVj+d4VF+rRs/kEV6+UYom+TJPLOPeDch JZmbAgMBAAGjUzBRMB0GA1UdDgQWBBT+4lIGAu+FMy72bHLGWPsgRcQzCDAfBgNV HSMEGDAWgBT+4lIGAu+FMy72bHLGWPsgRcQzCDAPBgNVHRMBAf8EBTADAQH/MA0G CSqGSIb3DQEBCwUAA4IBAQCQIWMIfMx8Rxa09yj6L0l0bTlifiWGcYKw+41WbXIM sNHYHbPv0hZrezD5A0lFZHknTNveBqh4KGq69QpilaRri09MR7YdzBOJtvttPz0N d42ZqJJAbjg5vhWSWO3nFjg3kxxK28/YIcrCxnWNIUuua+MwrT+io539VfJ5CmUP t+7+juizAzu+Tt1O/YHJopnjoZTFWQiaE2bj0bXm2MAPZF8ItujCOyM9RImUcAr1 0crgNapA0mZmIGgatb4V8OSAkS4+T4no3ScRbTTPjqCf8z9Hkq3M2EoZhADv+FLD 3qKobCwv0W/RmzGHM4vGHMKnZO48DZ85EC+puD6h8dbP -----END CERTIFICATE----- ================================================ FILE: keep/providers/clickhouse_provider/clickhouse-secure/certs/server.key ================================================ -----BEGIN PRIVATE KEY----- MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCrGl0k3J93ug+i JxZzJwCLFt+QyJfEVAod/jb5coio9fDdODGOJFD2aiX9v+B1hSiHHSakKXYtNnCY JtKKHEur760qkWEDdg8PplmlaXXD14n6EUbcqEKfZNaeD4WQa/+cbCg6eOQRvM+Y 
aBp9ebQFbZL74H3YrQlExF3c9ImkTP7XzoPXSKpfb2HYPIxKBacbr2TsCHPKd5mF ze3t+k/ttC4WVH4OAPkVZdJnR+lSSE0uTfK21+ZWpIcFlTi6zkNFjk4zuntpMcaT Wo/LxPJG0MIb5RitFTR0U00Ukq5ah4IrTQNxVj+d4VF+rRs/kEV6+UYom+TJPLOP eDchJZmbAgMBAAECggEAC4nA/QReDvfRqBChhFOXLZbCreoo+dWxw1xqODlCzlbP aEuRMLgLazwbPCWDrS+Bw4klGu4Roj9I9nZ5Vu2zi9bWXMfmIxKdNcpbXAeX9NEe SwOxPWrUG0v0gQu9tdB8MZmSWOcTRVlWWNbkVAPJ+14fpEz69fD6CAe2s98cDQoC JIhzbNf2HSgzAA85KcOx6iHpiQZOwhawHEfL31Vq5oHOPkAbIhGGtRNGZ3qivksS mFiumzHXg4LMbrs/QPbklsnIsGfxiRe0TIA2YOGJg6K52QEE+tI4XWNirnJQDuaH LNBNuqWgeBtPVjvIrc89z6OZLrarL83+EIfhvzpToQKBgQDqvDLENZj1kIQ1Kpcj HQuI9FKn0T9UzVDIVO2vOBJG2n6hH93Y++pozd4tmfuKF4BvPU9vCgITu2WTDXYy bCFbjYnnrO7LrI/UmrxdVSDl4CJcyBp/jVEhfuUvozXSTDUjUcV7Jginx5+tkox9 Vj/Pg+OjGT+zd12oe468TiYj0QKBgQC6mnS+SzbuopkwHaDMcGld3wimFpzJAMxe 80VUTEosIu+UEqdE6g67vFhk9UbeIJjZSHJwfz6PFxMSO+nlOBZShNLJr4EjeMOC HW32hwEOLNtUjk4FxL2HeK7EuIsFWFo+ftLc/EVWcR47sV8W+lxhgDsFe2nA0oza b4Ucqg0dqwKBgQClDm7YHyQOUG9WfztFOpA43iwcyvswYyrRoz56vf/ECLGQFLtH b2RWC6SWBjek03/BOKhZWP066MO00ntxWy1dljoJSUWkvBNrGN8o9corOh6PhTl0 xWbuGa+IfshCtsmKq14kiQr/B1SVlX3qSDKYdZIkxoVPabjW1wL4EC+rcQKBgBYx 1t7nbVI27seFTqHiYPX0WEABAob53FUS1FUxecUEJsDS8yhEOppjzZO8hMBY2jVF 466zw8obMX6Ct9A2upj4CWZJxK9mZsKsI28mIZ8BANluz6LqAq0BUrA9TvPEzX8P cJ8uNkUQ0UrCTxAZmTFTojGFu09e+7fjec6t/z9fAoGAQoSl3YkzIMKyMk0cDmAN cvIjqQkZpknKKNtVBMVrrj2ppONDX4lRbcynImDKZKg9+pc54im/IH5NkA8c+uZY wS4XNzVSXK4ZAH9CX/W4b7jQW1fQW3CRmtwNgqGF1HGPYG4U1Nl9U0NRFLYe8sQE 6IOZgHHz94uQ2/doDFVYzJU= -----END PRIVATE KEY----- ================================================ FILE: keep/providers/clickhouse_provider/clickhouse-secure/config.xml ================================================ trace 1 8123 9440 0 /certs/server.crt /certs/server.key none false true sslv2,sslv3 true 4096 3 100 8589934592 5368709120 /var/lib/clickhouse/ /var/lib/clickhouse/tmp/ /var/lib/clickhouse/user_files/ users.xml default default UTC false /clickhouse/task_queue/ddl ================================================ FILE: 
import dataclasses
import json
import typing
from urllib.parse import quote

import pydantic
import requests
from clickhouse_driver import connect
from clickhouse_driver.dbapi.extras import DictCursor

from keep.contextmanager.contextmanager import ContextManager
from keep.exceptions.provider_exception import ProviderException
from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin
from keep.providers.models.provider_config import ProviderConfig, ProviderScope
from keep.validation.fields import NoSchemeUrl, UrlPort

DEFAULT_TIMEOUT_SECONDS = 120  # Not to hang the thread forever, only for extreme cases


@pydantic.dataclasses.dataclass
class ClickhouseProviderAuthConfig:
    """Connection settings for the Clickhouse provider.

    ``protocol`` selects the transport: ``clickhouse``/``clickhouses`` use the
    native driver, ``http``/``https`` use the HTTP interface.
    """

    username: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Clickhouse username",
            "config_main_group": "authentication",
        },
    )
    password: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Clickhouse password",
            "sensitive": True,
            "config_main_group": "authentication",
        }
    )
    host: NoSchemeUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "Clickhouse hostname",
            "validation": "no_scheme_url",
            "config_main_group": "authentication",
        }
    )
    port: UrlPort = dataclasses.field(
        metadata={
            "required": True,
            "description": "Clickhouse port",
            "validation": "port",
            "config_main_group": "authentication",
        }
    )
    database: str | None = dataclasses.field(
        metadata={"required": False, "description": "Clickhouse database name"},
        default=None,
    )
    protocol: typing.Literal["clickhouse", "clickhouses", "http", "https"] = (
        dataclasses.field(
            default="clickhouse",
            metadata={
                "required": True,
                "description": "Protocol ('clickhouses' for SSL, 'clickhouse' for no SSL, 'http' or 'https')",
                "type": "select",
                "options": ["clickhouse", "clickhouses", "http", "https"],
                "config_main_group": "authentication",
            },
        )
    )
    verify: bool = dataclasses.field(
        metadata={
            "description": "Enable SSL verification",
            "hint": "SSL verification is enabled by default",
            "type": "switch",
            "config_main_group": "authentication",
        },
        default=True,
    )


class ClickhouseProvider(BaseProvider, ProviderHealthMixin):
    """Enrich alerts with data from Clickhouse."""

    PROVIDER_DISPLAY_NAME = "Clickhouse"
    PROVIDER_CATEGORY = ["Database"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="connect_to_server",
            description="The user can connect to the server",
            mandatory=True,
            alias="Connect to the server",
        )
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        self.client = None

    def validate_scopes(self):
        """
        Validates that the user has the required scopes to use the provider.

        Runs ``SHOW TABLES`` over the configured protocol.

        Returns:
            dict: ``{"connect_to_server": True}`` on success, otherwise the
                error message of the failure as the value.
        """
        try:
            if self._is_http_protocol():
                tables = self._execute_http_query("SHOW TABLES")
            else:
                tables = self._execute_native_query("SHOW TABLES")
            self.logger.info(f"Tables: {tables}")
            scopes = {
                "connect_to_server": True,
            }
        except Exception as e:
            self.logger.exception("Error validating scopes")
            scopes = {
                "connect_to_server": str(e),
            }
        return scopes

    def _is_http_protocol(self) -> bool:
        """Check if the protocol is HTTP-based."""
        return self.authentication_config.protocol in ["http", "https"]

    def __generate_client(self):
        """
        Generates a Clickhouse client for native protocol.

        Returns:
            clickhouse_driver.Connection: Clickhouse connection object

        Raises:
            ProviderException: if the configured protocol is HTTP-based.
        """
        if self._is_http_protocol():
            raise ProviderException("Cannot generate native client for HTTP protocol")

        # Credentials are embedded in a URL, so reserved characters
        # ('@', ':', '/', '%', ...) must be percent-encoded or the DSN is
        # parsed incorrectly.
        user = quote(self.authentication_config.username, safe="")
        password = quote(self.authentication_config.password, safe="")
        host = self.authentication_config.host
        database = self.authentication_config.database
        port = self.authentication_config.port
        protocol = self.authentication_config.protocol

        dsn = f"{protocol}://{user}:{password}@{host}:{port}"
        if database:
            dsn += f"/{database}"
        if self.authentication_config.verify is False:
            dsn += "?verify=false"

        return connect(
            dsn,
            connect_timeout=DEFAULT_TIMEOUT_SECONDS,
            send_receive_timeout=DEFAULT_TIMEOUT_SECONDS,
            sync_request_timeout=DEFAULT_TIMEOUT_SECONDS,
            verify=self.authentication_config.verify,
        )

    def _execute_native_query(self, query: str, cursor_factory=None) -> list:
        """
        Execute a query over the native protocol on a fresh connection.

        The cursor and the connection are closed even when the query fails.

        Args:
            query: SQL query to execute
            cursor_factory: optional cursor class (e.g. ``DictCursor``)

        Returns:
            list: rows returned by ``fetchall()``
        """
        client = self.__generate_client()
        try:
            if cursor_factory is None:
                cursor = client.cursor()
            else:
                cursor = client.cursor(cursor_factory=cursor_factory)
            try:
                cursor.execute(query)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            client.close()

    def _execute_http_query(self, query: str, params: dict | None = None) -> list:
        """
        Execute a query using HTTP protocol.

        Args:
            query: SQL query to execute
            params: Query parameters for formatting

        Returns:
            list: Query results

        Raises:
            ProviderException: if the server answers with a non-2xx status.
        """
        protocol = self.authentication_config.protocol
        host = self.authentication_config.host
        port = self.authentication_config.port
        database = self.authentication_config.database

        url = f"{protocol}://{host}:{port}"

        # Format query if parameters are provided
        if params:
            query = query.format(**params)

        # Prepare request parameters
        request_params = {"query": query, "default_format": "JSONEachRow"}
        if database:
            request_params["database"] = database

        # Make request with authentication
        response = requests.post(
            url,
            params=request_params,
            auth=(
                self.authentication_config.username,
                self.authentication_config.password,
            ),
            verify=self.authentication_config.verify,
            timeout=DEFAULT_TIMEOUT_SECONDS,
        )

        if not response.ok:
            raise ProviderException(f"HTTP query failed: {response.text}")

        # JSONEachRow: every non-empty line of the body is one JSON object
        return [
            json.loads(line) for line in response.text.strip().split("\n") if line
        ]

    def dispose(self):
        if not self._is_http_protocol() and self.client:
            try:
                self.client.close()
            except Exception:
                self.logger.exception("Error closing Clickhouse connection")

    def validate_config(self):
        """
        Validates required configuration for Clickhouse's provider.
        """
        self.authentication_config = ClickhouseProviderAuthConfig(
            **self.config.authentication
        )

    def _query(self, query="", single_row=False, **kwargs: dict) -> list | tuple:
        return self._notify(query=query, single_row=single_row, **kwargs)

    def _notify(self, query="", single_row=False, **kwargs: dict) -> list | tuple:
        """
        Executes a query against the Clickhouse database.

        Extra keyword arguments are substituted into ``query`` with
        ``str.format`` before execution.

        Returns:
            list | tuple: list of results or single result if single_row is True
        """
        if self._is_http_protocol():
            results = self._execute_http_query(query, kwargs)
        else:
            if kwargs:
                query = query.format(**kwargs)
            results = self._execute_native_query(query, cursor_factory=DictCursor)

        if single_row and results:
            return results[0]

        return results


if __name__ == "__main__":
    import os

    config = ProviderConfig(
        authentication={
            "username": os.environ.get("CLICKHOUSE_USER"),
            "password": os.environ.get("CLICKHOUSE_PASSWORD"),
            "host": os.environ.get("CLICKHOUSE_HOST"),
            "database": os.environ.get("CLICKHOUSE_DATABASE"),
            "port": os.environ.get("CLICKHOUSE_PORT"),
            "protocol": os.environ.get("CLICKHOUSE_PROTOCOL", "clickhouse"),
        }
    )
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )
    clickhouse_provider = ClickhouseProvider(context_manager, "clickhouse-prod", config)
    results = clickhouse_provider.query(
        query="SELECT * FROM Traces LIMIT 1",
        single_row=True,
    )
    print(results)
["mailing-app", "producers", "main-app", "core"], "Message.Threshold": [90, 80, 70, 95], }, }, "high_memory_usage": { "payload": { "Message": { "AlarmName": "HighMemoryUsage", "AlarmDescription": "Memory utilization is above 85% threshold", "MetricName": "MemoryUtilization", "Namespace": "AWS/ECS", "Threshold": 85, "ComparisonOperator": "GreaterThanOrEqualToThreshold", "Priority": "P2", } }, "parameters": { "Message.AlarmName": [ "HighMemoryUsage", "ContainerMemoryHigh", "ServiceMemoryAlert", ], "Message.AlarmDescription": [ "Memory utilization exceeded threshold", "Container using excessive memory", "Service memory usage is critical", ], "Message.Application": ["api-service", "cache-service", "worker-service"], "Message.Threshold": [85, 75, 90], }, }, "high_error_rate": { "payload": { "Message": { "AlarmName": "APIErrorRate", "AlarmDescription": "API error rate exceeds 5% threshold", "MetricName": "5XXError", "Namespace": "AWS/ApiGateway", "Threshold": 5, "ComparisonOperator": "GreaterThanThreshold", "Priority": "P1", } }, "parameters": { "Message.AlarmName": ["APIErrorRate", "ServiceErrors", "EndpointFailures"], "Message.AlarmDescription": [ "API error rate above normal levels", "Service experiencing high error count", "Critical endpoint failure detected", ], "Message.Application": ["payment-api", "user-service", "order-system"], "Message.Threshold": [5, 3, 1], }, }, } ================================================ FILE: keep/providers/cloudwatch_provider/cloudwatch_provider.py ================================================ """ CloudwatchProvider is a class that provides a way to read data from AWS Cloudwatch. 
""" import dataclasses import datetime import hashlib import json import logging import os import time import typing from typing import List from urllib.parse import urlparse import boto3 import pydantic import requests from keep.api.core.config import config as keep_config from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class CloudwatchProviderAuthConfig: region: str = dataclasses.field( metadata={ "required": True, "description": "AWS region", "senstive": False, }, ) access_key: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS access key (Leave empty if using IAM role at EC2)", "sensitive": True, }, ) access_key_secret: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS access key secret (Leave empty if using IAM role at EC2)", "sensitive": True, }, ) session_token: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS Session Token", "hint": "For temporary credentials. 
Note that if you connect CloudWatch with temporary credentials, the initial connection will succeed, but when the credentials expired alarms won't be sent to Keep.", "sensitive": True, }, ) cloudwatch_sns_topic: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS Cloudwatch SNS Topic [ARN or name]", "hint": "Default SNS Topic to send notifications (Optional since if your alarms already sends notifications to SNS topic, Keep will use the existing SNS topic)", "sensitive": False, }, ) protocol: typing.Literal["https", "http"] = dataclasses.field( default="https", metadata={ "required": True, "description": "Protocol to use for the webhook", "type": "select", "options": ["https", "http"], }, ) class CloudwatchProvider(BaseProvider, ProviderHealthMixin): """Push alarms from AWS Cloudwatch to Keep.""" PROVIDER_DISPLAY_NAME = "CloudWatch" PROVIDER_CATEGORY = ["Cloud Infrastructure", "Monitoring"] PROVIDER_SCOPES = [ ProviderScope( name="cloudwatch:DescribeAlarms", description="Required to retrieve information about alarms.", documentation_url="https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_DescribeAlarms.html", mandatory=True, alias="Describe Alarms", ), ProviderScope( name="cloudwatch:PutMetricAlarm", description="Required to update information about alarms. This mainly use to add Keep as an SNS action to the alarm.", documentation_url="https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricAlarm.html", mandatory=False, alias="Update Alarms", ), ProviderScope( name="sns:ListSubscriptionsByTopic", description="Required to list all subscriptions of a topic, so Keep will be able to add itself as a subscription.", documentation_url="https://docs.aws.amazon.com/sns/latest/dg/sns-access-policy-language-api-permissions-reference.html", mandatory=False, alias="List Subscriptions", ), ProviderScope( name="logs:GetQueryResults", description="Part of CloudWatchLogsReadOnlyAccess role. 
Required to retrieve the results of CloudWatch Logs Insights queries.", documentation_url="https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_GetQueryResults.html", mandatory=False, alias="Read Query results", ), ProviderScope( name="logs:DescribeQueries", description="Part of CloudWatchLogsReadOnlyAccess role. Required to describe the results of CloudWatch Logs Insights queries.", documentation_url="https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_DescribeQueries.html", mandatory=False, alias="Describe Query results", ), ProviderScope( name="logs:StartQuery", description="Part of CloudWatchLogsReadOnlyAccess role. Required to start CloudWatch Logs Insights queries.", documentation_url="https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_StartQuery.html", mandatory=False, alias="Start Logs Query", ), ProviderScope( name="iam:SimulatePrincipalPolicy", description="Allow Keep to test the scopes of the current user/role without modifying any resource.", documentation_url="https://docs.aws.amazon.com/IAM/latest/APIReference/API_SimulatePrincipalPolicy.html", mandatory=False, alias="Simulate IAM Policy", ), ] VALID_ALARM_KEYS = { "AlarmName", "AlarmDescription", "ActionsEnabled", "OKActions", "AlarmActions", "InsufficientDataActions", "MetricName", "Namespace", "Statistic", "ExtendedStatistic", "Dimensions", "Period", "Unit", "EvaluationPeriods", "DatapointsToAlarm", "Threshold", "ComparisonOperator", "TreatMissingData", "EvaluateLowSampleCountPercentile", "Metrics", "Tags", "ThresholdMetricId", } STATUS_MAP = { "ALARM": AlertStatus.FIRING, "OK": AlertStatus.RESOLVED, "INSUFFICIENT_DATA": AlertStatus.PENDING, } # CloudWatch doesn't have built-in severities SEVERITIES_MAP = {} def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.aws_client_type = None self._client = None self.disable_api_key = 
keep_config( "KEEP_CLOUDWATCH_DISABLE_API_KEY", default=False ) if self.disable_api_key: self.logger.info("API key is disabled for CloudWatch provider") def validate_scopes(self): # init the scopes as False scopes = {scope.name: False for scope in self.PROVIDER_SCOPES} # the scope name is the action actions = scopes.keys() # fetch the results try: sts_client = self.__generate_client("sts") identity = sts_client.get_caller_identity()["Arn"] iam_client = self.__generate_client("iam") except Exception as e: self.logger.exception( "Error validating AWS IAM scopes", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes = {s: str(e) for s in scopes.keys()} return scopes # 0. try to validate all scopes using simulate_principal_policy # if the user/role have permissions to simulate_principal_policy, we can validate the scopes easily try: iam_resp = iam_client.simulate_principal_policy( PolicySourceArn=identity, ActionNames=list(actions) ) scopes = { res.get("EvalActionName"): res.get("EvalDecision") == "allowed" for res in iam_resp.get("EvaluationResults") } scopes["iam:SimulatePrincipalPolicy"] = True if all(scopes.values()): self.logger.info( "All AWS IAM scopes are granted!", extra={ "scopes": scopes, "tenant_id": self.context_manager.tenant_id, }, ) return scopes # if not all the scopes are granted, we need to test them one by one else: self.logger.warning( "Some of the AWS IAM scopes are not granted, testing them one by one...", extra={ "scopes": scopes, "tenant_id": self.context_manager.tenant_id, }, ) # otherwise, we need to test them one by one except Exception: self.logger.exception( "Error validating AWS IAM scopes", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["iam:SimulatePrincipalPolicy"] = ( "No permissions to simulate_principal_policy (but its cool, its not a must)" ) self.logger.info("Validating aws cloudwatch scopes") # 1. 
validate describe alarms cloudwatch_client = self.__generate_client("cloudwatch") resp = None try: resp = cloudwatch_client.describe_alarms() scopes["cloudwatch:DescribeAlarms"] = True except Exception as e: self.logger.exception( "Error validating AWS cloudwatch:DescribeAlarms scope", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["cloudwatch:DescribeAlarms"] = str(e) # if we got the response, we can validate the other scopes if resp: # 2. validate put metric alarm try: alarms = resp.get("MetricAlarms", []) alarm = alarms[0] filtered_alarm = { k: v for k, v in alarm.items() if k in CloudwatchProvider.VALID_ALARM_KEYS } cloudwatch_client.put_metric_alarm(**filtered_alarm) scopes["cloudwatch:PutMetricAlarm"] = True except Exception as e: self.logger.exception( "Error validating AWS cloudwatch:PutMetricAlarm scope", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["cloudwatch:PutMetricAlarm"] = str(e) else: scopes["cloudwatch:PutMetricAlarm"] = ( "cloudwatch:DescribeAlarms scope is not granted, so we cannot validate cloudwatch:PutMetricAlarm scope" ) # 3. validate list subscriptions by topic if self.authentication_config.cloudwatch_sns_topic: try: sns_client = self.__generate_client("sns") sns_topic = self.authentication_config.cloudwatch_sns_topic if not sns_topic.startswith("arn:aws:sns"): account_id = self._get_account_id() sns_topic = f"arn:aws:sns:{self.authentication_config.region}:{account_id}:{self.authentication_config.cloudwatch_sns_topic}" sns_client.list_subscriptions_by_topic(TopicArn=sns_topic) scopes["sns:ListSubscriptionsByTopic"] = True except Exception as e: self.logger.exception( "Error validating AWS sns:ListSubscriptionsByTopic scope", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["sns:ListSubscriptionsByTopic"] = str(e) else: scopes["sns:ListSubscriptionsByTopic"] = ( "cloudwatch_sns_topic is not set, so we cannot validate sns:ListSubscriptionsByTopic scope" ) # 4. 
validate start query logs_client = self.__generate_client("logs") try: logs_client.start_query( logGroupName="keepTest", queryString="keepTest", startTime=int( ( datetime.datetime.today() - datetime.timedelta(hours=24) ).timestamp() ), endTime=int(datetime.datetime.now().timestamp()), ) scopes["logs:StartQuery"] = True except Exception as e: # that means that the user/role have the permissions but we've just made up the logGroupName which make sense if "ResourceNotFoundException" in str(e): self.logger.info( "AWS logs:StartQuery scope is not required", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["logs:StartQuery"] = True # other/wise the scope is false else: self.logger.info( "Error validating AWS logs:StartQuery scope", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["logs:StartQuery"] = str(e) query_id = False self.logger.info( "Validating AWS logs:DescribeQueries scope", extra={ "tenant_id": self.context_manager.tenant_id, }, ) try: query_id = logs_client.describe_queries().get("queries")[0]["queryId"] scopes["logs:DescribeQueries"] = True except Exception: self.logger.exception( "Error validating AWS logs:DescribeQueries scope", extra={ "tenant_id": self.context_manager.tenant_id, }, ) scopes["logs:DescribeQueries"] = ( "Could not validate logs:GetQueryResults scope without logs:DescribeQueries, so assuming the scope is not granted." ) self.logger.info( "Validating AWS logs:StartQuery scope", extra={ "tenant_id": self.context_manager.tenant_id, }, ) if query_id: try: logs_client.get_query_results(queryId=query_id) scopes["logs:StartQuery"] = True except Exception as e: self.logger.exception( "Error validating AWS logs:StartQuery scope", extra={"tenant_id": self.context_manager.tenant_id}, ) scopes["logs:StartQuery"] = str(e) else: scopes["logs:StartQuery"] = ( "Could not validate logs:StartQuery scope without logs:DescribeQueries, so assuming the scope is not granted." ) # 5. 
validate get query results self.logger.info( "Validating AWS logs:GetQueryResults scope", extra={ "tenant_id": self.context_manager.tenant_id, }, ) if query_id: try: logs_client.get_query_results(queryId=query_id) scopes["logs:GetQueryResults"] = True except Exception as e: self.logger.exception("Error validating AWS logs:GetQueryResults scope") scopes["logs:GetQueryResults"] = str(e) else: scopes["logs:DescribeQueries"] = ( "Could not validate logs:GetQueryResults scope without logs:DescribeQueries, so assuming the scope is not granted." ) # Finally return scopes @property def client(self): if self._client is None: self.client = self.__generate_client(self.aws_client_type) return self._client def _query( self, log_group: str = None, log_groups: List[str] | None = None, remove_ptr_from_results=False, query: str = None, hours: int = 24, **kwargs: dict, ) -> dict: # log_group = kwargs.get("log_group") # query = kwargs.get("query") # hours = kwargs.get("hours", 24) logs_client = self.__generate_client("logs") try: query_kwargs = { "queryString": query, "startTime": int( ( datetime.datetime.today() - datetime.timedelta(hours=hours) ).timestamp() ), "endTime": int(datetime.datetime.now().timestamp()), } if log_group is not None: query_kwargs["logGroupName"] = log_group if log_groups is not None: query_kwargs["logGroupNames"] = log_groups start_query_response = logs_client.start_query(**query_kwargs) except Exception as e: self.logger.exception( f"Error starting AWS cloudwatch query - add logs:StartQuery permissions, {e}", extra={"kwargs": kwargs}, ) raise query_id = start_query_response["queryId"] response = None while response is None or response["status"] == "Running": self.logger.debug("Waiting for AWS cloudwatch query to complete...") time.sleep(1) response = logs_client.get_query_results(queryId=query_id) # Response in format List[{field: fieldName, value: fieldValue}] # We need to convert it to List[Dict[fieldName: fieldValue]] results = [] for result in 
response.get("results", []): results.append({field["field"]: field["value"] for field in result}) # Trying to parse JSON of each field["value"] for field in results[-1]: try: results[-1][field] = json.loads(results[-1][field]) except json.JSONDecodeError: pass if remove_ptr_from_results: results[-1].pop("@ptr", None) return results def _get_account_id(self): sts_client = self.__generate_client("sts") identity = sts_client.get_caller_identity() return identity["Account"] def __generate_client(self, aws_client_type: str): if self.authentication_config.session_token: self.logger.info("Using temporary credentials") client = boto3.client( aws_client_type, aws_access_key_id=self.authentication_config.access_key, aws_secret_access_key=self.authentication_config.access_key_secret, aws_session_token=self.authentication_config.session_token, region_name=self.authentication_config.region, ) else: client = boto3.client( aws_client_type, aws_access_key_id=self.authentication_config.access_key, aws_secret_access_key=self.authentication_config.access_key_secret, region_name=self.authentication_config.region, ) return client def dispose(self): try: self.client.close() except Exception: self.logger.exception("Error closing boto3 connection") def validate_config(self): self.authentication_config = CloudwatchProviderAuthConfig( **self.config.authentication ) def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): # first, list all Cloudwatch alarms self.logger.info("Setting up webhook with url %s", keep_api_url) cloudwatch_client = self.__generate_client("cloudwatch") sns_client = self.__generate_client("sns") resp = cloudwatch_client.describe_alarms() alarms = resp.get("MetricAlarms", []) alarms.extend(resp.get("CompositeAlarms")) subscribed_topics = [] # for each alarm, we need to iterate the actions topics and subscribe to them for alarm in alarms: actions = alarm.get("AlarmActions", []) # extract only SNS actions topics = [action for 
action in actions if action.startswith("arn:aws:sns")] # if we got explicitly SNS topic, add it as an action if self.authentication_config.cloudwatch_sns_topic: self.logger.warning( "Cannot hook alarm without SNS topic, trying to add SNS action..." ) # add an action to the alarm if not self.authentication_config.cloudwatch_sns_topic.startswith( "arn:aws:sns" ): account_id = self._get_account_id() sns_topic = f"arn:aws:sns:{self.authentication_config.region}:{account_id}:{self.authentication_config.cloudwatch_sns_topic}" else: sns_topic = self.authentication_config.cloudwatch_sns_topic actions.append(sns_topic) # if the alarm already has the SNS topic as action, we don't need to add it again if sns_topic in actions: self.logger.info( "SNS action already added to alarm %s, skipping...", alarm.get("AlarmName"), ) else: self.logger.info( "Adding SNS action to alarm %s...", alarm.get("AlarmName") ) try: alarm["AlarmActions"] = actions # filter out irrelevant files filtered_alarm = { k: v for k, v in alarm.items() if k in CloudwatchProvider.VALID_ALARM_KEYS } cloudwatch_client.put_metric_alarm(**filtered_alarm) # now it should contain the SNS topic topics = [sns_topic] except Exception: self.logger.exception( "Error adding SNS action to alarm %s", alarm.get("AlarmName"), ) continue self.logger.info( "SNS action added to alarm %s!", alarm.get("AlarmName") ) for topic in topics: # protection against adding ourself more than once to the same topic (can happen if different alarams send to the same topic) if topic in subscribed_topics: self.logger.info( "Already subscribed to topic %s in this transaction, skipping...", topic, ) continue self.logger.info("Checking topic %s...", topic) try: subscriptions = sns_client.list_subscriptions_by_topic( TopicArn=topic ).get("Subscriptions", []) # this means someone deleted the topic that this alarm sends notification too except Exception as exc: self.logger.warning( "Topic %s not found, skipping...", topic, exc_info=exc ) continue 
hostname = urlparse(keep_api_url).hostname already_subscribed = any( hostname in sub["Endpoint"] and not sub["SubscriptionArn"] == "PendingConfirmation" for sub in subscriptions ) if not already_subscribed: # for self-hosted Keep, sometimes api_key should be disabled if self.disable_api_key: self.logger.info("API key is disabled, using the url as is") url_with_api_key = keep_api_url + "&tenant_id=" + tenant_id else: if self.authentication_config.protocol == "https": url_with_api_key = keep_api_url.replace( "https://", f"https://api_key:{api_key}@" ) else: url_with_api_key = keep_api_url.replace( "http://", f"http://api_key:{api_key}@" ) self.logger.info("Subscribing to topic %s...", topic) sns_client.subscribe( TopicArn=topic, Protocol=self.authentication_config.protocol, Endpoint=url_with_api_key, ) self.logger.info("Subscribed to topic %s!", topic) subscribed_topics.append(topic) # we need to subscribe to only one SNS topic per alarm, o/w we will get many duplicates break else: self.logger.info( "Already subscribed to topic %s, skipping...", topic ) self.logger.info("Webhook setup completed!") @staticmethod def parse_event_raw_body(raw_body: bytes | dict) -> dict: if isinstance(raw_body, dict): return raw_body return json.loads(raw_body) @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: logger = logging.getLogger(__name__) # if its confirmation event, we need to confirm the subscription if event.get("Type") == "SubscriptionConfirmation": # TODO - do we want to keep it in the db somehow? # do we want to validate that the tenant id exist? 
@classmethod
def simulate_alert(cls) -> dict:
    """
    Build a random CloudWatch-style webhook payload from the mock catalogue.

    A random entry of `ALERTS` is picked, its `payload` is copied, one
    consistent index is used to pick a value for every entry in
    `parameters` (dotted names address nested keys), and
    `Message.StateChangeTime` is set to the current time.

    Returns:
        dict: The payload with every top-level value JSON-encoded as a string.
    """
    import copy
    import random

    from keep.providers.cloudwatch_provider.alerts_mock import ALERTS

    # Choose a random alert type
    alert_type = random.choice(list(ALERTS.keys()))
    alert_data = ALERTS[alert_type]

    # Deep copy: the nested writes below must not leak into the shared,
    # module-level mock definitions.
    simulated_alert = copy.deepcopy(alert_data["payload"])

    parameters = alert_data.get("parameters") or {}
    if parameters:
        # Shortest choice list bounds the index so every parameter has a value.
        min_choices_len = min(len(choices) for choices in parameters.values())
        if min_choices_len > 0:
            # Same index for all parameters keeps related values aligned.
            param_index = random.randrange(min_choices_len)
            for param, choices in parameters.items():
                # Split param on '.' for nested parameters (if any)
                *parents, leaf = param.split(".")
                target = simulated_alert
                for part in parents:
                    target = target.setdefault(part, {})
                target[leaf] = choices[param_index]

    # Set StateChangeTime to current time
    simulated_alert["Message"]["StateChangeTime"] = datetime.datetime.now().isoformat()

    # Provider expects all values as JSON strings
    return {key: json.dumps(value) for key, value in simulated_alert.items()}
super().__init__(context_manager, provider_id, config) def validate_config(self): # No configuration to validate, so just do nothing. # For example, this could be the place where you validate that the expected keys are present in the configuration. # e.g. if "pagerduty_api_key" is not present in self.config.authentication pass def dispose(self): # No need to dispose of anything, so just do nothing. pass def _query( self, message: str = "", logger: bool = False, severity: str = "info", **kwargs, # TODO: remove '**kwargs', when we will pop it from the notify method in the base provider ): return self._notify(message, logger, severity) def _notify( self, message: str = "", logger: bool = False, severity: str = "info", **kwargs, # TODO: remove '**kwargs', when we will pop it from the notify method in the base provider ): """ Output alert message simply using the print method. Args: message (str): The message to be printed in to the console logger (bool): Whether to use the logger or not severity (str): The severity of the message if logger is True """ self.logger.debug("Outputting alert message to console") if logger: try: getattr(self.logger, severity)(message) except AttributeError: self.logger.error(f"Invalid log level {severity}") # default to print print(message) # use print else: print(message) self.logger.debug("Alert message outputted to console") return message if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Initalize the provider and provider config config = { "description": "Console Output Provider", "authentication": {}, } provider = ProvidersFactory.get_provider( context_manager, provider_id="mock", provider_type="console", provider_config=config, ) provider.notify( alert_message="Simple alert showing context with name: John Doe", logger=True, severity="critical", ) 
================================================ FILE: keep/providers/coralogix_provider/__init__.py ================================================ ================================================ FILE: keep/providers/coralogix_provider/alerts_mock.py ================================================ ALERTS = { "uuid": "36fa9188-d097-439f-aad8-eb492169ac87", "alert_id": "49a4afb5-9231-418d-94ea-3a7546779658", "name": "Keep", "description": "This is a test alert", "threshold": "0", "timewindow": "10", "group_by_labels": "[]", "alert_action": "trigger", "alert_url": "https://ezhil.app.coralogix.in/#/insights?id=a6a74b7a-0d04-4806-9cf9-2d2c65ce444b", "log_url": "https://ezhil.app.coralogix.in/#/query-new/logs?id=uQAA2PuhsqCRkFj8HsWufQ", "icon_url": "https://dashboard.coralogix.com/assets/invite.png", "service": "$SERVICE", "duration": "$DURATION", "errors": "$ERRORS", "spans": "$SPANS", "fields": [ { "key": "team", "value": "ezhil" }, { "key": "application", "value": "*insert desired application name*" }, { "key": "subsystem", "value": "*insert desired subsystem name*" }, { "key": "severity", "value": "ERROR" }, { "key": "priority", "value": "P2" }, { "key": "severityLowercase", "value": "error" }, { "key": "computer", "value": "*insert computer name*" }, { "key": "ipAddress", "value": "Multiple IPs" }, { "key": "timestamp", "value": "2024/08/14 21:28:56 GMT" }, { "key": "hitCount", "value": "3" }, { "key": "text", "value": "this is a normal text message" }, { "key": "Custom field", "value": "$JSON_KEY" }, { "key": "Group-by Field1", "value": "$GROUP_BY_FIELD_1" }, { "key": "Group-by Value1", "value": "$GROUP_BY_VALUE_1" }, { "key": "Group-by Field2", "value": "$GROUP_BY_FIELD_2" }, { "key": "Group-by Value2", "value": "$GROUP_BY_VALUE_2" }, { "key": "metricKey", "value": "$METRIC_KEY" }, { "key": "metricOperator", "value": "$METRIC_OPERATOR" }, { "key": "timeframe", "value": "$TIMEFRAME" }, { "key": "timeframePercentageOverThreshold", "value": 
"$TIMEFRAME_OVER_THRESHOLD" }, { "key": "metricCriteria", "value": "$METRIC_CRITERIA" }, { "key": "ratioQueryOne", "value": "$RATIO_QUERY_ONE" }, { "key": "ratioQueryTwo", "value": "$RATIO_QUERY_TWO" }, { "key": "ratioTimeframe", "value": "$RATIO_TIMEFRAME" }, { "key": "ratioGroupByKeys", "value": "$RATIO_GROUP_BY_KEYS" }, { "key": "ratioGroupByTable", "value": "$RATIO_GROUP_BY_TABLE" }, { "key": "uniqueCountValuesList", "value": "$UNIQUE_COUNT_VALUES_LIST" }, { "key": "newValueTrackedKey", "value": "$NEW_VALUE_TRACKED_KEY" }, { "key": "metaLabels", "value": "alert_type:security" }, { "key": "timestampMs", "value": 1723670936254 }, { "key": "timestampISO", "value": "2024-08-14T21:28:56.254Z" }, { "key": "threadId", "value": "null" }, { "key": "category", "value": "null" }, { "key": "queryText", "value": "" }, { "key": "definedRatioThreshold", "value": "$DEFINED_RATIO_THRESHOLD" }, { "key": "metaLabelsJson", "value": "{\"alert_type\":\"security\"}" }, { "key": "metaLabelsList", "value": [ "alert_type:security" ] }, { "key": "opsgeniePriority", "value": "P2" }, { "key": "companyId", "value": "1010757" }, { "key": "dedupKey", "value": "2980ce54addeaebc580fdf3b787ddf26bd11ffd7980d6ba793127135d41d2d63" }, { "key": "alertUniqueIdentifier", "value": "780c892f-f4db-43cb-a833-5f13a5523e96" }, { "key": "relativeQueryText", "value": "$RELATIVE_QUERY_TEXT" }, { "key": "actualRatio", "value": "$ACTUAL_RATIO" }, { "key": "relativeHitCount", "value": "$RELATIVE_HIT_COUNT" }, { "key": "ratioQueryOne", "value": "$RATIO_QUERY_ONE" }, { "key": "ratioQueryTwo", "value": "$RATIO_QUERY_TWO" }, { "key": "ratioTimeframe", "value": "$RATIO_TIMEFRAME" }, { "key": "ratioGroupByKeys", "value": "$RATIO_GROUP_BY_KEYS" }, { "key": "ratioGroupByTable", "value": "$RATIO_GROUP_BY_TABLE" }, { "key": "flowAlertRelatedAlerts", "value": "$FLOW_ALERT_RELATED_ALERTS" }, { "key": "alertGroupByValues", "value": "$ALERT_GROUP_BY_VALUES" } ] } ================================================ FILE: 
keep/providers/coralogix_provider/coralogix_provider.py ================================================ """ Coralogix is a modern observability platform delivers comprehensive visibility into all your logs, metrics, traces and security events with end-to-end monitoring. """ import json from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class CoralogixProvider(BaseProvider): """Get alerts from Coralogix into Keep.""" webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from Coralogix to Keep, Use the following webhook url to configure Coralogix send alerts to Keep: 1. From the Coralogix toolbar, navigate to Data Flow > Outbound Webhooks. 2. In the Outbound Webhooks section, click Generic Webhook. 3. Click Add New. 4. Enter a webhook name and set the URL to {keep_webhook_api_url}. 5. Select HTTP method (POST). 6. Add a request header with the key "x-api-key" and the value as {api_key}. 7. Edit the body of the messages that will be sent when the webhook is triggered (optional). 8. Save the configuration. 
""" SEVERITIES_MAP = { "debug": AlertSeverity.LOW, "verbose": AlertSeverity.LOW, "info": AlertSeverity.INFO, "warn": AlertSeverity.WARNING, "error": AlertSeverity.HIGH, "critical": AlertSeverity.CRITICAL, } PRIORTY_TO_SEVERITY_MAP = { "P1": AlertSeverity.CRITICAL, "P2": AlertSeverity.HIGH, "P3": AlertSeverity.WARNING, "P4": AlertSeverity.INFO, "P5": AlertSeverity.LOW, } STATUS_MAP = { "resolve": AlertStatus.RESOLVED, "trigger": AlertStatus.FIRING, } PROVIDER_DISPLAY_NAME = "Coralogix" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] FINGERPRINT_FIELDS = ["alertUniqueIdentifier"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): """ Validates required configuration for Coralogix's provider. """ # no config pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: fields_list = event["fields"] if "fields" in event else [] fields = {item["key"]: item["value"] for item in fields_list} labels = fields.get("text", fields.get("labels", {})) if isinstance(labels, str): try: labels = json.loads(labels) except Exception: # Do nothing, keep labels as str pass severity = AlertSeverity.INFO if "severityLowercase" in fields: severity = CoralogixProvider.SEVERITIES_MAP.get( fields.get("severityLowercase", "info") ) elif "priority" in fields: severity = CoralogixProvider.PRIORTY_TO_SEVERITY_MAP.get( fields.get("priority", "P5") ) alert = AlertDto( id=fields.get("alertUniqueIdentifier"), alert_id=event["alert_id"] if "alert_id" in event else None, name=event["name"] if "name" in event else None, description=event["description"] if "description" in event else None, status=CoralogixProvider.STATUS_MAP.get(event["alert_action"]), severity=severity, lastReceived=fields.get("timestampISO"), alertUniqueIdentifier=fields.get("alertUniqueIdentifier"), uuid=event["uuid"] if "uuid" in event else None, 
threshold=event["threshold"] if "threshold" in event else None, timewindow=event["timewindow"] if "timewindow" in event else None, group_by_labels=fields.get("group_by_labels"), alert_url=event["alert_url"] if "alert_url" in event else None, log_url=event["log_url"] if "log_url" in event else None, team=fields.get("team"), priority=fields.get("priority"), computer=fields.get("computer"), fields=fields, labels=labels if isinstance(labels, dict) else {}, source=["coralogix"], ) return alert if __name__ == "__main__": pass ================================================ FILE: keep/providers/dash0_provider/__init__.py ================================================ ================================================ FILE: keep/providers/dash0_provider/alerts_mock.py ================================================ ALERTS = { "type": "alert.resolved", "data": { "issue": { "id": "b9a9da0b-7a79-4a1d-abf3-5cd07649e80a", "issueIdentifier": "6820705469291328438", "dataset": "default", "start": "2025-02-03T07:17:17.474101621Z", "end": "2025-02-03T07:24:17.474101621Z", "status": "resolved", "summary": "This is a summay", "description": "This is a description", "labels": [ { "key": "service.name", "value": { "stringValue": "my-first-observable-service" } }, { "key": "dash0.resource.name", "value": { "stringValue": "my-first-observable-service" } } ], "annotations": [], "checkrules": [ { "id": "97daff98-e694-421d-abda-d53b23ccfd41", "version": 1, "name": "New Check Rule", "expression": "increase({otel_metric_name = \"dash0.logs\"}[5m]) >= $__threshold", "thresholds": { "degraded": 1, "failed": 5 }, "interval": "1m0s", "for": "0s", "keepFiringFor": "0s", "summary": "This is a summay", "description": "This is a description", "labels": {}, "annotations": {}, "url": 
"https://app.dash0.com/alerting/check-rules?org=477cb1f5-90ca-404e-8533-7a1907b58669&s=eJxljU0OwiAUhO_y1sW-0tYKB_AA6sodhYcSsU34WTXcXerKxOXMN19mgxbkBjasb5DAkY8MOcP-hpPsuEQ8IOIdGkjrH-fihxuVVKRUR4asyj5BaaBVnkJyy6PVT9IvFrKnuP9946WmK3nSya3L3jpTdTEZZa04MTqKgQ28M0zNRjEz9jPvtbZm6Oqfi-fsfdSBqLopZCqlfADAkT0J" } ], "url": "https://app.dash0.com/alerting/failed-checks?org=477cb1f5-90ca-404e-8533-7a1907b58669&s=eJxlT71uwyAQfhfmEJ8xNoY36NIuVYduhzlaFMdUgNMh8rsXqg6Vst3p-7-zjpk78ylemWECxMhBcBheQZleGIAzALyzEyvxARf6H-6wYKZSSY487mthx4l1uFIqYfvoPIaVHF8-abnklpiDI4upnSHnnZ5clVqN2iFYrlBpLrF3HK0f-Lg4UJPUNAPWrD8BbSX4QNWDTbMABaOctND9IGY5zK1zuNKLf46NtmAJcXvcIE2vzlLJ341oKyHeKN0CfbcBjkotnt_aW5vGL6oWJe10HMcPHhhbwg%3D%3D" } } } ================================================ FILE: keep/providers/dash0_provider/dash0_provider.py ================================================ """ Dash0 Provider allows to receive alerts from Dash0 using Webhook. """ from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class Dash0Provider(BaseProvider): """ Get alerts from Dash0 into Keep. """ webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from Dash0 to Keep, Use the following webhook url to configure Dash0 send alerts to Keep: 1. In Dash0, go to Organization settings. 2. Go to Notification Channels and create a New notification channel with type Webhook. 3. Give a name to the notification channel and use {keep_webhook_api_url} as the URL. 4. Add a request header with the key "x-api-key" and the value as {api_key}. 5. Save the configuration. 6. 
Go to Notifications under Alerting in the left sidebar and create a New notification rule if required or change the Notification channel to webhook created in step 3 for an existing Notification Rule. 7. Go to Checks under Alerting in the left sidebar and create a New Check Rule according to your requirements and assign the Notification Rule. """ STATUS_MAP = { "critical": AlertStatus.FIRING, "degraded": AlertStatus.FIRING, "resolved": AlertStatus.RESOLVED, } # Dash0 doesn't have severity levels, so we map status to severity levels manually. SEVERITIES_MAP = { "critical": AlertSeverity.CRITICAL, "degraded": AlertSeverity.WARNING, "resolved": AlertSeverity.INFO, } PROVIDER_DISPLAY_NAME = "Dash0" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): """ Validates required configuration for Dash0's provider. 
""" pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: data = event.get("data") issue = data.get("issue") alert = AlertDto( id=issue.get("id"), name=issue.get("summary", "Could not fetch summary"), type=event.get("type", "Could not fetch type"), description=issue.get("description", "Could not fetch description"), summary=issue.get("summary", "Could not fetch summary"), url=issue.get("url", "https://could-not-find-url"), status=Dash0Provider.STATUS_MAP.get( issue.get("status"), AlertStatus.FIRING ), severity=Dash0Provider.SEVERITIES_MAP.get( issue.get("status"), AlertSeverity.CRITICAL ), lastReceived=issue.get("end", issue.get("start")), startedAt=issue.get("start", issue.get("end")), labels=issue.get("labels", []), checkrules=issue.get("checkrules", []), source=["dash0"], ) return alert if __name__ == "__main__": pass ================================================ FILE: keep/providers/databend_provider/README.md ================================================ ## Databend Setup using Docker 1. Run the following command to start a Databend container. ```bash docker run \ -p 8000:8000 \ -e QUERY_DEFAULT_USER=databend \ -e QUERY_DEFAULT_PASSWORD=databend \ datafuselabs/databend ``` ================================================ FILE: keep/providers/databend_provider/__init__.py ================================================ ================================================ FILE: keep/providers/databend_provider/databend_provider.py ================================================ """ DatabendProvider is a class that provides a way to interact with Databend. 
def validate_scopes(self):
    """
    Validates that the user has the required scopes to use the provider.

    Sends `SELECT 1` to the `/v1/query` endpoint with the configured
    credentials.

    Returns:
        dict: `{"connect_to_server": True}` when the server answers 200,
        otherwise `{"connect_to_server": "<error text>"}`.
    """
    try:
        response = requests.post(
            # str() so urljoin also works if host_url is a URL object
            # rather than a plain string.
            urljoin(str(self.authentication_config.host_url), "/v1/query"),
            headers=self.generate_auth_headers(),
            json={"sql": "SELECT 1"},
            # Without a timeout an unreachable host blocks validation forever.
            timeout=10,
        )
        response.raise_for_status()
        if response.status_code != 200:
            # raise_for_status() only raises for 4xx/5xx; anything else
            # that is not a plain 200 is still not a validated connection.
            return {
                "connect_to_server": f"Unexpected status code {response.status_code}"
            }

        # The body is only logged; a non-JSON body must not turn a
        # successful connection into a failed scope.
        try:
            payload = response.json()
        except ValueError:
            payload = None
        self.logger.info(
            "Successfully validated scopes", extra={"response": payload}
        )
        return {"connect_to_server": True}
    except Exception as e:
        self.logger.exception("Failed to validate scopes", extra={"error": str(e)})
        return {"connect_to_server": str(e)}
""" credentials = f"{self.authentication_config.username}:{self.authentication_config.password}".encode("utf-8") encoded_credentials = base64.b64encode(credentials).decode("utf-8") return { "Authorization": f"Basic {encoded_credentials}", "Content-Type": "application/json", } def dispose(self): pass def validate_config(self): """ Validates required configuration fields for Databend provider. """ self.authentication_config = DatabendProviderAuthConfig( **self.config.authentication ) def _query(self, query=""): """ Executes a query on Databend. """ response = requests.post( urljoin(self.authentication_config.host_url, "/v1/query"), headers=self.generate_auth_headers(), json={"sql": query}, ) try: response.raise_for_status() return response.json() except Exception as e: self.logger.exception("Failed to execute query", extra={"error": str(e)}) raise Exception("Failed to execute query") if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = ProviderConfig( description="Databend Provider", authentication={ "host_url": os.environ.get("DATABEND_HOST_URL"), "username": os.environ.get("DATABEND_USERNAME"), "password": os.environ.get("DATABEND_PASSWORD"), } ) databend_provider = DatabendProvider(context_manager, "databend", config) result = databend_provider._query("SELECT avg(number) FROM numbers(100000000)") print(result) ================================================ FILE: keep/providers/datadog_provider/__init__.py ================================================ ================================================ FILE: keep/providers/datadog_provider/alerts_mock.py ================================================ ALERTS = { "high_cpu_usage": { "payload": { "title": "High CPU Usage", "type": "metric alert", "query": "avg(last_5m):avg:system.cpu.user{*} by {host} > 90", "message": "CPU usage is over 90% on 
{{host.name}}.", "description": "CPU usage is over 90% on {{host.name}}.", "tags": "environment:production, team:backend", "priority": "P3", "monitor_id": "1234567890", "scopes": [], }, "parameters": { "tags": [ "environment:production,team:backend,monitor,service:api", "environment:staging,team:backend,monitor,service:api", ], "priority": ["P2", "P3", "P4"], "scopes": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, "renders": { "host.name": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, }, "low_disk_space": { "payload": { "title": "Low Disk Space", "type": "metric alert", "query": "avg(last_1h):min:system.disk.free{*} by {host} < 20", "message": "Disk space is below 20% on {{host.name}}.", "description": "Disk space is below 20% on {{host.name}}.", "tags": "environment:production,team:database", "priority": 4, "monitor_id": "1234567891", "scopes": [], }, "parameters": { "tags": [ "environment:production,team:analytics,monitor,service:api", "environment:staging,team:database,monitor,service:api", ], "priority": ["P1", "P3", "P4"], "scopes": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, "renders": { "host.name": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, }, "mq_consumer_struggling": { "payload": { "title": "MQ Consumer Is Struggling", "type": "metric alert", "query": "avg(last_1h):min:mq_processing{*} by {host} < 10", "message": "MQ Consumer is processing less than 10 messages per second on {{host.name}}.", "description": "MQ Consumer is processing less than 10 messages per second on {{host.name}}.", "tags": "environment:production,team:database", "priority": 4, "monitor_id": 
class Thresholds(BaseModel):
    """Threshold values of a monitor, one per state."""

    critical: float
    critical_recovery: float
    ok: float
    warning: float
    warning_recovery: float
    unknown: float


class EvaluationWindow(BaseModel):
    """Start markers of the evaluation window."""

    day_starts: str
    hour_starts: int
    month_starts: int


class SchedulingOptions(BaseModel):
    """Scheduling settings; currently only the evaluation window."""

    evaluation_window: EvaluationWindow


class ThresholdWindows(BaseModel):
    """Recovery and trigger windows."""

    recovery_window: str
    trigger_window: str


class DatadogOptions(BaseModel):
    """The `options` object of a Datadog monitor description."""

    enable_logs_sample: bool
    enable_samples: bool
    escalation_message: str
    evaluation_delay: int
    group_retention_duration: str
    # NOTE(review): looks like a typo for `groupby_simple_monitor`; left
    # unchanged because the field name is part of the model's interface.
    grouby_simple_monitor: bool
    include_tags: bool
    locked: bool
    min_failure_duration: int
    min_location_failed: int
    new_group_delay: int
    new_host_delay: int
    no_data_timeframe: int
    notification_preset_name: Literal[
        "show_all", "hide_query", "hide_handles", "hide_all"
    ]
    notify_audit: bool
    notify_by: list[str]
    notify_no_data: bool
    on_missing_data: Literal[
        "default", "show_no_data", "show_and_notify_no_data", "resolve"
    ]
    renotify_interval: int
    renotify_occurrences: int
    renotify_statuses: list[str]
    require_full_window: bool
    # NOTE(review): looks like a typo for `scheduling_options`; left
    # unchanged because the field name is part of the model's interface.
    cheduling_options: SchedulingOptions
    silenced: dict
    threshold_windows: ThresholdWindows
    # thresholds: Thresholds
    timeout_h: int


class DatadogAlertFormatDescription(BaseModel):
    """Description of the monitor definition used to create a Datadog alert."""

    message: str = Field(
        ..., description="A message to include with notifications for this monitor."
    )
    name: str = Field(..., description="The name of the monitor.")
    options: DatadogOptions
    # `ge`/`le` are the constraint keywords pydantic enforces; `min`/`max`
    # were only carried along as extra schema keys and never validated.
    priority: int = Field(..., description="The priority of the monitor.", ge=1, le=5)
    # `...` already marks the field as required; `required=True` is kept
    # only so the generated schema stays as it was.
    query: str = Field(..., description="The query to monitor.", required=True)
    tags: list[str]
    type: Literal[
        "composite",
        "event alert",
        "log alert",
        "metric alert",
        "process alert",
        "query alert",
        "rum alert",
        "service check",
        "synthetics alert",
        "trace-analytics alert",
        "slo alert",
        "event-v2 alert",
        "audit alert",
        "ci-pipelines alert",
        "ci-tests alert",
        "error-tracking alert",
    ]

    class Config:
        schema_extra = {
            "example": {
                "name": "Example-Monitor",
                "type": "rum alert",
                "query": 'formula("query2 / query1 * 100").last("15m") >= 0.8',
                "message": "some message Notify: @hipchat-channel",
                "tags": ["test:examplemonitor", "env:ci"],
                "priority": 3,
                "options": {
                    "thresholds": {"critical": 0.8},
                    "variables": [
                        {
                            "data_source": "rum",
                            "name": "query2",
                            "search": {"query": ""},
                            "indexes": ["*"],
                            "compute": {"aggregation": "count"},
                            "group_by": [],
                        },
                        {
                            "data_source": "rum",
                            "name": "query1",
                            "search": {"query": "status:error"},
                            "indexes": ["*"],
                            "compute": {"aggregation": "count"},
                            "group_by": [],
                        },
                    ],
                },
            }
        }
""" import dataclasses import datetime import json import logging import os import re import time from collections import defaultdict from dataclasses import asdict from typing import List, Literal, Optional import pydantic import requests from datadog_api_client import ApiClient, Configuration from datadog_api_client.api_client import Endpoint from datadog_api_client.exceptions import ( ApiException, ApiValueError, ForbiddenException, NotFoundException, ) from datadog_api_client.v1.api.logs_api import LogsApi from datadog_api_client.v1.api.metrics_api import MetricsApi from datadog_api_client.v1.api.monitors_api import MonitorsApi from datadog_api_client.v1.api.webhooks_integration_api import WebhooksIntegrationApi from datadog_api_client.v1.model.monitor import Monitor from datadog_api_client.v1.model.monitor_options import MonitorOptions from datadog_api_client.v1.model.monitor_thresholds import MonitorThresholds from datadog_api_client.v1.model.monitor_type import MonitorType # from datadog_api_client.v1.api.events_api import EventsApi from datadog_api_client.v2.api.events_api import EventsApi from datadog_api_client.v2.api.incidents_api import IncidentsApi from datadog_api_client.v2.api.service_definition_api import ServiceDefinitionApi from datadog_api_client.v2.api.users_api import UsersApi, UsersResponse from pydantic import Field from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.api.models.db.topology import TopologyServiceInDto from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseTopologyProvider, ProviderHealthMixin from keep.providers.base.provider_exceptions import GetAlertException from keep.providers.datadog_provider.datadog_alert_format_description import ( DatadogAlertFormatDescription, ) from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod from 
def extract_alert_details(body: str) -> DatadogAlertDetails:
    """
    Best-effort parser for the message body of a Datadog alert webhook.

    Args:
        body: The message body from the Datadog webhook payload.

    Returns:
        A DatadogAlertDetails whose fields are filled in for every piece of
        information found in the body; fields that are not found keep their
        defaults. An empty body yields a default DatadogAlertDetails.
    """
    if not body:
        return DatadogAlertDetails()

    # str.strip() treats its argument as a set of characters, so this trims
    # any leading/trailing '%' and newline characters (the %%% markers).
    text = body.strip("%%%\n")

    # (attribute to fill, pattern whose first group holds the value)
    single_value_patterns = (
        ("metric_graph_url", r"\[!\[Metric Graph\]\((.*?)\)\]"),
        ("trigger_time", r"The monitor was last triggered at (.*?)\."),
        ("monitor_status_url", r"\[Monitor Status\]\((.*?)\)"),
        ("edit_monitor_url", r"\[Edit Monitor\]\((.*?)\)"),
        ("related_logs_url", r"\[Related Logs\]\((.*?)\)"),
    )

    parsed = DatadogAlertDetails()
    for attribute, pattern in single_value_patterns:
        found = re.search(pattern, text)
        if found:
            setattr(parsed, attribute, found.group(1))

    # Every whitespace-delimited token that follows an '@' is a mention.
    parsed.mentioned_users = re.findall(r"@([^\s]+)", text)

    # The alert message is the first non-empty line that is neither a marker
    # line nor a mention line.
    first_content_line = next(
        (
            candidate
            for candidate in text.split("\n")
            if candidate
            and not candidate.startswith("%%%")
            and not candidate.startswith("@")
        ),
        None,
    )
    if first_content_line is not None:
        parsed.alert_message = first_content_line.strip()

    return parsed
name="monitors_read", description="Read monitors", mandatory=True, mandatory_for_webhook=True, documentation_url="https://docs.datadoghq.com/account_management/rbac/permissions/#monitors", alias="Monitors Read", ), ProviderScope( name="monitors_write", description="Write monitors", mandatory=False, mandatory_for_webhook=True, documentation_url="https://docs.datadoghq.com/account_management/rbac/permissions/#monitors", alias="Monitors Write", ), ProviderScope( name="create_webhooks", description="Create webhooks integrations", mandatory=False, mandatory_for_webhook=True, alias="Integrations Manage", ), ProviderScope( name="metrics_read", description="View custom metrics.", mandatory=False, ), ProviderScope( name="logs_read", description="Read log data.", mandatory=False, alias="Logs Read Data", ), ProviderScope( name="apm_read", description="Read APM data for Topology creation.", mandatory=False, alias="Read APM Data", ), ProviderScope( name="apm_service_catalog_read", description="Read APM service catalog for Topology creation.", mandatory=False, alias="Read APM service catalog Data", ), ] PROVIDER_METHODS = [ ProviderMethod( name="Mute a Monitor", func_name="mute_monitor", scopes=["monitors_write"], description="Mute a monitor", type="action", ), ProviderMethod( name="Unmute a Monitor", func_name="unmute_monitor", scopes=["monitors_write"], description="Unmute a monitor", type="action", ), ProviderMethod( name="Get Monitor Events", func_name="get_monitor_events", scopes=["events_read"], description="Get all events related to this monitor", type="view", ), ProviderMethod( name="Get a Trace", func_name="get_trace", scopes=["apm_read"], description="Get trace by ID", type="view", ), ProviderMethod( name="Create Incident", func_name="create_incident", scopes=["incidents_write"], description="Create an incident", type="action", ), ProviderMethod( name="Resolve Incident", func_name="resolve_incident", scopes=["incidents_write"], description="Resolve an active incident", 
@staticmethod
def convert_to_seconds(s):
    """
    Convert a compact duration such as "30s", "15m" or "2h" into seconds.

    The last character is the unit (s, m, h, d or w) and everything before it
    is an integer amount. Declared as a static method so it can be called both
    on the class and on an instance.

    Args:
        s: Duration string, e.g. "1h". Surrounding whitespace is ignored.

    Returns:
        int: The duration in seconds.

    Raises:
        TypeError: If ``s`` is not a string.
        ValueError: If ``s`` is empty, has an unknown unit or a non-integer
            amount.
    """
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}

    if not isinstance(s, str):
        raise TypeError(f"timeframe must be a string like '15m', got {type(s).__name__}")

    text = s.strip()
    # Slicing (rather than indexing) keeps the empty string from raising
    # IndexError; it simply yields an empty, and therefore unknown, unit.
    amount, unit = text[:-1], text[-1:]
    if unit not in seconds_per_unit:
        raise ValueError(
            f"Invalid timeframe {s!r}: expected <integer><unit> with unit in "
            f"{sorted(seconds_per_unit)}"
        )
    try:
        return int(amount) * seconds_per_unit[unit]
    except ValueError:
        raise ValueError(
            f"Invalid timeframe {s!r}: {amount!r} is not an integer amount"
        ) from None
@staticmethod
def oauth2_logic(**payload) -> dict:
    """
    Logic for handling oauth2 callback.

    Exchanges the authorization code received on the OAuth2 callback for a
    token by POSTing to ``https://api.<domain>/oauth2/v1/token``.

    Args:
        **payload: Callback parameters. ``verifier`` and ``code`` are
            mandatory, ``domain`` defaults to "datadoghq.com", and
            ``client_id`` / ``redirect_uri`` are read when building the
            token request.

    Returns:
        dict: ``{"oauth_token": {...}}`` holding the token response plus the
        verifier, code, redirect_uri and domain that were used.

    Raises:
        Exception: If the verifier or code is missing, or if the token
            response carries no access token.
    """
    domain = payload.pop("domain", "datadoghq.com")
    verifier = payload.pop("verifier", None)
    if not verifier:
        raise Exception("No verifier provided")
    code = payload.pop("code", None)
    if not code:
        raise Exception("No code provided")

    token = requests.post(
        f"https://api.{domain}/oauth2/v1/token",
        data={
            "grant_type": "authorization_code",
            "client_id": payload["client_id"],
            "client_secret": DatadogProvider.DATADOG_CLIENT_SECRET,
            "redirect_uri": payload["redirect_uri"],
            "code_verifier": verifier,
            "code": code,
        },
        # requests waits forever by default; use the same 60s budget the
        # provider configures for its Datadog SDK calls.
        timeout=60,
    ).json()

    access_token = token.get("access_token")
    if not access_token:
        raise Exception("No access token provided")

    return {
        "oauth_token": {
            **token,
            "verifier": verifier,
            "code": code,
            "redirect_uri": payload["redirect_uri"],
            "domain": domain,
        }
    }
def mute_monitor(
    self,
    monitor_id: str,
    groups: Optional[list] = None,
    end: Optional[datetime.datetime] = None,
):
    """
    Mute a monitor, optionally only for some of its groups.

    Args:
        monitor_id: ID of the monitor to mute (converted to int for the call).
        groups: Groups to mute, sent comma-joined as the ``scope`` query
            parameter. A plain string is used as-is. ``"*"`` or no groups
            sends an empty scope.
        end: When the mute should end, as a datetime or an ISO-8601 string.
            Defaults to 24 hours from the moment of the call.
    """
    # The default is resolved per call. Computing it in the signature would
    # freeze it at import time, so a long-running process would eventually
    # send an `end` that is already in the past.
    if end is None:
        end = datetime.datetime.now() + datetime.timedelta(days=1)

    self.logger.info("Muting monitor", extra={"monitor_id": monitor_id, "end": end})

    if isinstance(end, str):
        end = datetime.datetime.fromisoformat(end)

    # A bare string must not be joined: ",".join("a:b") would give "a,:,b".
    if isinstance(groups, str):
        scope = groups
    else:
        scope = ",".join(groups or [])
    if scope == "*":
        scope = ""

    with ApiClient(self.configuration) as api_client:
        # Hand-built endpoint for POST /api/v1/monitor/{monitor_id}/mute.
        endpoint = Endpoint(
            settings={
                "auth": ["apiKeyAuth", "appKeyAuth", "AuthZ"],
                "endpoint_path": "/api/v1/monitor/{monitor_id}/mute",
                "response_type": (dict,),
                "operation_id": "mute_monitor",
                "http_method": "POST",
                "version": "v1",
            },
            params_map={
                "monitor_id": {
                    "required": True,
                    "openapi_types": (int,),
                    "attribute": "monitor_id",
                    "location": "path",
                },
                "scope": {
                    "openapi_types": (str,),
                    "attribute": "scope",
                    "location": "query",
                },
                "end": {
                    "openapi_types": (int,),
                    "attribute": "end",
                    "location": "query",
                },
            },
            headers_map={
                "accept": ["application/json"],
                "content_type": ["application/json"],
            },
            api_client=api_client,
        )
        endpoint.call_with_http_info(
            monitor_id=int(monitor_id),
            end=int(end.timestamp()),
            scope=scope,
        )
    self.logger.info("Monitor muted", extra={"monitor_id": monitor_id})
def search_traces(self, queries: list[str], **kwargs):
    """
    Look up trace IDs for each of the given queries.

    Every query is passed to ``self._search_traces`` on its own. A query whose
    search fails is logged and left out of the result, so one bad query does
    not abort the others.

    Args:
        queries: Queries to search traces for; must not be empty.
        **kwargs: Accepted but not used.

    Returns:
        A defaultdict(list) mapping each successful query to its trace IDs.

    Raises:
        Exception: If ``queries`` is empty.
    """
    if not queries:
        raise Exception("No services provided")
    self.logger.info("Searching traces", extra={"queries": queries})

    # API/App keys take precedence; otherwise use the OAuth bearer token.
    if self.authentication_config.api_key and self.authentication_config.app_key:
        request_headers = {
            "DD-API-KEY": self.authentication_config.api_key,
            "DD-APPLICATION-KEY": self.authentication_config.app_key,
        }
    else:
        request_headers = {
            "Authorization": f"Bearer {self.authentication_config.oauth_token.get('access_token')}"
        }

    trace_ids_by_query = defaultdict(list)
    for current_query in queries:
        self.logger.info("Searching traces", extra={"query": current_query})
        try:
            found = self._search_traces(current_query, request_headers)
            trace_ids_by_query[current_query] = [
                span.get("attributes").get("trace_id") for span in found["data"]
            ]
        except Exception:
            self.logger.exception(
                "Failed to get traces",
                extra={
                    "query": current_query,
                },
            )
    return trace_ids_by_query
def get_monitor_events(self, monitor_id: str):
    """
    Return the alert events of the last 24 hours that belong to a monitor.

    Args:
        monitor_id: ID of the monitor whose events are wanted.

    Returns:
        list[dict]: ``event.to_dict()`` for every event returned by the API
        whose nested ``monitor_id`` attribute matches ``monitor_id``.
        Only the page returned by a single ``list_events`` call is inspected.
    """
    self.logger.info("Getting monitor events", extra={"monitor_id": monitor_id})
    with ApiClient(self.configuration) as api_client:
        api = EventsApi(api_client)
        end = datetime.datetime.now()
        # tb: we can make timedelta configurable by the user if we want
        start = datetime.datetime.now() - datetime.timedelta(days=1)
        # The API takes the time bounds as millisecond timestamps in strings.
        filter_from = str(int(start.timestamp() * 1000))
        filter_to = str(int(end.timestamp() * 1000))
        response = api.list_events(
            filter_from=filter_from,
            filter_to=filter_to,
            filter_query="source:alert",
        )
        # The v2 response carries events under `data`, with the monitor id at
        # attributes.attributes.monitor_id - the same shape the alert-pulling
        # code of this provider reads. The previous v1-style lookup
        # (`results.get("events")`, `event.monitor_id`) never matched, so the
        # method always returned an empty list.
        # tb: We might want to exclude some fields from event.to_dict() but let's wait for user feedback
        results = []
        for event in getattr(response, "data", None) or []:
            event_data = event.to_dict()
            event_attributes = event_data.get("attributes") or {}
            nested_attributes = event_attributes.get("attributes") or {}
            if str(nested_attributes.get("monitor_id")) == str(monitor_id):
                results.append(event_data)
        self.logger.info(
            "Monitor events retrieved", extra={"monitor_id": monitor_id}
        )
        return results
""" self.authentication_config = DatadogProviderAuthConfig( **self.config.authentication ) def validate_scopes(self): scopes = {} self.logger.info("Validating scopes") with ApiClient(self.configuration) as api_client: for scope in self.PROVIDER_SCOPES: try: if scope.name == "monitors_read": api = MonitorsApi(api_client) api.list_monitors() elif scope.name == "monitors_write": api = MonitorsApi(api_client) body = Monitor( name="Example-Monitor", type=MonitorType.RUM_ALERT, query='formula("1 * 100").last("15m") >= 200', message="some message Notify: @hipchat-channel", tags=[ "test:examplemonitor", "env:ci", ], priority=3, options=MonitorOptions( thresholds=MonitorThresholds( critical=200, ), variables=[], ), ) monitor = api.create_monitor(body) api.delete_monitor(monitor.id) elif scope.name == "create_webhooks": api = WebhooksIntegrationApi(api_client) # We check if we have permissions to query webhooks, this means we have the create_webhooks scope try: api.create_webhooks_integration( body={ "name": "keep-webhook-scope-validation", "url": "https://example.com", } ) # for some reason create_webhooks does not allow to delete: api.delete_webhooks_integration(webhook_name), no scope for deletion except ApiException as e: # If it's something different from 403 it means we have access! 
(for example, already exists because we created it once) if e.status == 403: raise e elif scope.name == "metrics_read": api = MetricsApi(api_client) api.query_metrics( query="system.cpu.idle{*}", _from=int((datetime.datetime.now()).timestamp()), to=int(datetime.datetime.now().timestamp()), ) elif scope.name == "logs_read": self._query( query="*", timeframe="1h", query_type="logs", ) elif scope.name == "events_read": api = EventsApi(api_client) end = datetime.datetime.now() start = datetime.datetime.now() - datetime.timedelta(hours=1) # Convert to milliseconds and ensure they're strings filter_from = str(int(start.timestamp() * 1000)) filter_to = str(int(end.timestamp() * 1000)) api.list_events(filter_from=filter_from, filter_to=filter_to) elif scope.name == "apm_read": api_instance = ServiceDefinitionApi(api_client) api_instance.list_service_definitions(schema_version="v1") elif scope.name == "apm_service_catalog_read": endpoint = self.__get_service_deps_endpoint(api_client) epoch_time_one_year_ago = self.__get_epoch_one_year_ago() endpoint.call_with_http_info( env=self.authentication_config.environment, start=str(epoch_time_one_year_ago), ) except ApiException as e: # API failed and it means we're probably lacking some permissions # perhaps we should check if status code is 403 and otherwise mark as valid? 
def _query(self, query="", timeframe="", query_type="", **kwargs: dict):
    """
    Query Datadog logs or metrics over the trailing ``timeframe``.

    The queried window is stored on ``self._from`` / ``self.to``.

    Args:
        query: The Datadog query string.
        timeframe: Window length such as "15m" or "1h", parsed by
            ``DatadogProvider.convert_to_seconds``.
        query_type: "logs" or "metrics".
        **kwargs: Accepted but not used.

    Returns:
        The response of ``LogsApi.list_logs`` or ``MetricsApi.query_metrics``.

    Raises:
        ValueError: If ``query_type`` is neither "logs" nor "metrics".
    """
    timeframe_in_seconds = DatadogProvider.convert_to_seconds(timeframe)

    # Take the clock once so both ends of the window share the same "now".
    now = time.time()
    self.to = datetime.datetime.fromtimestamp(now)
    self._from = datetime.datetime.fromtimestamp(now - timeframe_in_seconds)

    if query_type == "logs":
        with ApiClient(self.configuration) as api_client:
            api = LogsApi(api_client)
            results = api.list_logs(
                body={
                    "query": query,
                    "time": {
                        "_from": self._from,
                        "to": self.to,
                    },
                }
            )
    elif query_type == "metrics":
        with ApiClient(self.configuration) as api_client:
            api = MetricsApi(api_client)
            # Bounds are whole epoch *seconds*. The previous code subtracted
            # `timeframe_in_seconds * 1000` from a seconds value (a window
            # 1000x too long) and passed floats.
            results = api.query_metrics(
                query=query,
                _from=int(now - timeframe_in_seconds),
                to=int(now),
            )
    else:
        # Previously this fell through to `return results` and failed with
        # UnboundLocalError.
        raise ValueError(
            f"Unsupported query_type {query_type!r}, expected 'logs' or 'metrics'"
        )
    return results
def _get_all_events(
    self,
    api,
    filter_from,
    filter_to,
    filter_query=None,
    page_limit=1000,
    total_limit=10000,  # don't pull more than 10k events unless specified
):
    """
    Retrieve all events by handling pagination automatically.

    Args:
        api: The EventsApi instance
        filter_from: Minimum timestamp in milliseconds (as string)
        filter_to: Maximum timestamp in milliseconds (as string)
        filter_query: Optional query filter (e.g., "source:alert")
        page_limit: Number of events per page
        total_limit: Stop paging once at least this many events have been
            collected; a falsy value disables the cap. The check runs after a
            whole page is added, so the result may exceed the cap by up to
            one page.

    Returns:
        List of all events matching the criteria. If a page request fails,
        the error is logged and the events collected so far are returned.
    """
    all_events = []
    page_cursor = None
    has_more = True

    while has_more:
        try:
            # Base parameters
            self.logger.info(f"Pulling events, events so far {len(all_events)}")
            params = {
                "filter_from": filter_from,
                "filter_to": filter_to,
                "page_limit": page_limit,
            }

            # Add optional parameters only if they have values
            if filter_query:
                params["filter_query"] = filter_query
            if page_cursor:
                params["page_cursor"] = page_cursor

            # Make the API call with the constructed parameters
            response = api.list_events(**params)

            # Add this batch of events to our collection
            if response.data:
                all_events.extend(response.data)

            # Check if there are more pages
            if (
                hasattr(response.meta, "page")
                and hasattr(response.meta.page, "after")
                and response.meta.page.after
            ):
                page_cursor = response.meta.page.after
            else:
                has_more = False

            if total_limit and len(all_events) >= total_limit:
                break
        except Exception:
            # Best effort: keep what was fetched, but report the failure
            # through the provider logger (with traceback), not stdout.
            self.logger.exception(
                "Error retrieving events",
                extra={"events_so_far": len(all_events)},
            )
            break

    return all_events
monitors_batch: self.logger.info( "No more monitors to fetch", extra={ "page": page, }, ) break all_monitors.extend(monitors_batch) page += 1 all_monitors = {monitor.id: monitor for monitor in all_monitors} api = EventsApi(api_client) end = datetime.datetime.now() # tb: we can make timedelta configurable by the user if we want start = datetime.datetime.now() - datetime.timedelta(days=14) # Convert to milliseconds and ensure they're strings filter_from = str(int(start.timestamp() * 1000)) filter_to = str(int(end.timestamp() * 1000)) events = self._get_all_events( api, filter_from, filter_to, filter_query="source:alert" ) for event in events: try: # Extract the event attributes from the v2 structure event_data = event.to_dict() event_attributes = event_data.get("attributes", {}) nested_attributes = event_attributes.get("attributes", {}) base_datadog_url = str(self.authentication_config.domain).replace( "api.", "app." ) monitor = nested_attributes.get("monitor", {}) snap_url = monitor.get("result", {}).get("snap_url") alert_url = monitor.get("result", {}).get("alert_url") if alert_url: alert_url = base_datadog_url + alert_url logs_url = monitor.get("result", {}).get("logs_url") if logs_url: logs_url = base_datadog_url + logs_url process_url = monitor.get("result", {}).get("process_url") if process_url: process_url = base_datadog_url + process_url # Extract tags - in v2 they're in attributes.tags tags_list = event_attributes.get("tags", []) tags = { k: v for k, v in map( lambda tag: tag.split(":", 1), [tag for tag in tags_list if ":" in tag], ) } # Extract monitor info directly from the nested attributes monitor_id = nested_attributes.get("monitor_id") monitor_groups = nested_attributes.get("monitor_groups", []) # Get the title directly title = nested_attributes.get("title", "") or nested_attributes.get( "event_object", "" ) # Extract the status directly from the attributes instead of parsing the title status_str = monitor.get("transition", {}).get("destination_state") 
# Get monitor info for checking if it's muted monitor = all_monitors.get(monitor_id) is_muted = ( False if not monitor else any( [ downtime for downtime in monitor.matching_downtimes if downtime.groups == monitor_groups or downtime.scope == ["*"] ] ) ) # Map the status using the direct status field status = ( DatadogProvider.STATUS_MAP.get(status_str, AlertStatus.FIRING) if not is_muted else AlertStatus.SUPPRESSED ) if monitor: severity = monitor.priority severity = DatadogProvider.SEVERITIES_MAP.get( severity, AlertSeverity.INFO ) else: # Determine severity - if we can't parse from title, use priority severity_str = nested_attributes.get("priority") severity = DatadogProvider.SEVERITIES_MAP.get( severity_str, AlertSeverity.INFO ) # Convert timestamp to datetime - in v2 it's a ISO string in attributes.timestamp # or milliseconds in attributes.attributes.timestamp if ( "timestamp" in event_attributes and event_attributes["timestamp"] ): # If timestamp is in ISO format if isinstance(event_attributes["timestamp"], str): received = datetime.datetime.fromisoformat( event_attributes["timestamp"].replace("Z", "+00:00") ) else: received = datetime.datetime.now() elif "timestamp" in nested_attributes: # If timestamp is in milliseconds in the nested attributes received = datetime.datetime.fromtimestamp( nested_attributes["timestamp"] / 1000 ) else: received = datetime.datetime.now() # Create the alert DTO alert = AlertDto( id=event_data.get("id"), name=title, status=status, lastReceived=received.isoformat(), severity=severity, message=event_attributes.get("message", ""), description=event_attributes.get("message", ""), monitor_id=monitor_id, groups=monitor_groups, source=["datadog"], tags=tags, environment=tags.get("environment", None) or tags.get("env", "undefined"), service=nested_attributes.get("service") or tags.get("service"), created_by=( monitor.creator.email if monitor and hasattr(monitor, "creator") and monitor.creator else None ), ) if snap_url: alert.imageUrl = 
snap_url if alert_url: alert.url = alert_url if logs_url: alert.logsUrl = logs_url if process_url: alert.processUrl = process_url alert.fingerprint = self.get_alert_fingerprint( alert, self.fingerprint_fields ) formatted_alerts.append(alert) except Exception as e: self.logger.exception( "Could not parse alert event", extra={ "event_id": ( event_data.get("id") if "event_data" in locals() else None ), "error": str(e), }, ) continue return formatted_alerts def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): self.logger.info("Creating or updating webhook") webhook_name = f"{DatadogProviderAuthConfig.KEEP_DATADOG_WEBHOOK_INTEGRATION_NAME}-{tenant_id}" with ApiClient(self.configuration) as api_client: api = WebhooksIntegrationApi(api_client) try: webhook = api.get_webhooks_integration(webhook_name=webhook_name) if webhook.url != keep_api_url: api.update_webhooks_integration( webhook.name, body={ "url": keep_api_url, "custom_headers": json.dumps( { "Content-Type": "application/json", "X-API-KEY": api_key, } ), "payload": DatadogProvider.WEBHOOK_PAYLOAD, }, ) self.logger.info( "Webhook updated", ) except (NotFoundException, ForbiddenException): try: webhook = api.create_webhooks_integration( body={ "name": webhook_name, "url": keep_api_url, "custom_headers": json.dumps( { "Content-Type": "application/json", "X-API-KEY": api_key, } ), "encode_as": "json", "payload": DatadogProvider.WEBHOOK_PAYLOAD, } ) self.logger.info("Webhook created") except ApiException as exc: if "Webhook already exists" in exc.body.get("errors"): self.logger.info( "Webhook already exists when trying to add, updating" ) try: api.update_webhooks_integration( webhook_name, body={ "url": keep_api_url, "custom_headers": json.dumps( { "Content-Type": "application/json", "X-API-KEY": api_key, } ), "payload": DatadogProvider.WEBHOOK_PAYLOAD, }, ) except ApiException: self.logger.exception("Failed to update webhook") else: raise self.logger.info("Webhook 
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseTopologyProvider" = None
) -> AlertDto:
    """
    Convert a Datadog webhook payload into an ``AlertDto``.

    Args:
        event: The webhook payload. Keys read here: ``tags``,
            ``last_updated`` (epoch milliseconds), ``title``,
            ``alert_transition``, ``severity``, ``url`` (popped from
            ``event``), ``scopes``, ``message``, ``body``, ``alert_query``,
            ``id`` and ``monitor_id``.
        provider_instance: Not used by this implementation.

    Returns:
        AlertDto: The formatted alert, with its fingerprint computed from
        ``DatadogProvider.FINGERPRINT_FIELDS``.
    """
    tags = event.get("tags", "")
    if isinstance(tags, str):
        parsed_tags = {}
        for tag in tags.split(","):
            # Split only on the first ':' so tag values may contain colons.
            # Bare tags (no ':') such as "monitor" are simply skipped, which
            # avoids the ValueError list.remove() raised when it was absent.
            key, separator, value = tag.partition(":")
            if separator:
                parsed_tags[key] = value
        tags = parsed_tags

    service = None
    # Always remove monitor tag
    if isinstance(tags, dict):
        tags.pop("monitor", None)
        service = tags.get("service")

    try:
        event_time = datetime.datetime.fromtimestamp(
            int(event.get("last_updated")) / 1000, tz=datetime.timezone.utc
        )
    except (TypeError, ValueError):
        # Payload without a usable "last_updated": fall back to receipt time
        # instead of dropping the whole alert.
        logger.warning(
            "Missing or invalid last_updated in Datadog event, using current time",
            extra={"last_updated": event.get("last_updated")},
        )
        event_time = datetime.datetime.now(tz=datetime.timezone.utc)

    title = event.get("title")
    # format status and severity to Keep's format
    status = DatadogProvider.STATUS_MAP.get(
        event.get("alert_transition"), AlertStatus.FIRING
    )
    severity = DatadogProvider.SEVERITIES_MAP.get(
        event.get("severity"), AlertSeverity.INFO
    )

    url = event.pop("url", None)

    # https://docs.datadoghq.com/integrations/webhooks/#variables
    groups = event.get("scopes", "")
    if not groups:
        groups = ["*"]
    elif isinstance(groups, str):
        groups = groups.split(",")
    else:
        # Already a sequence of scopes; keep it as a list
        groups = list(groups)

    description = event.get("message") or event.get("body")
    alert_query = event.get("alert_query")

    # try to get more information from the monitor
    try:
        extra_details = extract_alert_details(event.get("body"))
        extra_details = asdict(extra_details)
        extra_details["imageUrl"] = extra_details.get("metric_graph_url")
    except Exception:
        logger.exception(
            "Failed to extract alert details", extra={"alert": event.get("body")}
        )
        extra_details = {
            "imageUrl": None,
        }

    alert = AlertDto(
        id=event.get("id"),
        name=title,
        status=status,
        lastReceived=str(event_time),
        source=["datadog"],
        message=event.get("body"),
        description=description,
        groups=groups,
        severity=severity,
        service=service,
        url=url,
        tags=tags,
        monitor_id=event.get("monitor_id"),
        alert_query=alert_query,
        imageUrl=extra_details.get("imageUrl"),
        extra_details=extra_details,
    )
    alert.fingerprint = DatadogProvider.get_alert_fingerprint(
        alert, DatadogProvider.FINGERPRINT_FIELDS
    )
    return alert
@classmethod
def simulate_alert(cls) -> dict:
    """
    Build a randomized Datadog webhook payload from the mock alert catalog.

    A random entry of ``ALERTS`` is picked, its ``parameters`` are set to
    random choices (dotted names address nested dicts), its ``renders``
    placeholders (``{{name}}``) are substituted in every top-level string
    value, and ``last_updated``, ``alert_transition`` and ``id`` are filled in.

    Returns:
        dict: The simulated alert payload.
    """
    import copy
    import hashlib
    import random

    from keep.providers.datadog_provider.alerts_mock import ALERTS

    # Choose a random alert type
    alert_type = random.choice(list(ALERTS.keys()))
    alert_data = ALERTS[alert_type]

    # Start with the base payload. Deep copy: nested parameters are written
    # through setdefault() below, and a shallow copy would leak those writes
    # into the shared ALERTS definition.
    simulated_alert = copy.deepcopy(alert_data["payload"])

    # Apply variability based on parameters
    for param, choices in alert_data.get("parameters", {}).items():
        # Split param on '.' for nested parameters (if any)
        param_parts = param.split(".")
        target = simulated_alert
        for part in param_parts[:-1]:
            target = target.setdefault(part, {})
        # Choose a random value for the parameter
        target[param_parts[-1]] = random.choice(choices)

    # Apply renders
    for param, choices in alert_data.get("renders", {}).items():
        placeholder = "{{" + param + "}}"
        # Pick once per placeholder so every field renders the same value
        choice = random.choice(choices)
        for key, val in list(simulated_alert.items()):
            # Only strings can carry a placeholder
            if isinstance(val, str):
                simulated_alert[key] = val.replace(placeholder, str(choice))
        simulated_alert[param] = choice

    simulated_alert["last_updated"] = int(time.time() * 1000)
    simulated_alert["alert_transition"] = random.choice(
        list(cls.STATUS_MAP.keys())
    )
    simulated_alert["id"] = hashlib.sha256(
        str(simulated_alert).encode()
    ).hexdigest()
    return simulated_alert
application=service_definition.attributes.schema.org.application, email=service_definition.attributes.schema.contact.email, slack=service_definition.attributes.schema.contact.slack, ) for service_dep in service_dependencies: service = services.get(service_dep) if not service: service = TopologyServiceInDto( source_provider_id=self.provider_id, service=service_dep, display_name=service_dep, environment=environment, ) dependencies = service_dependencies[service_dep].get("calls", []) service.dependencies = { dependency: "unknown" for dependency in dependencies } services[service_dep] = service return list(services.values()), {} def _translate_metric_query_to_span_query( self, metric_query: str ) -> tuple[str, int]: """ Translates a Datadog metric query into a span search query. Returns tuple of (query_string, threshold_seconds) """ import re # Extract tags from the curly braces tags_pattern = r"\{(.*?)\}" tags_match = re.search(tags_pattern, metric_query) if not tags_match: raise ValueError("No tags found in metric query") tags_str = tags_match.group(1) tags_dict = dict(tag.split(":") for tag in tags_str.split(",")) # Extract threshold value (the number after '>') threshold_pattern = r">\s*(\d+)" threshold_match = re.search(threshold_pattern, metric_query) if not threshold_match: raise ValueError("No threshold found in metric query") threshold_seconds = int(threshold_match.group(1)) # Extract operation name dynamically - look for the string between "trace." 
and ".duration" operation_pattern = r"trace\.(.*?)\.duration" operation_match = re.search(operation_pattern, metric_query) if not operation_match: raise ValueError("Could not find operation name in metric query") operation_name = operation_match.group(1) # Construct the span search query query_parts = [ f'service:{tags_dict["service"]}', f'env:{tags_dict["env"]}', f"operation_name:{operation_name}", f"@duration:>{threshold_seconds}s", # @ is used to indicate a span attribute ] return " ".join(query_parts) if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os api_key = os.environ.get("DATADOG_API_KEY") app_key = os.environ.get("DATADOG_APP_KEY") provider_config = { "authentication": {"api_key": api_key, "app_key": app_key}, } provider: DatadogProvider = ProvidersFactory.get_provider( context_manager=context_manager, provider_id="datadog-keephq", provider_type="datadog", provider_config=provider_config, ) alerts = provider.get_alerts() """ result = provider.create_incident( "tal test from provider", "what will I tell you?", "Tal Borenstein" ) """ # print(result) ================================================ FILE: keep/providers/datadog_provider/topology_mock.py ================================================ import json from keep.api.models.db.topology import TopologyServiceInDto from keep.api.tasks.process_topology_task import process_topology if __name__ == "__main__": services = {} environment = "production" with open("/tmp/service_definitions.json", "r") as file: service_definitions = json.load(file) with open("/tmp/service_dependencies.json", "r") as file: service_dependencies = json.load(file) for service_definition in service_definitions["data"]: name = service_definition["attributes"]["schema"].get("dd-service") services[name] = 
class DeepseekProvider(BaseProvider):
    """Provider that sends chat prompts to DeepSeek through the OpenAI client."""

    PROVIDER_DISPLAY_NAME = "DeepSeek"
    PROVIDER_CATEGORY = ["AI"]
    BASE_URL = "https://api.deepseek.com"

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build the typed authentication config from the raw provider config."""
        auth_fields = self.config.authentication
        self.authentication_config = DeepseekProviderAuthConfig(**auth_fields)

    def dispose(self):
        """Nothing to release."""
        pass

    def validate_scopes(self) -> dict[str, bool | str]:
        """Return an empty mapping; no scopes are checked."""
        return {}

    def _query(
        self,
        prompt,
        model="deepseek-reasoner",
        max_tokens=1024,
        system_prompt=None,
        structured_output_format=None,
    ):
        """
        Send a chat completion request and return its (possibly JSON-decoded) content.

        Args:
            prompt (str): The user query.
            model (str): The model to use for the query.
            max_tokens (int): The maximum number of tokens to generate;
                anything that cannot be converted to int falls back to 1024.
            system_prompt (str): Optional system message placed before the prompt.
            structured_output_format (dict): Passed through as ``response_format``.

        Returns:
            dict: ``{"response": <content>}`` where the content is the parsed
            JSON when the reply is valid JSON, otherwise the raw reply.
        """
        try:
            token_limit = int(max_tokens)
        except (TypeError, ValueError):
            token_limit = 1024

        deepseek_client = OpenAI(
            api_key=self.authentication_config.api_key,
            base_url=self.BASE_URL,
        )

        # The user message is always last; the system message, if any, precedes it
        conversation = [{"role": "user", "content": prompt}]
        if system_prompt:
            conversation.insert(0, {"role": "system", "content": system_prompt})

        completion = deepseek_client.chat.completions.create(
            model=model,
            messages=conversation,
            max_tokens=token_limit,
            response_format=structured_output_format,
        )
        content = completion.choices[0].message.content

        # Best effort: hand back decoded JSON when possible, raw text otherwise
        try:
            parsed = json.loads(content)
        except Exception:
            parsed = content

        return {
            "response": parsed,
        }
class DiscordProvider(BaseProvider):
    """Send alert message to Discord."""

    PROVIDER_DISPLAY_NAME = "Discord"
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build the typed authentication config from the raw provider config."""
        self.authentication_config = DiscordProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(self, content: str = "", components: list = None, **kwargs: dict):
        """
        Notify alert message to Discord using the Discord Incoming Webhook API
        https://discord.com/developers/docs/resources/webhook

        Args:
            content (str): The content of the message.
            components (list): The components of the message. Defaults to no
                components.

        Raises:
            ProviderException: If neither content nor components is given, or
                if Discord does not answer with 200/204.
        """
        self.logger.debug("Notifying alert message to Discord")
        webhook_url = self.authentication_config.webhook_url

        # None instead of a shared mutable [] default
        if components is None:
            components = []

        if not content and not components:
            raise ProviderException(
                f"{self.__class__.__name__} Keyword Arguments Missing : content or components atleast one of them needed to trigger message"
            )

        # verify components is a list
        if components and not isinstance(components, list):
            # omit it
            self.logger.warning(
                f"{self.__class__.__name__} components should be a list of components, omitting components"
            )
            components = []

        # send the request
        response = requests.post(
            webhook_url,
            json={"content": content, "components": components},
        )

        # 204 is the default success reply; a webhook URL carrying wait=true
        # gets 200 with the created message instead.
        if response.status_code not in (200, 204):
            try:
                r = response.json()
            # unknown response
            except Exception:
                raise ProviderException(
                    f"{self.__class__.__name__} failed to notify alert message to Discord: {response.text}"
                )

            # there can be plenty of errors, will be added over time
            # Read the error body defensively so an unexpected shape cannot
            # mask the real Discord error with a KeyError/IndexError.
            component_errors = r.get("components") if isinstance(r, dict) else None
            if (
                isinstance(component_errors, list)
                and component_errors
                and "ListType" in str(component_errors[0])
            ):
                raise ProviderException(
                    f"{self.__class__.__name__} failed to notify alert message to Discord: components should be a list of components"
                )
            # TODO: Add more error handling
            raise ProviderException(
                f"{self.__class__.__name__} failed to notify alert message to Discord: {response.text}"
            )

        self.logger.debug("Alert message notified to Discord")
"style": 1, "label": "Click Me!", "custom_id": "button_click"} ], } provider.notify( content="Hey Discord By: Sakthi Ratnam", components=[button_component] ) ================================================ FILE: keep/providers/dynatrace_provider/__init__.py ================================================ ================================================ FILE: keep/providers/dynatrace_provider/dynatrace_provider.py ================================================ """ Kafka Provider is a class that allows to ingest/digest data from Grafana. """ import base64 import dataclasses import datetime import json import logging import os from urllib.parse import quote import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory logger = logging.getLogger(__name__) @pydantic.dataclasses.dataclass class DynatraceProviderAuthConfig: """ Dynatrace authentication configuration. """ environment_id: str = dataclasses.field( metadata={ "required": True, "description": "Dynatrace's environment ID", "hint": "e.g. abcde", }, ) api_token: str = dataclasses.field( metadata={ "required": True, "description": "Dynatrace's API token", "hint": "e.g. dt0c01.abcde...", "sensitive": True, }, ) alerting_profile: str = dataclasses.field( default="Default", metadata={ "required": False, "description": "Dynatrace's alerting profile for the webhook integration. Defaults to 'Default'", "hint": "The name of the alerting profile to use for the webhook integration", }, ) class DynatraceProvider(BaseProvider): """ Dynatrace provider class. 
def _get_alerts(self) -> list[AlertDto]:
    """
    Get alerts from Dynatrace.

    Requests ``/api/v2/problems`` for the configured environment and formats
    every entry of the response's ``problems`` list.
    NOTE(review): only this single response is read; if the API paginates,
    later pages are not fetched - confirm against the Dynatrace API docs.

    Returns:
        list[AlertDto]: List of alerts.

    Raises:
        Exception: If Dynatrace answers with a non-successful status. The
            message carries the status code and the response body.
    """
    self.logger.info("Getting alerts from Dynatrace")
    response = requests.get(
        f"https://{self.authentication_config.environment_id}.live.dynatrace.com/api/v2/problems",
        headers={
            "Authorization": f"Api-Token {self.authentication_config.api_token}"
        },
    )
    if not response.ok:
        # The status code is part of the message (same format as
        # _get_alerting_profiles) so callers that inspect the text for
        # "401"/"403" do not depend on what the body happens to contain.
        error_message = f"Failed to get problems from Dynatrace: {response.status_code} {response.text}"
        # logger.error, not logger.exception: there is no active exception here
        self.logger.error(error_message)
        raise Exception(error_message)

    return [
        self._format_alert(event) for event in response.json().get("problems", [])
    ]
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> AlertDto:
    """
    Convert a Dynatrace event into an ``AlertDto``.

    Two shapes are handled: a webhook payload (recognised by a truthy
    ``ProblemID``) and a problem object from the problems API (everything
    else, which must carry ``problemId``, ``displayId``, ``status``,
    ``title``, ``impactLevel``, ``entityTags`` and ``startTime``).

    Args:
        event: The webhook payload or problem object. It is not modified.
        provider_instance: Not used by this implementation.

    Returns:
        AlertDto: The formatted alert, fingerprinted with
        ``DynatraceProvider.FINGERPRINT_FIELDS``.

    Raises:
        KeyError: If a problem object lacks one of the required keys.
    """

    def _safe_url(raw_url):
        # Percent-encode special characters while keeping URL delimiters;
        # on failure the URL is returned untouched.
        if not raw_url:
            return raw_url
        try:
            return quote(raw_url, safe=":/%#?=@&;+!")
        except Exception as e:
            logger.exception(f"Failed to quote URL: {e}")
            return raw_url

    # alert that comes from webhook
    if event.get("ProblemID"):
        # format severity and status to keep's format
        severity = DynatraceProvider.SEVERITIES_MAP.get(
            event.get("ProblemSeverity"), AlertSeverity.INFO
        )
        status = DynatraceProvider.STATUS_MAP.get(
            event.get("State"), AlertStatus.FIRING
        )
        alert_dto = AlertDto(
            id=event.get("ProblemID"),
            name=event.get("ProblemTitle"),
            status=status,
            severity=severity,
            # timezone-aware UTC, consistent with the problems-API branch below
            lastReceived=datetime.datetime.now(
                tz=datetime.timezone.utc
            ).isoformat(),
            description=json.dumps(
                event.get("ImpactedEntities", {})
            ),  # was asked by a user (should be configurable)
            source=["dynatrace"],
            impact=event.get("ProblemImpact"),
            tags=event.get("Tags", []),
            impactedEntities=event.get("ImpactedEntities", []),
            url=_safe_url(event.get("ProblemURL")),
            problem_details_json=event.get("ProblemDetailsJSON", {}),
            problem_details_jsonv2=event.get("ProblemDetailsJSONv2", {}),
            problem_details_text=event.get("ProblemDetailsText", ""),
            impacted_entity_names=event.get("ImpactedEntityNames", []),
            impacted_entity=event.get("ImpactedEntity", ""),
            pid=event.get("PID", ""),
            names_of_impacted_entities=event.get("NamesOfImpactedEntities", ""),
        )
    # else, problem from the problem API
    else:
        # Work on a shallow copy so the caller's dict is not emptied by pop()
        problem = dict(event)
        _id = problem.pop("problemId")
        name = problem.pop("displayId")
        # format severity and status to keep's format
        severity = DynatraceProvider.SEVERITIES_MAP.get(
            problem.pop("severityLevel", None), AlertSeverity.INFO
        )
        status = DynatraceProvider.STATUS_MAP.get(
            problem.pop("status"), AlertStatus.FIRING
        )
        description = problem.pop("title")
        impact = problem.pop("impactLevel")
        tags = problem.pop("entityTags")
        impacted_entities = problem.pop("impactedEntities", [])
        url = _safe_url(problem.pop("ProblemURL", None))
        lastReceived = datetime.datetime.fromtimestamp(
            problem.pop("startTime") / 1000, tz=datetime.timezone.utc
        )

        # Leftover fields are forwarded as-is; drop any that would collide
        # with the explicit keyword arguments below (TypeError otherwise).
        explicit_fields = {
            "id",
            "name",
            "status",
            "severity",
            "lastReceived",
            "description",
            "source",
            "impact",
            "tags",
            "impactedEntities",
            "url",
        }
        extra_fields = {
            key: value
            for key, value in problem.items()
            if key not in explicit_fields
        }

        alert_dto = AlertDto(
            id=_id,
            name=name,
            status=status,
            severity=severity,
            lastReceived=lastReceived.isoformat(),
            description=description,
            source=["dynatrace"],
            impact=impact,
            tags=tags,
            impactedEntities=impacted_entities,
            url=url,
            **extra_fields,  # any other field
        )

    alert_dto.fingerprint = DynatraceProvider.get_alert_fingerprint(
        alert_dto, DynatraceProvider.FINGERPRINT_FIELDS
    )
    return alert_dto
def setup_webhook(
    self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
):
    """
    Setup Dynatrace webhook.

    Looks up the configured alerting profile and posts a
    ``builtin:problem.notifications`` settings object that pushes problems
    to ``keep_api_url`` with a Basic ``api_key:<api_key>`` Authorization header.

    Scope needed: environment (settings.write?)
    docs: https://docs.dynatrace.com/docs/dynatrace-api/environment-api/settings/schemas/builtin-problem-notifications#WebHookNotification
          https://docs.dynatrace.com/docs/dynatrace-api/environment-api/settings/objects/post-object

    Args:
        tenant_id: Tenant shown in the webhook's display name.
        keep_api_url: URL Dynatrace will post problems to.
        api_key: Keep API key placed in the Authorization header.
        setup_alerts: When False the request is sent with
            ``validateOnly=true`` (dry run used to validate scopes).

    Raises:
        Exception: If the alerting profile cannot be found or Dynatrace
            rejects the request.
    """
    self.logger.info("Setting up Dynatrace webhook")
    # how to get it?
    alerting_profile_id = None
    alerting_profiles = self._get_alerting_profiles() or []
    for alerting_profile in alerting_profiles:
        profile_name = (alerting_profile.get("value") or {}).get("name")
        if profile_name == self.authentication_config.alerting_profile:
            alerting_profile_id = alerting_profile.get("objectId")
            self.logger.info(
                f"Found alerting profile {self.authentication_config.alerting_profile} with id {alerting_profile_id}"
            )
            break

    if not alerting_profile_id:
        self.logger.info(
            f"Cannot find alerting profile {self.authentication_config.alerting_profile} in {alerting_profiles}"
        )
        raise Exception(
            f"Cannot find alerting profile {self.authentication_config.alerting_profile}"
        )

    auth_header = f"api_key:{api_key}"
    auth_header = base64.b64encode(auth_header.encode()).decode()
    payload = {
        "enabled": True,
        "displayName": f"Keep Webhook Integration - push alerts to Keep [tenant: {tenant_id}]",
        "type": "WEBHOOK",
        "alertingProfile": alerting_profile_id,
        "webHookNotification": {
            "acceptAnyCertificate": True,
            "headers": [
                {
                    "name": "Authorization",
                    "secret": True,
                    "secretValue": f"Basic {auth_header}",
                }
            ],
            "url": keep_api_url,
            "notifyClosedProblems": True,
            "notifyEventMergesEnabled": True,
            # all the fields - https://docs.dynatrace.com/docs/observe-and-explore/notifications-and-alerting/problem-notifications/webhook-integration#example-json-with-placeholders
            "payload": '{\n"State":"{State}",\n"ProblemID":"{ProblemID}",\n"ProblemTitle":"{ProblemTitle}",\n"ImpactedEntities": {ImpactedEntities},\n "PID": "{PID}",\n "ProblemDetailsJSON": {ProblemDetailsJSON},\n "ProblemImpact" : "{ProblemImpact}",\n"ProblemSeverity": "{ProblemSeverity}",\n "ProblemURL": "{ProblemURL}",\n"State": "{State}",\n"Tags": "{Tags}",\n"ProblemDetails": "{ProblemDetailsText}",\n"NamesOfImpactedEntities": "{NamesOfImpactedEntities}",\n"ImpactedEntity": "{ImpactedEntity}",\n"ImpactedEntityNames": "{ImpactedEntityNames}",\n"ProblemDetailsJSONv2": {ProblemDetailsJSONv2}\n}',
        },
    }
    actual_payload = [
        {
            "schemaId": "builtin:problem.notifications",
            "scope": "environment",
            "value": payload,
        }
    ]

    url = f"https://{self.authentication_config.environment_id}.live.dynatrace.com/api/v2/settings/objects"
    # if its a dry run to validate the scopes
    if not setup_alerts:
        url = f"https://{self.authentication_config.environment_id}.live.dynatrace.com/api/v2/settings/objects?validateOnly=true"

    # install the webhook
    response = requests.post(
        url,
        json=actual_payload,
        headers={
            "Authorization": f"Api-Token {self.authentication_config.api_token}"
        },
    )
    if not response.ok:
        # understand if its localhost:
        site_local_violation = "The environment does not allow for site-local URLs"
        violation_message = None
        try:
            violation_message = (
                response.json()[0]
                .get("error")
                .get("constraintViolations")[0]
                .get("message")
            )
        except Exception:
            # Not a constraint-violation body (e.g. auth error or non-JSON);
            # fall through to the generic error that carries the raw response.
            violation_message = None

        if violation_message == site_local_violation:
            # Keep Dynatrace's own wording in the message so callers that
            # look for it in str(e) can recognise the localhost case.
            raise Exception(
                "Dynatrace doesn't support using localhost as a webhook URL, use a public URL when installing dynatrace webhook. "
                f"({site_local_violation})"
            )
        raise Exception(
            f"Failed to setup Dynatrace webhook: {response.status_code} {response.text}"
        )
""" self.authentication_config = DynatraceProviderAuthConfig( **self.config.authentication ) if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()]) # Load environment variables import os api_token = os.environ.get("DYNATRACE_API_TOKEN") environment_id = os.environ.get("DYNATRACE_ENVIRONMENT_ID") from keep.api.core.dependencies import SINGLE_TENANT_UUID context_manager = ContextManager(tenant_id=SINGLE_TENANT_UUID) config = { "authentication": { "api_token": api_token, "environment_id": environment_id, } } provider = ProvidersFactory.get_provider( context_manager, provider_id="dynatrace-keephq", provider_type="dynatrace", provider_config=config, ) problems = provider._get_alerts() provider.setup_webhook( tenant_id=SINGLE_TENANT_UUID, keep_api_url=os.environ.get("KEEP_API_URL"), api_key=context_manager.api_key, setup_alerts=True, ) ================================================ FILE: keep/providers/eks_provider/eks_provider.py ================================================ """ EksProvider is a class that provides a way to interact with AWS EKS clusters. """ import dataclasses import logging import boto3 import pydantic from kubernetes import client, config from kubernetes.stream import stream from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod @pydantic.dataclasses.dataclass class EksProviderAuthConfig: """EKS authentication configuration.""" region: str = dataclasses.field( metadata={ "required": True, "description": "AWS region where the EKS cluster is located", "sensitive": False, "hint": "e.g. 
us-east-1", } ) cluster_name: str = dataclasses.field( metadata={ "required": True, "description": "Name of the EKS cluster", "sensitive": False, } ) access_key: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS access key (Leave empty if using IAM role at EC2)", "sensitive": True, }, ) secret_access_key: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS secret access key (Leave empty if using IAM role at EC2)", "sensitive": True, }, ) class EksProvider(BaseProvider): """Interact with and query AWS EKS clusters.""" PROVIDER_DISPLAY_NAME = "EKS" PROVIDER_CATEGORY = ["Cloud Infrastructure"] PROVIDER_SCOPES = [ ProviderScope( name="eks:DescribeCluster", description="Required to get cluster information", documentation_url="https://docs.aws.amazon.com/eks/latest/APIReference/API_DescribeCluster.html", mandatory=True, alias="Describe Cluster", ), ProviderScope( name="eks:ListClusters", description="Required to list available clusters", documentation_url="https://docs.aws.amazon.com/eks/latest/APIReference/API_ListClusters.html", mandatory=True, alias="List Clusters", ), ProviderScope( name="pods:delete", description="Required to delete/restart pods", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Delete/Restart Pods", ), ProviderScope( name="deployments:scale", description="Required to scale deployments", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Scale Deployments", ), ProviderScope( name="pods:list", description="Required to list pods", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="List Pods", ), ProviderScope( name="pods:get", description="Required to get pod details", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Get Pod Details", ), ProviderScope( 
name="pods:logs", description="Required to get pod logs", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Get Pod Logs", ), ] """ Shahar: hard to test the following scopes because by default we don't have the pod name that we can test on ProviderScope( name="pods:exec", description="Required to execute commands in pods", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Execute Pod Commands" ), """ PROVIDER_METHODS = [ ProviderMethod( name="List Pods", func_name="get_pods", scopes=["pods:list", "pods:get"], description="List all pods in a namespace or across all namespaces", type="view", ), ProviderMethod( name="List Persistent Volume Claims", func_name="get_pvc", scopes=["pods:list"], description="List all PVCs in a namespace or across all namespaces", type="view", ), ProviderMethod( name="Get Node Pressure", func_name="get_node_pressure", scopes=["pods:list"], description="Get pressure metrics for all nodes", type="view", ), ProviderMethod( name="Execute Command", func_name="exec_command", scopes=["pods:exec"], description="Execute a command in a pod", type="action", ), ProviderMethod( name="Restart Pod", func_name="restart_pod", scopes=["pods:delete"], description="Restart a pod by deleting it", type="action", ), ProviderMethod( name="Get Deployment", func_name="get_deployment", scopes=["pods:list"], description="Get deployment information", type="view", ), ProviderMethod( name="Scale Deployment", func_name="scale_deployment", scopes=["deployments:scale"], description="Scale a deployment to specified replicas", type="action", ), ProviderMethod( name="Get Pod Logs", func_name="get_pod_logs", scopes=["pods:logs"], description="Get logs from a pod", type="view", ), ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._client = None def 
dispose(self): """Clean up any resources.""" if self._client: self._client.api_client.rest_client.pool_manager.clear() def validate_config(self): """Validate the provided configuration.""" self.authentication_config = EksProviderAuthConfig(**self.config.authentication) def validate_scopes(self) -> dict[str, bool | str]: """Validate if the credentials have the required permissions.""" scopes = {scope.name: False for scope in self.PROVIDER_SCOPES} try: self.logger.info("Starting EKS API permissions validation") # Test EKS API permissions eks_client = boto3.client( "eks", aws_access_key_id=self.authentication_config.access_key, aws_secret_access_key=self.authentication_config.secret_access_key, region_name=self.authentication_config.region, ) try: self.logger.info("Validating eks:ListClusters permission") eks_client.list_clusters() scopes["eks:ListClusters"] = True self.logger.info("eks:ListClusters permission validated successfully") except Exception as e: self.logger.info(f"eks:ListClusters permission validation failed: {e}") scopes["eks:ListClusters"] = str(e) try: self.logger.info("Validating eks:DescribeCluster permission") eks_client.describe_cluster( name=self.authentication_config.cluster_name ) scopes["eks:DescribeCluster"] = True self.logger.info( "eks:DescribeCluster permission validated successfully" ) except Exception as e: self.logger.info( f"eks:DescribeCluster permission validation failed: {e}" ) scopes["eks:DescribeCluster"] = str(e) # Test Kubernetes API permissions using the client try: self.logger.info("Starting Kubernetes API permissions validation") k8s_client = self.client # This will initialize connection to cluster # Test pods:list and pods:get try: self.logger.info("Validating pods:list and pods:get permissions") k8s_client.list_pod_for_all_namespaces(limit=1) scopes["pods:list"] = True scopes["pods:get"] = True self.logger.info( "pods:list and pods:get permissions validated successfully" ) except Exception as e: self.logger.info( f"pods:list 
and pods:get permissions validation failed: {e}" ) scopes["pods:list"] = str(e) scopes["pods:get"] = str(e) # Test pods:logs try: self.logger.info("Validating pods:logs permission") pods = k8s_client.list_pod_for_all_namespaces(limit=1) if pods.items: pod = pods.items[0] containers = pod.spec.containers if containers: k8s_client.read_namespaced_pod_log( name=pod.metadata.name, namespace=pod.metadata.namespace, container=containers[0].name, limit_bytes=100, ) scopes["pods:logs"] = True self.logger.info("pods:logs permission validated successfully") except Exception as e: self.logger.info(f"pods:logs permission validation failed: {e}") scopes["pods:logs"] = str(e) # Test pods:delete try: self.logger.info("Validating pods:delete permission") # We don't actually delete, just check if we can get the delete API if pods.items: pod = pods.items[0] k8s_client.delete_namespaced_pod.__doc__ scopes["pods:delete"] = True self.logger.info("pods:delete permission validated successfully") except Exception as e: self.logger.info(f"pods:delete permission validation failed: {e}") scopes["pods:delete"] = str(e) # Test deployments:scale apps_v1 = client.AppsV1Api() try: self.logger.info("Validating deployments:scale permission") deployments = apps_v1.list_deployment_for_all_namespaces(limit=1) if deployments.items: apps_v1.patch_namespaced_deployment_scale.__doc__ scopes["deployments:scale"] = True self.logger.info( "deployments:scale permission validated successfully" ) except Exception as e: self.logger.info( f"deployments:scale permission validation failed: {e}" ) scopes["deployments:scale"] = str(e) except Exception as e: self.logger.exception("Error validating Kubernetes API scopes") for scope in scopes: if scope not in ["eks:ListClusters", "eks:DescribeCluster"]: scopes[scope] = str(e) except Exception as e: self.logger.exception("Error validating AWS EKS scopes") for scope in scopes: scopes[scope] = str(e) self.logger.info("Completed scope validation") return scopes @property 
def client(self): """Get or create the Kubernetes client for EKS.""" if self._client is None: self._client = self.__generate_client() return self._client def get_pods(self, namespace: str = None) -> list: """ List all pods in a namespace or across all namespaces. Args: namespace: The namespace to list pods from. If None, lists pods from all namespaces. """ if namespace: self.logger.info(f"Listing pods in namespace {namespace}") pods = self.client.list_namespaced_pod(namespace=namespace) else: self.logger.info("Listing pods across all namespaces") pods = self.client.list_pod_for_all_namespaces() return [pod.to_dict() for pod in pods.items] def get_pvc(self, namespace: str = None) -> list: """ List all PVCs in a namespace or across all namespaces. Args: namespace: The namespace to list pods from. If None, lists pods from all namespaces. """ if namespace: self.logger.info(f"Listing PVCs in namespace {namespace}") pvcs = self.client.list_namespaced_persistent_volume_claim( namespace=namespace ) else: self.logger.info("Listing PVCs across all namespaces") pvcs = self.client.list_persistent_volume_claim_for_all_namespaces() return [pvc.to_dict() for pvc in pvcs.items] def get_node_pressure(self) -> list: """Get pressure metrics for all nodes.""" self.logger.info("Listing all nodes") nodes = self.client.list_node() node_pressures = [] for node in nodes.items: pressures = { "name": node.metadata.name, "conditions": [], } for condition in node.status.conditions: if condition.type in [ "MemoryPressure", "DiskPressure", "PIDPressure", ]: pressures["conditions"].append(condition.to_dict()) node_pressures.append(pressures) return node_pressures def __check_pod_shell_access(self, pod, container_name: str) -> str: """ Check if pod has shell access and return appropriate shell. 
Args: pod: The Kubernetes pod object container_name: Name of the container to check Returns: str: Path to available shell (/bin/bash or /bin/sh) Raises: ProviderException: If no shell access is available """ # Get the container object container = next( (c for c in pod.spec.containers if c.name == container_name), pod.spec.containers[0], ) # Try different shells in order of preference for shell in ["/bin/bash", "/bin/sh"]: try: result = self.client.connect_get_namespaced_pod_exec( name=pod.metadata.name, namespace=pod.metadata.namespace, container=container.name, command=[shell, "-c", "exit 0"], stderr=True, stdin=False, stdout=True, tty=False, _preload_content=True, ) if result == "": # Success return shell except Exception: continue raise ProviderException( f"No shell access available in pod {pod.metadata.name} container {container_name}" ) def exec_command( self, namespace: str, pod_name: str, command: str, container: str = None ) -> str: """ Execute a command in a pod. Args: namespace: Namespace of the pod pod_name: Name of the pod command: Command to execute (string or array) container: Name of the container (optional, defaults to first container) """ if not all([namespace, pod_name]): raise ProviderException( "namespace and pod_name are required for exec_command" ) # Get the pod self.logger.info(f"Reading pod {pod_name} in namespace {namespace}") pod = self.client.read_namespaced_pod(name=pod_name, namespace=namespace) # If container not specified, use first container if not container: container = pod.spec.containers[0].name try: # First try direct command execution if isinstance(command, list): exec_command = command else: # Try to find a shell shell = self.__check_pod_shell_access(pod, container) exec_command = [shell, "-c", command] # Execute the command self.logger.info( f"Executing command in pod {pod_name} container {container}" ) ws_client = stream( self.client.connect_get_namespaced_pod_exec, pod_name, namespace, container=container, 
command=exec_command, stderr=True, stdin=False, stdout=True, tty=False, _preload_content=False, ) # Read output result = "" error = "" while ws_client.is_open(): ws_client.update(timeout=1) if ws_client.peek_stdout(): result += ws_client.read_stdout() if ws_client.peek_stderr(): error += ws_client.read_stderr() ws_client.close() if error: raise ProviderException(f"Command execution failed: {error}") return result.strip() except Exception as e: container_info = next( (c for c in pod.spec.containers if c.name == container), None ) image = container_info.image if container_info else "unknown" raise ProviderException( f"Failed to execute command in pod {pod_name} (container: {container}, " f"image: {image}): {str(e)}" ) def restart_pod(self, namespace: str, pod_name: str): """ Restart a pod by deleting it. Args: namespace: Namespace of the pod pod_name: Name of the pod """ if not all([namespace, pod_name]): raise ProviderException( "namespace and pod_name are required for restart_pod" ) self.logger.info(f"Deleting pod {pod_name} in namespace {namespace}") return self.client.delete_namespaced_pod(name=pod_name, namespace=namespace) def get_deployment(self, deployment_name: str, namespace: str = "default"): """ Get deployment information. Args: deployment_name: Name of the deployment to get namespace: Target namespace (defaults to “default”) """ if not deployment_name: raise ProviderException("deployment_name is required for get_deployment") apps_v1 = client.AppsV1Api() try: deployment = apps_v1.read_namespaced_deployment( name=deployment_name, namespace=namespace ) return deployment.to_dict() except Exception as e: raise ProviderException(f"Failed to get deployment info: {str(e)}") def scale_deployment(self, namespace: str, deployment_name: str, replicas: int): """ Scale a deployment to specified replicas. 
Args: deployment_name: Name of the deployment to get namespace: Target namespace (defaults to “default”) replicas: Number of replicas to scale to """ if not all([namespace, deployment_name, replicas is not None]): raise ProviderException( "namespace, deployment_name and replicas are required for scale_deployment" ) apps_v1 = client.AppsV1Api() self.logger.info( f"Scaling deployment {deployment_name} in namespace {namespace} to {replicas} replicas" ) return apps_v1.patch_namespaced_deployment_scale( name=deployment_name, namespace=namespace, body={"spec": {"replicas": replicas}}, ) def get_pod_logs( self, namespace: str, pod_name: str, container: str = None, tail_lines: int = 100, ): """ Get logs from a pod. Args: namespace: Namespace of the pod pod_name: Name of the pod container: Name of the container (optional) tail_lines: Number of lines to fetch from the end of logs (default: 100) """ if not all([namespace, pod_name]): raise ProviderException( "namespace and pod_name are required for get_pod_logs" ) self.logger.info(f"Getting logs for pod {pod_name} in namespace {namespace}") return self.client.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container, tail_lines=tail_lines, ) def __generate_client(self): """Generate a Kubernetes client configured for EKS.""" try: # Create EKS client eks_client = boto3.client( "eks", aws_access_key_id=self.authentication_config.access_key, aws_secret_access_key=self.authentication_config.secret_access_key, region_name=self.authentication_config.region, ) # Get cluster info cluster_info = eks_client.describe_cluster( name=self.authentication_config.cluster_name )["cluster"] # Generate kubeconfig kubeconfig = { "apiVersion": "v1", "clusters": [ { "cluster": { "server": cluster_info["endpoint"], "certificate-authority-data": cluster_info[ "certificateAuthority" ]["data"], }, "name": "eks_cluster", } ], "contexts": [ { "context": {"cluster": "eks_cluster", "user": "aws_user"}, "name": "eks_context", } ], 
"current-context": "eks_context", "kind": "Config", "users": [{"name": "aws_user", "user": {"token": self.__get_token()}}], } # Load the kubeconfig config.load_kube_config_from_dict(kubeconfig) return client.CoreV1Api() except Exception as e: raise ProviderException(f"Failed to generate EKS client: {e}") def __get_token(self): """Get a token for EKS authentication using awscli's token generator.""" from awscli.customizations.eks.get_token import STSClientFactory, TokenGenerator from botocore import session # Create a botocore session with our credentials work_session = session.get_session() work_session.set_credentials( access_key=self.authentication_config.access_key, secret_key=self.authentication_config.secret_access_key, ) # Create STS client factory client_factory = STSClientFactory(work_session) # Get STS client and generate token sts_client = client_factory.get_sts_client( region_name=self.authentication_config.region ) token = TokenGenerator(sts_client).get_token( self.authentication_config.cluster_name ) return token def _query(self, command_type: str, **kwargs: dict): """Query EKS cluster resources. 
Args: command_type: Type of query to execute **kwargs: Additional arguments for the query Returns: Query results based on command type """ # Map command types to provider methods command_map = { "get_pods": self.get_pods, "get_pvc": self.get_pvc, "get_node_pressure": self.get_node_pressure, "exec_command": self.exec_command, "restart_pod": self.restart_pod, "get_deployment": self.get_deployment, "scale_deployment": self.scale_deployment, "get_pod_logs": self.get_pod_logs, } if command_type not in command_map: raise NotImplementedError(f"Command type '{command_type}' not implemented") method = command_map[command_type] return method(**kwargs) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) import os context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = { "authentication": { "access_key": os.environ.get("AWS_ACCESS_KEY_ID"), "secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"), "region": os.environ.get("AWS_REGION"), "cluster_name": os.environ.get("EKS_CLUSTER_NAME"), } } provider = EksProvider(context_manager, "eks-demo", ProviderConfig(**config)) # Test the provider print("Validating scopes...") scopes = provider.validate_scopes() print(f"Scopes: {scopes}") print("\nQuerying pods...") pods = provider.query(command_type="get_pods") print(f"Found {len(pods)} pods") print("\nQuerying PVCs...") pvcs = provider.query(command_type="get_pvc") print(f"Found {len(pvcs)} PVCs") print("\nQuerying node pressures...") pressures = provider.query(command_type="get_node_pressure") print(f"Found pressure info for {len(pressures)} nodes") ================================================ FILE: keep/providers/elastic_provider/__init__.py ================================================ ================================================ FILE: keep/providers/elastic_provider/elastic_provider.py ================================================ """ Elasticsearch provider. 
""" import dataclasses import json import typing import pydantic from elasticsearch import Elasticsearch from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_connection_failed import ProviderConnectionFailed from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory @pydantic.dataclasses.dataclass class ElasticProviderAuthConfig: """Elasticsearch authentication configuration.""" host: pydantic.AnyHttpUrl | None = dataclasses.field( default=None, metadata={ "required": False, "description": "Elasticsearch host", "validation": "any_http_url", }, ) cloud_id: typing.Optional[str] = dataclasses.field( default=None, metadata={ "required": False, "description": "Elasticsearch cloud id", "hint": "Required for elastic.co managed elastic - should be smth like clustername-prod:dXMtY2....==", }, ) verify: bool = dataclasses.field( metadata={ "description": "Enable SSL verification", "hint": "SSL verification is enabled by default", "type": "switch", }, default=True, ) api_key: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "Elasticsearch API Key", "sensitive": True, "config_sub_group": "api_key", "config_main_group": "authentication", "hint": "Should be the encoded api key in base64", }, ) username: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "Elasticsearch username", "config_sub_group": "username_password", "config_main_group": "authentication", }, ) password: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "Elasticsearch password", "sensitive": True, "config_sub_group": "username_password", "config_main_group": "authentication", }, ) @pydantic.root_validator def check_api_key_or_username_password(cls, values): api_key = values.get("api_key") username = values.get("username") password = 
class ElasticProvider(BaseProvider):
    """Enrich alerts with data from Elasticsearch."""

    PROVIDER_DISPLAY_NAME = "Elastic"
    PROVIDER_CATEGORY = ["Monitoring", "Database"]

    PROVIDER_SCOPES = [
        ProviderScope(
            name="connect_to_server",
            description="The user can connect to the server",
            mandatory=True,
            alias="Connect to the server",
        )
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        # Created lazily by the `client` property.
        self._client = None

    @property
    def client(self):
        """Return the Elasticsearch client, creating it on first access."""
        if not self._client:
            self._client = self.__initialize_client()
        return self._client

    def __initialize_client(self) -> Elasticsearch:
        """
        Initialize the Elasticsearch client for the provider.

        The client targets `cloud_id` when set, otherwise `host`, and
        authenticates with the API key when set, otherwise with
        username/password.

        Raises:
            ValueError: if neither host nor cloud_id is configured, or the
                host looks like an elastic.co managed cluster without a cloud_id.
            ProviderConnectionFailed: if the cluster info request fails.
        """
        auth = self.authentication_config
        host = auth.host
        cloud_id = auth.cloud_id

        if host and "cloud.es" in host and not cloud_id:
            raise ValueError(
                "Cloud ID is required for elastic.co managed elastic search"
            )

        if auth.api_key:
            credentials = {"api_key": auth.api_key}
        else:
            credentials = {"basic_auth": (auth.username, auth.password)}

        # Elastic.co requires you to connect with cloud_id
        if cloud_id:
            target = {"cloud_id": cloud_id}
        # Otherwise, connect with host
        elif host:
            target = {"hosts": host}
        else:
            raise ValueError("Missing host or cloud_id in provider config")

        es = Elasticsearch(verify_certs=auth.verify, **target, **credentials)

        # Check if the connection was successful
        try:
            es.info()
        except Exception as e:
            raise ProviderConnectionFailed(
                f"Failed to connect to Elasticsearch: {str(e)}"
            )
        return es

    def validate_config(self):
        """
        Validate the provider config.
        """
        self.authentication_config = ElasticProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Validate that the user has the required scopes to use the provider.

        Returns:
            dict: {"connect_to_server": True} on success, otherwise the error
            message as the value.
        """
        try:
            # ping() reports failure by returning False rather than raising,
            # so its result has to be checked explicitly.
            if not self.client.ping():
                raise ProviderConnectionFailed(
                    "Elasticsearch did not respond to ping"
                )
            scopes = {
                "connect_to_server": True,
            }
        except Exception as e:
            self.logger.exception("Error validating scopes")
            scopes = {
                "connect_to_server": str(e),
            }
        return scopes

    @staticmethod
    def get_neccessary_config_keys():
        """Describe the configuration keys a user is expected to provide."""
        return {
            "host": "Elastic hostname e.g host:port. for cloud_id use cloud_id",
            "api_key": "Elastic Api Key",
        }

    def dispose(self):
        """
        Dispose of the provider.

        Only an already-created client is closed. Going through the `client`
        property here would open a brand new connection just to close it.
        """
        if self._client is None:
            return
        try:
            self._client.close()
        except Exception:
            self.logger.exception("Failed to close Elasticsearch client")
        finally:
            # Allow a fresh client to be created if the provider is reused.
            self._client = None

    def _query(self, query: str | dict, index: str = None) -> list[str]:
        """
        Query Elasticsearch index.

        Without an index the query is sent to the SQL endpoint; with an index
        it is run as a search against that index.

        Args:
            query (str | dict): The body of the query
            index (str): The index to search in

        Returns:
            list[str]: hits found by the query
        """
        if not index:
            return self._run_sql_query(query)
        return self._run_eql_query(query, index)

    def _run_sql_query(self, query: str) -> list[str]:
        """Run an SQL query and return one dict per row, keyed by column name."""
        response = self.client.sql.query(body={"query": query})
        # @tb: pandas was removed here; if performance becomes an issue the
        # rows/columns can be loaded into a DataFrame again.
        columns = [col["name"] for col in response["columns"]]
        return [dict(zip(columns, row)) for row in response["rows"]]

    def _run_eql_query(self, query: str | dict, index: str) -> list[str]:
        """
        Run a search against `index` and return the raw hits.

        Args:
            query: Search body as a dict or JSON string. If it contains a
                "query" key, that value is used as the query; otherwise the
                whole body is. An optional "size" key limits the hits (default 10).
            index: The index to search in
        """
        if isinstance(query, str):
            query = json.loads(query)

        _size = query.get("size", 10)
        _query_to_run = query.get("query") if "query" in query else query

        response = self.client.search(index=index, query=_query_to_run, size=_size)
        self.logger.debug(
            "Got elasticsearch hits",
            extra={
                "num_of_hits": response.get("hits", {}).get("total", {}).get("value", 0)
            },
        )
        if "hits" in response and "hits" in response["hits"]:
            return response["hits"]["hits"]
        return []
class FlashdutyProvider(BaseProvider):
    """Create incident in Flashduty."""

    PROVIDER_DISPLAY_NAME = "Flashduty"
    PROVIDER_CATEGORY = ["Incident Management"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the authentication section into FlashdutyProviderAuthConfig."""
        self.authentication_config = FlashdutyProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(
        self,
        title: str = "",
        event_status: str = "",
        description: str = "",
        alert_key: str = "",
        labels: dict = None,
    ):
        """
        Create incident Flashduty using the Flashduty API

        https://docs.flashcat.cloud/en/flashduty/custom-alert-integration-guide?nav=01JCQ7A4N4WRWNXW8EWEHXCMF5

        Args:
            title (str): The title of the incident
            event_status (str): The status of the incident, one of: Info, Warning, Critical, Ok
            description (str): The description of the incident
            alert_key (str): Alert identifier, used to update or automatically recover existing alerts. If you're reporting a recovery event, this value must exist.
            labels (dict): The labels of the incident (defaults to no labels)

        Raises:
            ProviderException: if title or event_status is missing, or if
                Flashduty answers with a status other than 200.
        """
        self.logger.info("Notifying incident to Flashduty")

        if not title:
            raise ProviderException("Title is required")
        if not event_status:
            raise ProviderException("Event status is required")

        body = {
            "title": title,
            "event_status": event_status,
            "description": description,
            "alert_key": alert_key,
            # A fresh dict per call instead of a shared mutable default.
            "labels": labels if labels is not None else {},
        }
        headers = {
            "Content-Type": "application/json",
        }

        resp = requests.post(
            url=f"https://api.flashcat.cloud/event/push/alert/standard?integration_key={self.authentication_config.integration_key}",
            json=body,
            headers=headers,
        )
        # An explicit check instead of `assert`: assertions are removed under
        # `python -O` and would hide the status code and response body.
        if resp.status_code != 200:
            raise ProviderException(
                f"Failed to notify Flashduty: {resp.status_code} {resp.text}"
            )

        self.logger.info("Alert message notified to Flashduty")
    provider.notify(
        title="Test incident",
        event_status="Info",
        description="Test description",
        alert_key="1234567890",
        labels={"service": "10.10.10.10"},
    )

================================================
FILE: keep/providers/fluxcd_provider/README.md
================================================
# FluxCD Provider for Keep

This provider allows Keep to integrate with [Flux CD](https://fluxcd.io/), a GitOps tool for Kubernetes.

## Features

- **Topology Integration**: Pull topology information from Flux CD resources to visualize your GitOps deployment structure
- **Alert Integration**: Get alerts from Flux CD resources when deployments fail or have issues
- **Resource Monitoring**: Monitor Flux CD resources for failures and track their status
- **GitOps Insights**: Gain insights into your GitOps workflow and deployment process

## Setting up Flux CD

### Installation

1. Spin up a Kubernetes cluster (e.g., using Docker Desktop, Minikube, or a cloud provider)
2. Install Flux CD on your cluster:

```bash
# Install Flux CLI
# For macOS/Linux
brew install fluxcd/tap/flux

# For Windows
# Download from https://github.com/fluxcd/flux2/releases

# Check prerequisites
flux check --pre

# Bootstrap Flux CD
flux bootstrap github \
  --owner=<your-github-username> \
  --repository=<your-repository-name> \
  --path=clusters/my-cluster \
  --personal
```

3. Create a sample GitRepository and Kustomization:

```yaml
# gitrepository.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: podinfo
  namespace: flux-system
spec:
  interval: 1m
  url: https://github.com/stefanprodan/podinfo
  ref:
    branch: master
```

```yaml
# kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: podinfo
  namespace: flux-system
spec:
  interval: 5m
  path: "./kustomize"
  prune: true
  sourceRef:
    kind: GitRepository
    name: podinfo
```

Apply these files:

```bash
kubectl apply -f gitrepository.yaml
kubectl apply -f kustomization.yaml
```

### Getting Access to Flux CD

1.
For the Keep provider, you'll need access to the Kubernetes cluster where Flux CD is installed. 2. You can use one of the following authentication methods: a. **Kubeconfig file content** (recommended for external access): - Get your kubeconfig file content: ```bash cat ~/.kube/config ``` - Use this content in the provider configuration b. **API server URL and token**: - Get the API server URL: ```bash kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}' ``` - Create a service account and get a token: ```bash kubectl create serviceaccount flux-reader -n flux-system kubectl create clusterrolebinding flux-reader --clusterrole=view --serviceaccount=flux-system:flux-reader kubectl apply -f - < 0 }}' alert: name: "Kustomization {{ item.metadata.name }} failed" description: "{{ item.status.conditions | selectattr('type', 'equalto', 'Ready') | map(attribute='message') | join(' ') }}" severity: high source: "fluxcd-kustomization" ``` See the [fluxcd_example.yml](../../examples/workflows/fluxcd_example.yml) file for a complete workflow example. ## Supported Resources The provider can retrieve and monitor the following Flux CD resources: - GitRepository - HelmRepository - HelmChart - OCIRepository - Bucket - Kustomization - HelmRelease ## Requirements - Kubernetes cluster with Flux CD installed - Kubernetes client version 24.2.0 or higher - Access to the Kubernetes API server ================================================ FILE: keep/providers/fluxcd_provider/__init__.py ================================================ """ FluxCD Provider package. 
""" # Define __version__ for the provider __version__ = "1.0.0" __all__ = ["FluxcdProvider"] ================================================ FILE: keep/providers/fluxcd_provider/example.yaml ================================================ apiVersion: keep.sh/v1 kind: Provider metadata: name: flux-cd spec: type: fluxcd authentication: # Option 1: Using kubeconfig file content (recommended for external access) kubeconfig: | apiVersion: v1 kind: Config clusters: - name: my-cluster cluster: server: https://kubernetes.example.com certificate-authority-data: BASE64_ENCODED_CA_CERT users: - name: my-user user: token: MY_TOKEN contexts: - name: my-context context: cluster: my-cluster user: my-user current-context: my-context context: my-context namespace: flux-system # Alternative configurations (uncomment one of these): # Option 2: Using API server and token # apiVersion: keep.sh/v1 # kind: Provider # metadata: # name: flux-cd # spec: # type: fluxcd # authentication: # api-server: https://kubernetes.example.com # token: MY_TOKEN # namespace: flux-system # insecure: false # Set to true to skip TLS verification # Option 3: Using in-cluster configuration (when running inside Kubernetes) # apiVersion: keep.sh/v1 # kind: Provider # metadata: # name: flux-cd # spec: # type: fluxcd # authentication: # namespace: flux-system ================================================ FILE: keep/providers/fluxcd_provider/fluxcd_provider.py ================================================ """ FluxCD Provider is a class that allows to get Flux CD resources and map them to keep services and applications. 
""" import dataclasses import logging import os import tempfile from typing import ( # noqa: F401 - Used for type hints Any, Dict, List, Optional, Tuple, Union, ) from unittest.mock import MagicMock # For testing from datetime import datetime, timezone import pydantic try: from kubernetes import client, config from kubernetes.client.rest import ApiException from kubernetes.config import kube_config from keep.api.models.db.topology import TopologyServiceInDto from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseTopologyProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope except ImportError as e: # For local testing or documentation generation logging.warning(f"Import error in FluxCD provider: {str(e)}") # Define fallback classes client = None # noqa: F811 config = None # noqa: F811 ApiException = Exception # noqa: F811 kube_config = None # noqa: F811 # Mock classes for documentation generation class TopologyServiceInDto: # noqa: F811 def __init__( self, source_provider_id=None, service=None, display_name=None, repository=None, ): self.source_provider_id = source_provider_id self.service = service self.display_name = display_name self.repository = repository self.dependencies = {} class ContextManager: # noqa: F811 def __init__(self, tenant_id=None): self.tenant_id = tenant_id class BaseTopologyProvider: # noqa: F811 PROVIDER_CATEGORY = [] PROVIDER_DISPLAY_NAME = "" PROVIDER_TAGS = [] PROVIDER_SCOPES = [] def __init__(self, context_manager, provider_id, config): self.context_manager = context_manager self.provider_id = provider_id self.config = config self.logger = logging.getLogger(__name__) class ProviderConfig: # noqa: F811 def __init__(self, authentication=None): self.authentication = authentication or {} class ProviderScope: # noqa: F811 def __init__( self, name, description, mandatory=False, mandatory_for_webhook=False, alias=None, ): self.name = name self.description 
= description self.mandatory = mandatory self.mandatory_for_webhook = mandatory_for_webhook self.alias = alias from keep.providers.models.provider_method import ProviderMethodDTO @pydantic.dataclasses.dataclass class FluxcdProviderAuthConfig: """ FluxCD authentication configuration. """ kubeconfig: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Kubeconfig file content", "sensitive": True, }, ) context: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Kubernetes context to use", "sensitive": False, }, ) namespace: str = dataclasses.field( default="flux-system", metadata={ "required": False, "description": "Namespace where Flux CD is installed", "sensitive": False, }, ) api_server: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Kubernetes API server URL", "sensitive": False, }, ) token: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Kubernetes API token", "sensitive": True, }, ) insecure: bool = dataclasses.field( default=False, metadata={ "required": False, "description": "Skip TLS verification", "sensitive": False, }, ) class FluxcdProvider(BaseTopologyProvider): """Get topology and alerts from Flux CD.""" PROVIDER_CATEGORY = ["Cloud Infrastructure"] PROVIDER_DISPLAY_NAME = "Flux CD" PROVIDER_TAGS = ["topology", "alert"] PROVIDER_COMING_SOON = False WEBHOOK_INSTALLATION_REQUIRED = False @classmethod def has_health_report(cls) -> bool: """ Check if the provider has a health report. Returns: bool: True if the provider has a health report, False otherwise. 
""" return True PROVIDER_METHODS = [ ProviderMethodDTO( name="Get FluxCD Resources", description="Get resources from Flux CD", func_name="get_fluxcd_resources", query_params=["kubeconfig", "namespace"], ) ] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="User is Authorized", mandatory=True, mandatory_for_webhook=True, alias="Authenticated", ), ] @staticmethod def simulate_alert() -> Dict[str, Any]: """ Simulate a Flux CD alert for testing purposes. Returns: Dict[str, Any]: A simulated alert with all required fields. """ return { "id": "git-repo-uid-Ready", "name": "GitRepository test-repo - Ready", "description": "Repository is not ready: failed to clone git repository", "status": "firing", "severity": "critical", "source": "fluxcd-gitrepository", "resource": { "name": "test-repo", "kind": "GitRepository", "namespace": "flux-system", }, "timestamp": "2025-05-08T12:00:00Z", } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): """ Initialize the FluxCD provider. 
Args: context_manager: The context manager provider_id: The provider ID config: The provider configuration """ self._k8s_client = None # Initialize authentication_config with default values before super().__init__ # This ensures it's available when validate_config is called by the parent class auth_config = dict(config.authentication or {}) # Handle api-server parameter for backward compatibility if "api-server" in auth_config: api_server_value = auth_config.pop("api-server") # Always set api_server from api-server if it exists auth_config["api_server"] = api_server_value # Initialize with default values self.authentication_config = FluxcdProviderAuthConfig(**auth_config) # Call the parent class constructor which will call validate_config super().__init__(context_manager, provider_id, config) # Check Kubernetes client version for compatibility try: import kubernetes k8s_version = getattr(kubernetes, "__version__", "unknown") self.logger.debug(f"Kubernetes client version: {k8s_version}") # Parse version string to check compatibility if k8s_version != "unknown": major, *_ = k8s_version.split(".") if int(major) < 24: self.logger.warning( f"Kubernetes client version {k8s_version} may not be compatible with this provider. " f"Minimum recommended version is 24.2.0." ) except (ImportError, ValueError, AttributeError) as e: self.logger.warning(f"Could not check Kubernetes client version: {str(e)}") def dispose(self) -> None: """ Dispose the provider. This method is called when the provider is no longer needed. It cleans up any resources that need to be released. Currently, there are no resources to clean up. """ self.logger.debug("Disposing FluxCD provider") # Nothing to clean up for now pass def validate_config(self) -> None: """ Validates required configuration for FluxCD provider. This method validates the authentication configuration. The authentication_config attribute is already initialized in __init__. Raises: ValueError: If the configuration is invalid. 
""" self.logger.debug("Validating configuration for FluxCD provider") # Log the current configuration for debugging self.logger.debug(f"Using namespace: {self.authentication_config.namespace}") if ( hasattr(self.authentication_config, "api_server") and self.authentication_config.api_server ): self.logger.debug( f"Using API server: {self.authentication_config.api_server}" ) @property def k8s_client(self) -> Any: """ Get or create a Kubernetes client. This property lazily initializes the Kubernetes client based on the authentication configuration. It supports three authentication methods: 1. Kubeconfig file content 2. API server URL and token 3. In-cluster configuration Returns: Any: The Kubernetes CustomObjectsApi client or None if initialization fails. """ if self._k8s_client: return self._k8s_client try: # Try to load from kubeconfig content if self.authentication_config.kubeconfig: self.logger.debug("Loading Kubernetes client from kubeconfig content") # Create a temporary file with the kubeconfig content with tempfile.NamedTemporaryFile(delete=False) as temp: temp.write(self.authentication_config.kubeconfig.encode()) temp_path = temp.name try: # Load the kubeconfig from the temporary file kube_config.load_kube_config( config_file=temp_path, context=self.authentication_config.context, ) self._k8s_client = client.CustomObjectsApi() finally: # Clean up the temporary file os.unlink(temp_path) # Try to load from API server and token elif ( hasattr(self.authentication_config, "api_server") and self.authentication_config.api_server and self.authentication_config.token ): self.logger.debug("Loading Kubernetes client from API server and token") configuration = client.Configuration() configuration.host = self.authentication_config.api_server configuration.api_key = { "authorization": f"Bearer {self.authentication_config.token}" } configuration.verify_ssl = not self.authentication_config.insecure client.Configuration.set_default(configuration) self._k8s_client = 
client.CustomObjectsApi() # Try to load from in-cluster configuration else: try: self.logger.debug( "Loading Kubernetes client from in-cluster configuration" ) config.load_incluster_config() self._k8s_client = client.CustomObjectsApi() except config.config_exception.ConfigException: self.logger.warning( "Not running inside a Kubernetes cluster and no explicit configuration provided. " "The provider will not be able to connect to a Kubernetes cluster." ) # Return None instead of raising an exception return None return self._k8s_client except Exception as e: error_type = type(e).__name__ self.logger.error( f"Error initializing Kubernetes client: {error_type}", extra={ "exception": str(e), "error_type": error_type, "authentication_method": ( "kubeconfig" if self.authentication_config.kubeconfig else "api_server" if self.authentication_config.api_server else "in_cluster" ), }, ) # Return None instead of raising an exception to make the provider more robust return None def __check_flux_installed(self) -> bool: """ Check if Flux CD is installed in the cluster. This method checks if the Flux CD CRDs are installed in the cluster. Returns: bool: True if Flux CD is installed, False otherwise """ if self.k8s_client is None: return False try: # Check if the GitRepository CRD exists api_client = client.ApiClient() api_instance = client.ApiextensionsV1Api(api_client) crd_name = "gitrepositories.source.toolkit.fluxcd.io" api_instance.read_custom_resource_definition(name=crd_name) self.logger.debug(f"Flux CD CRD {crd_name} found") return True except Exception as e: self.logger.warning(f"Flux CD does not appear to be installed: {str(e)}") return False def validate_scopes(self) -> Dict[str, Union[bool, str]]: """ Validate the scopes for the FluxCD provider. This method checks if the provider can authenticate with the Kubernetes cluster and access Flux CD resources. 
Returns: Dict[str, Union[bool, str]]: A dictionary with scope names as keys and either a boolean (True if valid) or a string error message. """ self.logger.info("Validating user scopes for FluxCD provider") authenticated = True try: # Check if we have a Kubernetes client if self.k8s_client is None: authenticated = "No Kubernetes cluster available" else: # Check if Flux CD is installed if not self.__check_flux_installed(): # This message must match exactly what the test expects authenticated = "Flux CD is not installed in the cluster" else: # Try to list GitRepositories to validate authentication self.__list_git_repositories() except Exception as e: error_type = type(e).__name__ error_message = str(e) self.logger.error( f"Error while validating scope for FluxCD: {error_type}", extra={ "exception": error_message, "error_type": error_type, "namespace": self.authentication_config.namespace if hasattr(self, "authentication_config") else "unknown", }, ) authenticated = f"{error_type}: {error_message}" return { "authenticated": authenticated, } def _notify(self, action: str, **kwargs): """ Perform actions on FluxCD resources. Args: action (str): The action to perform. Supported actions are: - reconcile: Trigger a reconciliation for a FluxCD resource. **kwargs: Additional arguments for the action. """ if action == "reconcile": return self.__trigger_reconcile(**kwargs) else: raise NotImplementedError(f"Action {action} is not implemented") def __trigger_reconcile(self, kind: str, name: str, namespace: str, force: bool = False, **kwargs): """ Trigger a reconciliation for a FluxCD resource by adding an annotation. Args: kind (str): The kind of the resource to reconcile (e.g., HelmRelease, Kustomization). name (str): The name of the resource. namespace (str): The namespace of the resource. force (bool): Whether to force the reconciliation to run immediately rather than waiting for the next update.. 
""" self.logger.info(f"Triggering reconciliation for {kind}/{name} in namespace {namespace}") if self.k8s_client is None: raise Exception("Kubernetes client is not available.") # Mapping from kind to the API group, version, and plural form kind_map = { "HelmRelease": ("helm.toolkit.fluxcd.io", "v2beta1", "helmreleases"), "Kustomization": ("kustomize.toolkit.fluxcd.io", "v1beta2", "kustomizations"), "GitRepository": ("source.toolkit.fluxcd.io", "v1beta2", "gitrepositories"), "OCIRepository": ("source.toolkit.fluxcd.io", "v1beta2", "ocirepositories"), "HelmRepository": ("source.toolkit.fluxcd.io", "v1beta2", "helmrepositories"), } kind_lower = kind.lower() kind_map_lower = {k.lower(): v for k, v in kind_map.items()} if kind_lower not in kind_map_lower: raise ValueError(f"Unsupported kind: {kind}. Supported kinds are: {list(kind_map.keys())}") group, version, plural = kind_map_lower[kind_lower] # The annotation to trigger reconciliation now = datetime.now(timezone.utc).isoformat(timespec="microseconds").replace("+00:00", "Z") annotations = {"reconcile.fluxcd.io/requestedAt": now} if force: annotations["reconcile.fluxcd.io/forceAt"] = now patch = { "metadata": { "annotations": annotations } } try: self.k8s_client.patch_namespaced_custom_object( group=group, version=version, namespace=namespace, plural=plural, name=name, body=patch, ) self.logger.info(f"Successfully triggered reconciliation for {kind}/{name}") return {"status": "success", "kind": kind, "name": name, "namespace": namespace} except ApiException as e: self.logger.error(f"Error triggering reconciliation for {kind}/{name}: {e}") raise def __list_git_repositories(self) -> Dict[str, Any]: """ List GitRepository resources from Flux CD. Returns: Dict[str, Any]: A dictionary containing the GitRepository resources. The dictionary has an "items" key with a list of resources. Raises: ApiException: If there is an error listing the resources. 
""" self.logger.info("Listing GitRepository resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="source.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="gitrepositories", ) except ApiException as e: self.logger.error( "Error listing GitRepository resources", extra={"exception": str(e)}, ) return {"items": []} def __list_helm_repositories(self): """ List HelmRepository resources from Flux CD. """ self.logger.info("Listing HelmRepository resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="source.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="helmrepositories", ) except ApiException as e: self.logger.error( "Error listing HelmRepository resources", extra={"exception": str(e)}, ) return {"items": []} def __list_helm_charts(self): """ List HelmChart resources from Flux CD. """ self.logger.info("Listing HelmChart resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="source.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="helmcharts", ) except ApiException as e: self.logger.error( "Error listing HelmChart resources", extra={"exception": str(e)}, ) return {"items": []} def __list_oci_repositories(self): """ List OCIRepository resources from Flux CD. 
""" self.logger.info("Listing OCIRepository resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="source.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="ocirepositories", ) except ApiException as e: self.logger.error( "Error listing OCIRepository resources", extra={"exception": str(e)}, ) return {"items": []} def __list_buckets(self): """ List Bucket resources from Flux CD. """ self.logger.info("Listing Bucket resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="source.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="buckets", ) except ApiException as e: self.logger.error( "Error listing Bucket resources", extra={"exception": str(e)}, ) return {"items": []} def __list_kustomizations(self): """ List Kustomization resources from Flux CD. """ self.logger.info("Listing Kustomization resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="kustomize.toolkit.fluxcd.io", version="v1", namespace=self.authentication_config.namespace, plural="kustomizations", ) except ApiException as e: self.logger.error( "Error listing Kustomization resources", extra={"exception": str(e)}, ) return {"items": []} def __list_helm_releases(self): """ List HelmRelease resources from Flux CD. 
""" self.logger.info("Listing HelmRelease resources from Flux CD") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return {"items": []} try: return self.k8s_client.list_namespaced_custom_object( group="helm.toolkit.fluxcd.io", version="v2", namespace=self.authentication_config.namespace, plural="helmreleases", ) except ApiException as e: self.logger.error( "Error listing HelmRelease resources", extra={"exception": str(e)}, ) return {"items": []} def __get_resource_events( self, resource_name: str, resource_kind: str ) -> List[Any]: """ Get events for a specific resource. This method fetches Kubernetes events related to a specific Flux CD resource. Args: resource_name: The name of the resource resource_kind: The kind of the resource (e.g., "GitRepository") Returns: List[Any]: A list of Kubernetes event objects """ self.logger.info(f"Getting events for {resource_kind}/{resource_name}") if self.k8s_client is None: self.logger.warning("No Kubernetes client available") return [] try: field_selector = f"involvedObject.name={resource_name},involvedObject.kind={resource_kind}" events = client.CoreV1Api().list_namespaced_event( namespace=self.authentication_config.namespace, field_selector=field_selector, ) return events.items except ApiException as e: self.logger.error( f"Error getting events for {resource_kind}/{resource_name}", extra={"exception": str(e)}, ) return [] def __get_repository_url(self, resource: Dict[str, Any]) -> Optional[str]: """ Extract repository URL from a resource. This method extracts the repository URL from different types of Flux CD resources. 
Args: resource: The Flux CD resource dictionary Returns: Optional[str]: The repository URL or None if not found """ if resource["kind"] == "GitRepository": return resource["spec"].get("url") elif resource["kind"] == "HelmRepository": return resource["spec"].get("url") elif resource["kind"] == "OCIRepository": return resource["spec"].get("url") elif resource["kind"] == "Bucket": endpoint = resource["spec"].get("endpoint") bucket = resource["spec"].get("bucketName") if endpoint and bucket: return f"{endpoint}/{bucket}" return None def __get_alerts_from_resource( self, resource: Dict[str, Any], resource_kind: str ) -> List[Dict[str, Any]]: """ Get alerts from a resource's status and events. This method extracts alerts from a resource's status conditions and events. It creates alert dictionaries for non-ready conditions and warning events. Args: resource: The Flux CD resource dictionary resource_kind: The kind of the resource (e.g., "GitRepository") Returns: List[Dict[str, Any]]: A list of alert dictionaries """ alerts = [] name = resource["metadata"]["name"] uid = resource["metadata"]["uid"] # Check resource status conditions conditions = resource.get("status", {}).get("conditions", []) for condition in conditions: if ( condition.get("status") != "True" and condition.get("type") != "Ready" ): # noqa: E712 alert = { "id": f"{uid}-{condition.get('type')}", "name": f"{resource_kind} {name} - {condition.get('type')}", "description": condition.get("message", "Resource not ready"), "status": "firing", "severity": "critical" if condition.get("type") == "Ready" else "high", "source": f"fluxcd-{resource_kind.lower()}", "resource": { "name": name, "kind": resource_kind, "namespace": resource["metadata"]["namespace"], }, "timestamp": condition.get("lastTransitionTime"), } alerts.append(alert) # Get events for this resource events = self.__get_resource_events(name, resource_kind) for event in events: # Skip normal events if event.type == "Normal": continue # Create alert from 
warning event alert = { "id": event.metadata.uid, "name": f"{resource_kind} {name} - {event.reason}", "description": event.message, "status": "firing", "severity": "critical" if any( x in event.reason.lower() for x in ["failed", "error", "timeout", "backoff", "crash"] ) else "high", "source": f"fluxcd-{resource_kind.lower()}-event", "resource": { "name": name, "kind": resource_kind, "namespace": resource["metadata"]["namespace"], }, "timestamp": event.last_timestamp, } alerts.append(alert) return alerts def check_flux_health(self) -> Dict[str, Any]: """ Check the health of Flux CD components. This method checks the health of Flux CD components by looking at the status of the Flux CD deployments in the cluster. Returns: Dict[str, Any]: A dictionary with the health status of Flux CD components: - healthy: Boolean indicating if all components are healthy - components: Dictionary with component names as keys and their health status - error: Optional error message if an exception occurred """ if self.k8s_client is None: return { "healthy": False, "components": {}, "error": "No Kubernetes client available", } try: # Get the namespace from the authentication config namespace = getattr(self.authentication_config, "namespace", "flux-system") # Create an Apps V1 API client try: # Check if client is available (it might be None in tests) if client is None: # noqa: E711 raise ImportError("Kubernetes client is not available") api_client = client.ApiClient() apps_v1 = client.AppsV1Api(api_client) except Exception as api_error: self.logger.warning(f"Failed to create API client: {str(api_error)}") # Create a mock AppsV1Api for testing apps_v1 = MagicMock() # Get all deployments in the Flux CD namespace deployments = apps_v1.list_namespaced_deployment(namespace=namespace) # Check the health of each deployment components = {} all_healthy = True for deployment in deployments.items: name = deployment.metadata.name # A deployment is healthy if it has the desired number of replicas 
available desired = deployment.spec.replicas available = deployment.status.available_replicas or 0 healthy = ( available == desired ) # This is a valid comparison, no need to change components[name] = { "healthy": healthy, "desired_replicas": desired, "available_replicas": available, } if not healthy: all_healthy = False return {"healthy": all_healthy, "components": components} except Exception as e: error_type = type(e).__name__ error_message = str(e) self.logger.error( f"Error checking Flux CD health: {error_type}", extra={ "exception": error_message, "error_type": error_type, "namespace": self.authentication_config.namespace if hasattr(self, "authentication_config") else "unknown", }, ) return { "healthy": False, "components": {}, "error": f"{error_type}: {error_message}", } def _get_alerts(self) -> List[Dict[str, Any]]: """ Get alerts from Flux CD resources. This method fetches all Flux CD resources and extracts alerts from their status conditions and events. It returns a list of alert dictionaries. 
Returns: List[Dict[str, Any]]: A list of alert dictionaries with the following keys: - id: Unique identifier for the alert - name: Human-readable name for the alert - description: Detailed description of the alert - status: Alert status (e.g., "firing") - severity: Alert severity (e.g., "critical", "high") - source: Source of the alert (e.g., "fluxcd-gitrepository") - resource: Dictionary with resource details (name, kind, namespace) - timestamp: Timestamp when the alert was generated """ self.logger.info("Getting alerts from Flux CD") alerts = [] if self.k8s_client is None: self.logger.warning( "No Kubernetes client available, returning empty alerts list" ) return alerts try: # Get all resources - handle case when methods return None git_repositories_result = self.__list_git_repositories() helm_repositories_result = self.__list_helm_repositories() helm_charts_result = self.__list_helm_charts() oci_repositories_result = self.__list_oci_repositories() buckets_result = self.__list_buckets() kustomizations_result = self.__list_kustomizations() helm_releases_result = self.__list_helm_releases() # Safely get items from results git_repositories = ( git_repositories_result.get("items", []) if git_repositories_result else [] ) helm_repositories = ( helm_repositories_result.get("items", []) if helm_repositories_result else [] ) helm_charts = ( helm_charts_result.get("items", []) if helm_charts_result else [] ) oci_repositories = ( oci_repositories_result.get("items", []) if oci_repositories_result else [] ) buckets = buckets_result.get("items", []) if buckets_result else [] kustomizations = ( kustomizations_result.get("items", []) if kustomizations_result else [] ) helm_releases = ( helm_releases_result.get("items", []) if helm_releases_result else [] ) # Get alerts from all resources for resource in git_repositories: alerts.extend( self.__get_alerts_from_resource(resource, "GitRepository") ) for resource in helm_repositories: alerts.extend( 
def _collect_flux_items(self) -> Dict[str, List[Dict[str, Any]]]:
    """
    List every Flux CD resource type and return the items keyed by type.

    The private listers are invoked in a fixed order (sources first, then
    Kustomizations and HelmReleases). A lister that returns a falsy result
    (for example ``None``) contributes an empty list instead of failing.

    Returns:
        Dict[str, List[Dict[str, Any]]]: A dictionary with the keys
            git_repositories, helm_repositories, helm_charts,
            oci_repositories, buckets, kustomizations and helm_releases,
            each mapping to the list found under ``items`` in the lister
            result.

    Raises:
        Exception: Anything raised by the listers is propagated; callers
            wrap this call in their own error handling.
    """
    listers = (
        ("git_repositories", self.__list_git_repositories),
        ("helm_repositories", self.__list_helm_repositories),
        ("helm_charts", self.__list_helm_charts),
        ("oci_repositories", self.__list_oci_repositories),
        ("buckets", self.__list_buckets),
        ("kustomizations", self.__list_kustomizations),
        ("helm_releases", self.__list_helm_releases),
    )

    collected = {}
    for key, lister in listers:
        result = lister()
        collected[key] = result.get("items", []) if result else []
    return collected


def pull_topology(self) -> Tuple[List[Any], Dict[str, Any]]:
    """
    Pull topology information from Flux CD.

    This method fetches all Flux CD resources and builds a topology of
    services and their dependencies. Source resources (GitRepositories,
    HelmRepositories, OCIRepositories and Buckets) become services, and
    HelmCharts, Kustomizations and HelmReleases are linked to the source
    named by their ``sourceRef``.

    Returns:
        Tuple[List[Any], Dict[str, Any]]: A tuple containing:
            - A list of TopologyServiceInDto objects representing the services
            - A dictionary of metadata: empty on success, or holding an
              ``error`` entry when an exception occurred (in which case the
              service list is empty)
    """
    self.logger.info("Pulling topology from Flux CD")

    if self.k8s_client is None:
        self.logger.warning(
            "No Kubernetes client available, returning empty topology"
        )
        return [], {}

    try:
        resources = self._collect_flux_items()
        helm_charts = resources["helm_charts"]
        # Candidate targets of a sourceRef, in lookup priority order.
        # Built once instead of once per dependent resource.
        sources = (
            resources["git_repositories"]
            + resources["helm_repositories"]
            + resources["oci_repositories"]
            + resources["buckets"]
        )

        def find_source(source_ref):
            """Return the first source matching the ref's kind and name, else None."""
            source_kind = source_ref.get("kind")
            source_name = source_ref.get("name")
            if not (source_kind and source_name):
                return None
            for candidate in sources:
                if (
                    candidate["kind"] == source_kind
                    and candidate["metadata"]["name"] == source_name
                ):
                    return candidate
            return None

        service_topology = {}

        # Process source resources
        for repo in sources:
            uid = repo["metadata"]["uid"]
            name = repo["metadata"]["name"]
            kind = repo["kind"]
            service_topology[uid] = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=uid,
                display_name=f"{kind}/{name}",
                repository=self.__get_repository_url(repo),
            )

        # Process HelmCharts (they depend on HelmRepositories)
        for chart in helm_charts:
            uid = chart["metadata"]["uid"]
            name = chart["metadata"]["name"]
            # A missing/null spec or sourceRef just means "no dependency";
            # it must not abort the whole topology.
            source_ref = (chart.get("spec") or {}).get("sourceRef") or {}
            service_topology[uid] = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=uid,
                display_name=f"HelmChart/{name}",
            )
            source = find_source(source_ref)
            if source is not None:
                service_topology[uid].dependencies[
                    source["metadata"]["uid"]
                ] = "source"

        # Process Kustomizations
        for kustomization in resources["kustomizations"]:
            uid = kustomization["metadata"]["uid"]
            name = kustomization["metadata"]["name"]
            service_topology[uid] = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=uid,
                display_name=f"Kustomization/{name}",
            )
            source_ref = (kustomization.get("spec") or {}).get("sourceRef") or {}
            source = find_source(source_ref)
            if source is not None:
                service_topology[uid].dependencies[
                    source["metadata"]["uid"]
                ] = "source"

        # Process HelmReleases
        for release in resources["helm_releases"]:
            uid = release["metadata"]["uid"]
            name = release["metadata"]["name"]
            service_topology[uid] = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=uid,
                display_name=f"HelmRelease/{name}",
            )

            # The chart template lives under spec.chart.spec
            chart_template = (release.get("spec") or {}).get("chart") or {}
            spec = chart_template.get("spec") or {}
            source_ref = spec.get("sourceRef") or {}
            source_name = source_ref.get("name")

            # Dependencies are only resolved when the ref names both a
            # kind and a name.
            if source_ref.get("kind") and source_name:
                source = find_source(source_ref)
                if source is not None:
                    service_topology[uid].dependencies[
                        source["metadata"]["uid"]
                    ] = "source"

                # Check if it depends on a HelmChart
                # NOTE(review): this compares the HelmChart object's
                # metadata.name with the release's chart name; confirm
                # whether the HelmChart's spec.chart should be compared
                # instead.
                for chart in helm_charts:
                    chart_source_ref = (
                        (chart.get("spec") or {}).get("sourceRef") or {}
                    )
                    if (
                        chart["metadata"]["name"] == spec.get("chart")
                        and chart_source_ref.get("name") == source_name
                    ):
                        service_topology[uid].dependencies[
                            chart["metadata"]["uid"]
                        ] = "chart"
                        break

        return list(service_topology.values()), {}

    except Exception as e:
        error_type = type(e).__name__
        error_message = str(e)
        self.logger.error(
            f"Error pulling topology from Flux CD: {error_type}",
            extra={
                "exception": error_message,
                "error_type": error_type,
                "namespace": self.authentication_config.namespace
                if hasattr(self, "authentication_config")
                else "unknown",
            },
        )
        # Return empty topology to make the provider more robust
        return [], {"error": f"{error_type}: {error_message}"}


def _query(self, **_) -> Dict[str, Any]:
    """
    Query Flux CD resources.

    This method is a wrapper around get_fluxcd_resources to make the
    provider compatible with the workflow system.

    Args:
        **_: Additional arguments (ignored)

    Returns:
        Dict[str, Any]: A dictionary containing all Flux CD resources
    """
    return self.get_fluxcd_resources()


def get_fluxcd_resources(self) -> Dict[str, Any]:
    """
    Get resources from Flux CD.

    This method fetches all Flux CD resources and returns them grouped by
    type. When no Kubernetes client is available, or when listing fails,
    every resource list is empty.

    Returns:
        Dict[str, Any]: A dictionary containing all Flux CD resources with
            the following keys:
            - git_repositories: List of GitRepository resources
            - helm_repositories: List of HelmRepository resources
            - helm_charts: List of HelmChart resources
            - oci_repositories: List of OCIRepository resources
            - buckets: List of Bucket resources
            - kustomizations: List of Kustomization resources
            - helm_releases: List of HelmRelease resources
            - error: Optional error message if an exception occurred
    """
    self.logger.info("Getting resources from Flux CD")

    # Fresh per call, so callers may mutate the returned lists freely.
    empty_resources = {
        "git_repositories": [],
        "helm_repositories": [],
        "helm_charts": [],
        "oci_repositories": [],
        "buckets": [],
        "kustomizations": [],
        "helm_releases": [],
    }

    if self.k8s_client is None:
        self.logger.warning(
            "No Kubernetes client available, returning empty resources"
        )
        return empty_resources

    try:
        return self._collect_flux_items()
    except Exception as e:
        error_type = type(e).__name__
        error_message = str(e)
        self.logger.error(
            f"Error getting resources from Flux CD: {error_type}",
            extra={
                "exception": error_message,
                "error_type": error_type,
                "namespace": self.authentication_config.namespace
                if hasattr(self, "authentication_config")
                else "unknown",
            },
        )
        # Return empty resources with error information to make the provider more robust
        return {**empty_resources, "error": f"{error_type}: {error_message}"}
class TestFluxcdProvider(unittest.TestCase):
    """
    Unit tests for the FluxCD provider.

    Every test runs against a provider whose ``_k8s_client`` attribute is
    replaced with a ``MagicMock`` (or ``None``), so no Kubernetes cluster is
    contacted.
    """

    def setUp(self):
        """
        Build a provider configured for the ``flux-system`` namespace and
        inject a mocked Kubernetes client into it.
        """
        self.context_manager = MagicMock()
        self.provider_id = "test-fluxcd-provider"
        self.config = ProviderConfig(
            authentication={
                "namespace": "flux-system",
            }
        )

        # Mock the Kubernetes client
        self.k8s_client_mock = MagicMock()

        # Create the provider with mocked dependencies
        # Use a simpler approach that doesn't rely on patching kubernetes
        self.provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=self.config,
        )
        self.provider._k8s_client = self.k8s_client_mock

    def test_validate_config(self):
        """
        validate_config() should expose the configured namespace on
        authentication_config.
        """
        self.provider.validate_config()
        self.assertEqual(self.provider.authentication_config.namespace, "flux-system")

    def test_api_server_with_hyphen(self):
        """
        A hyphenated ``api-server`` authentication key should end up on the
        ``api_server`` attribute, alongside the token.
        """
        config = ProviderConfig(
            authentication={
                "namespace": "flux-system",
                "api-server": "https://kubernetes.example.com",
                "token": "test-token",
            }
        )
        provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=config,
        )
        provider.validate_config()
        self.assertEqual(provider.authentication_config.api_server, "https://kubernetes.example.com")
        self.assertEqual(provider.authentication_config.token, "test-token")

    def test_list_git_repositories(self):
        """
        The private GitRepository lister should return the API payload and
        query the expected group/version/namespace/plural.
        """
        # Mock the response from the Kubernetes API
        self.k8s_client_mock.list_namespaced_custom_object.return_value = {
            "items": [
                {
                    "metadata": {
                        "name": "test-repo",
                        "namespace": "flux-system",
                        "uid": "test-uid",
                    },
                    "kind": "GitRepository",
                    "spec": {
                        "url": "https://github.com/test/repo",
                    },
                    "status": {
                        "conditions": [
                            {
                                "type": "Ready",
                                "status": "True",
                                "message": "Repository is ready",
                            }
                        ]
                    },
                }
            ]
        }

        # Call the method (name-mangled because it is double-underscore private)
        result = self.provider._FluxcdProvider__list_git_repositories()

        # Verify the result
        self.assertEqual(len(result["items"]), 1)
        self.assertEqual(result["items"][0]["metadata"]["name"], "test-repo")

        # Verify the API call
        self.k8s_client_mock.list_namespaced_custom_object.assert_called_once_with(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace="flux-system",
            plural="gitrepositories",
        )

    def test_pull_topology(self):
        """
        pull_topology() should emit one service per resource and link a
        Kustomization to the GitRepository named in its sourceRef.
        """
        # Mock the responses from the Kubernetes API.
        # side_effect is consumed one entry per call, so the order below
        # must match the order in which the provider lists resources.
        self.k8s_client_mock.list_namespaced_custom_object.side_effect = [
            # GitRepositories
            {
                "items": [
                    {
                        "metadata": {
                            "name": "test-repo",
                            "namespace": "flux-system",
                            "uid": "git-repo-uid",
                        },
                        "kind": "GitRepository",
                        "spec": {
                            "url": "https://github.com/test/repo",
                        },
                    }
                ]
            },
            # HelmRepositories
            {"items": []},
            # HelmCharts
            {"items": []},
            # OCIRepositories
            {"items": []},
            # Buckets
            {"items": []},
            # Kustomizations
            {
                "items": [
                    {
                        "metadata": {
                            "name": "test-kustomization",
                            "namespace": "flux-system",
                            "uid": "kustomization-uid",
                        },
                        "kind": "Kustomization",
                        "spec": {
                            "sourceRef": {
                                "kind": "GitRepository",
                                "name": "test-repo",
                            },
                        },
                    }
                ]
            },
            # HelmReleases
            {"items": []},
        ]

        # Call the method
        services, _ = self.provider.pull_topology()

        # Verify the result
        self.assertEqual(len(services), 2)

        # Find the GitRepository service
        git_repo_service = next(
            (s for s in services if s.service == "git-repo-uid"), None
        )
        self.assertIsNotNone(git_repo_service)
        self.assertEqual(git_repo_service.display_name, "GitRepository/test-repo")
        self.assertEqual(git_repo_service.repository, "https://github.com/test/repo")

        # Find the Kustomization service
        kustomization_service = next(
            (s for s in services if s.service == "kustomization-uid"), None
        )
        self.assertIsNotNone(kustomization_service)
        self.assertEqual(kustomization_service.display_name, "Kustomization/test-kustomization")
        self.assertEqual(kustomization_service.dependencies.get("git-repo-uid"), "source")

    def test_simulate_alert(self):
        """
        simulate_alert() should return a dict carrying all top-level alert
        keys plus a resource dict with name, kind and namespace.
        """
        alert = FluxcdProvider.simulate_alert()

        # Verify the alert structure
        self.assertIsInstance(alert, dict)
        self.assertIn("id", alert)
        self.assertIn("name", alert)
        self.assertIn("description", alert)
        self.assertIn("status", alert)
        self.assertIn("severity", alert)
        self.assertIn("source", alert)
        self.assertIn("resource", alert)
        self.assertIn("timestamp", alert)

        # Verify the resource structure
        resource = alert["resource"]
        self.assertIn("name", resource)
        self.assertIn("kind", resource)
        self.assertIn("namespace", resource)

    def test_get_fluxcd_resources(self):
        """
        get_fluxcd_resources() should group the listed items under their
        per-type keys.
        """
        # Mock the responses from the Kubernetes API (one entry per list
        # call, in the order the provider issues them)
        self.k8s_client_mock.list_namespaced_custom_object.side_effect = [
            # GitRepositories
            {
                "items": [
                    {
                        "metadata": {
                            "name": "test-repo",
                            "namespace": "flux-system",
                            "uid": "git-repo-uid",
                        },
                        "kind": "GitRepository",
                        "spec": {
                            "url": "https://github.com/test/repo",
                        },
                    }
                ]
            },
            # HelmRepositories
            {"items": []},
            # HelmCharts
            {"items": []},
            # OCIRepositories
            {"items": []},
            # Buckets
            {"items": []},
            # Kustomizations
            {
                "items": [
                    {
                        "metadata": {
                            "name": "test-kustomization",
                            "namespace": "flux-system",
                            "uid": "kustomization-uid",
                        },
                        "kind": "Kustomization",
                        "spec": {
                            "sourceRef": {
                                "kind": "GitRepository",
                                "name": "test-repo",
                            },
                        },
                    }
                ]
            },
            # HelmReleases
            {"items": []},
        ]

        # Call the method
        resources = self.provider.get_fluxcd_resources()

        # Verify the result
        self.assertIn("git_repositories", resources)
        self.assertIn("kustomizations", resources)
        self.assertEqual(len(resources["git_repositories"]), 1)
        self.assertEqual(len(resources["kustomizations"]), 1)
        self.assertEqual(resources["git_repositories"][0]["metadata"]["name"], "test-repo")
        self.assertEqual(resources["kustomizations"][0]["metadata"]["name"], "test-kustomization")

    def test_no_kubernetes_cluster(self):
        """
        With ``_k8s_client`` set to None, every public entry point should
        return its empty result instead of raising.
        """
        # Create a provider with no Kubernetes client
        provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=self.config,
        )
        provider._k8s_client = None

        # Test pull_topology
        services, metadata = provider.pull_topology()
        self.assertEqual(len(services), 0)
        self.assertEqual(metadata, {})

        # Test _get_alerts
        alerts = provider._get_alerts()
        self.assertEqual(len(alerts), 0)

        # Test validate_scopes
        scopes = provider.validate_scopes()
        self.assertEqual(scopes["authenticated"], "No Kubernetes cluster available")

        # Test get_fluxcd_resources
        resources = provider.get_fluxcd_resources()
        self.assertEqual(resources, {
            "git_repositories": [],
            "helm_repositories": [],
            "helm_charts": [],
            "oci_repositories": [],
            "buckets": [],
            "kustomizations": [],
            "helm_releases": []
        })

    def test_flux_not_installed(self):
        """
        validate_scopes() should report that Flux CD is missing when the
        installation check returns False.
        """
        # Create a provider with a mocked Kubernetes client
        provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=self.config,
        )

        # Mock the k8s_client property to return a mock client (not None)
        # This is important - we need a non-None client to reach the Flux CD check
        provider._k8s_client = MagicMock()

        # Mock the __check_flux_installed method to return False
        # This simulates Flux CD not being installed
        provider._FluxcdProvider__check_flux_installed = MagicMock(return_value=False)

        # Test validate_scopes
        scopes = provider.validate_scopes()
        self.assertEqual(scopes["authenticated"], "Flux CD is not installed in the cluster")

    def test_check_flux_health(self):
        """
        Exercise check_flux_health() with no client, then with canned
        healthy and unhealthy results.

        NOTE(review): only the "no Kubernetes client" part runs the real
        implementation. In the second half ``provider.check_flux_health`` is
        itself replaced by a MagicMock, so those assertions verify the
        mock's return value rather than the provider's logic, and the
        AppsV1Api/deployment mocks built below are never consulted.
        """
        # Create a provider with a mocked Kubernetes client
        provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=self.config,
        )

        # Mock the k8s_client property to return None
        provider._k8s_client = None

        # Test check_flux_health with no Kubernetes client
        health = provider.check_flux_health()
        self.assertFalse(health["healthy"])
        self.assertEqual(health["error"], "No Kubernetes client available")

        # Create a new provider instance for the second part of the test
        provider = FluxcdProvider(
            context_manager=self.context_manager,
            provider_id=self.provider_id,
            config=self.config,
        )

        # Create a mock for the AppsV1Api
        mock_apps_v1 = MagicMock()
        mock_deployment = MagicMock()
        mock_deployment.metadata.name = "source-controller"
        mock_deployment.spec.replicas = 1
        mock_deployment.status.available_replicas = 1
        mock_deployments = MagicMock()
        mock_deployments.items = [mock_deployment]
        mock_apps_v1.list_namespaced_deployment.return_value = mock_deployments

        # Set up the k8s_client mock
        provider._k8s_client = MagicMock()

        # Mock the ApiClient creation
        with patch("kubernetes.client.ApiClient", return_value=MagicMock()):
            # Mock the AppsV1Api creation
            with patch("kubernetes.client.AppsV1Api", return_value=mock_apps_v1):
                # Directly set the check_flux_health method to return a known result
                provider.check_flux_health = MagicMock(return_value={
                    "healthy": True,
                    "components": {
                        "source-controller": {
                            "healthy": True,
                            "desired_replicas": 1,
                            "available_replicas": 1
                        }
                    }
                })

                # Test check_flux_health with a healthy deployment
                health = provider.check_flux_health()
                self.assertTrue(health["healthy"])
                self.assertEqual(len(health["components"]), 1)
                self.assertTrue(health["components"]["source-controller"]["healthy"])

                # Test check_flux_health with an unhealthy deployment
                # Update the mock to return an unhealthy result
                provider.check_flux_health = MagicMock(return_value={
                    "healthy": False,
                    "components": {
                        "source-controller": {
                            "healthy": False,
                            "desired_replicas": 1,
                            "available_replicas": 0
                        }
                    }
                })

                health = provider.check_flux_health()
                self.assertFalse(health["healthy"])
                self.assertEqual(len(health["components"]), 1)
                self.assertFalse(health["components"]["source-controller"]["healthy"])

    def test_has_health_report(self):
        """
        has_health_report() should be truthy for the FluxCD provider class.
        """
        self.assertTrue(FluxcdProvider.has_health_report())
"projects/987654/alertPolicies/5xx-policy/conditions/1", "displayName": "5XX Error Rate > 5%", "conditionThreshold": { "filter": 'metric.type="appengine.googleapis.com/http/server/response_count" resource.type="gae_app"', "comparison": "COMPARISON_GT", "thresholdValue": 5.0, "duration": "300s", "trigger": {"count": 1}, }, }, "condition_name": "5XX Error Rate > 5%", "threshold_value": "5.0", }, }, "parameters": {}, }, "high_memory_usage": { "payload": { "version": "1.0", "incident": { "incident_id": "mem-234", "scoping_project_id": "prod-web-cluster", "scoping_project_number": 987654, "url": "https://console.cloud.google.com/monitoring/alerting/incidents/234", "started_at": 0, "ended_at": 0, "state": "OPEN", "summary": "Memory usage exceeds 90% on production servers", "observed_value": "92.3", "resource": { "type": "gce_instance", "labels": {"instance_id": "prod-web-1"}, }, "resource_type_display_name": "GCE VM Instance", "resource_id": "prod-web-1", "resource_display_name": "Production Web Server 1", "resource_name": "projects/987654/instances/prod-web-1", "metric": { "type": "compute.googleapis.com/instance/memory/utilization", "displayName": "Memory Utilization", "labels": {}, }, "metadata": { "system_labels": {"severity": "warning"}, "user_labels": {"environment": "production"}, }, "policy_name": "projects/987654/alertPolicies/memory-policy", "policy_user_labels": {"team": "platform"}, "documentation": { "subject": "High memory usage detected", }, "condition": { "name": "projects/987654/alertPolicies/memory-policy/conditions/1", "displayName": "Memory Usage > 90%", "conditionThreshold": { "filter": 'metric.type="compute.googleapis.com/instance/memory/utilization"', "comparison": "COMPARISON_GT", "thresholdValue": 90.0, "duration": "300s", "trigger": {"count": 1}, }, }, "condition_name": "Memory Usage > 90%", "threshold_value": "90.0", }, }, "parameters": {}, }, "database_latency": { "payload": { "version": "1.0", "incident": { "incident_id": "db-345", 
"scoping_project_id": "prod-db-cluster", "scoping_project_number": 987654, "url": "https://console.cloud.google.com/monitoring/alerting/incidents/345", "started_at": 0, "ended_at": 0, "state": "OPEN", "summary": "Database query latency above threshold", "observed_value": "2.5", "resource": { "type": "cloudsql_database", "labels": {"database_id": "prod-mysql-main"}, }, "resource_type_display_name": "Cloud SQL Database", "resource_id": "prod-mysql-main", "resource_display_name": "Production MySQL Main", "resource_name": "projects/987654/databases/prod-mysql-main", "metric": { "type": "cloudsql.googleapis.com/database/mysql/query_latency", "displayName": "MySQL Query Latency", "labels": {}, }, "metadata": { "system_labels": {"severity": "warning"}, "user_labels": {"environment": "production"}, }, "policy_name": "projects/987654/alertPolicies/db-latency", "policy_user_labels": {"team": "database"}, "documentation": { "subject": "High database query latency detected", }, "condition": { "name": "projects/987654/alertPolicies/db-latency/conditions/1", "displayName": "Query Latency > 2s", "conditionThreshold": { "filter": 'metric.type="cloudsql.googleapis.com/database/mysql/query_latency"', "comparison": "COMPARISON_GT", "thresholdValue": 2.0, "duration": "300s", "trigger": {"count": 1}, }, }, "condition_name": "Query Latency > 2s", "threshold_value": "2.0", }, }, "parameters": {}, }, } ================================================ FILE: keep/providers/gcpmonitoring_provider/gcpmonitoring_provider.py ================================================ import dataclasses import datetime import json import logging import google.api_core import google.api_core.exceptions import google.cloud.logging import pydantic from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin from keep.providers.models.provider_config import 
ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod from keep.providers.providers_factory import ProvidersFactory class LogEntry(pydantic.BaseModel): timestamp: datetime.datetime severity: str payload: dict | None http_request: dict | None payload_exists: bool = False http_request_exists: bool = False @pydantic.validator("severity", pre=True) def validate_severity(cls, severity): if severity is None: return "INFO" return severity @pydantic.dataclasses.dataclass class GcpmonitoringProviderAuthConfig: service_account_json: str = dataclasses.field( metadata={ "required": True, "description": "A service account JSON with logging viewer role", "sensitive": True, "type": "file", "name": "service_account_json", "file_type": "application/json", # this is used to filter the file type in the UI } ) class GcpmonitoringProvider(BaseProvider, ProviderHealthMixin): """Get alerts from GCP Monitoring into Keep.""" webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from GCP Monitoring to Keep, Use the following webhook url to configure GCP Monitoring send alerts to Keep: 1. In GCP Monitoring, go to Notification Channels. 2. In the Webhooks section click "ADD NEW". 3. In the Endpoint URL, configure: - **Endpoint URL**: {keep_webhook_api_url} - **Display Name**: keep-gcpmonitoring-webhook-integration 4. Click on "Use HTTP Basic Auth" - **Auth Username**: api_key - **Auth Password**: {api_key} 5. Click on "Save". 6. Go the the Alert Policy that you want to send to Keep and click on "Edit". 7. Go to "Notifications and name" 8. Click on "Notification Channels" and select the "keep-gcpmonitoring-webhook-integration" that you created in step 3. 9. Click on "SAVE POLICY". 
""" # https://github.com/hashicorp/terraform-provider-google/blob/main/google/services/monitoring/resource_monitoring_alert_policy.go#L963 SEVERITIES_MAP = { "CRITICAL": AlertSeverity.CRITICAL, "ERROR": AlertSeverity.HIGH, "WARNING": AlertSeverity.WARNING, } PROVIDER_CATEGORY = ["Monitoring", "Cloud Infrastructure"] STATUS_MAP = { "CLOSED": AlertStatus.RESOLVED, "OPEN": AlertStatus.FIRING, } PROVIDER_DISPLAY_NAME = "GCP Monitoring" FINGERPRINT_FIELDS = ["incident_id"] PROVIDER_SCOPES = [ ProviderScope( name="roles/logs.viewer", description="Read access to GCP logging", mandatory=True, alias="Logs Viewer", ), ] PROVIDER_METHODS = [ ProviderMethod( name="query", func_name="execute_query", description="Query the GCP logs", type="view", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._service_account_data = json.loads( self.authentication_config.service_account_json ) self._client = None def validate_config(self): self.authentication_config = GcpmonitoringProviderAuthConfig( **self.config.authentication ) def dispose(self): pass def validate_scopes(self) -> dict[str, bool | str]: scopes = {} # try initializing the client to validate the scopes try: self.client.list_entries(max_results=1) scopes["roles/logs.viewer"] = True except google.api_core.exceptions.PermissionDenied: scopes["roles/logs.viewer"] = ( "Permission denied, make sure IAM permissions are set correctly" ) except Exception as e: scopes["roles/logs.viewer"] = str(e) return scopes @property def client(self) -> google.cloud.logging.Client: if self._client is None: self._client = self.__generate_client() return self._client def __generate_client(self) -> google.cloud.logging.Client: if not self._client: self._client = google.cloud.logging.Client.from_service_account_info( self._service_account_data ) return self._client def execute_query(self, query: str, **kwargs): return self._query(query, 
**kwargs) def _query( self, filter: str, timedelta_in_days=1, page_size=1000, raw="true", project="", **kwargs, ): raw = raw == "true" self.logger.info( f"Querying GCP Monitoring with filter: {filter} and timedelta_in_days: {timedelta_in_days}" ) if "timestamp" not in filter: start_time = ( datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=timedelta_in_days) ).strftime("%Y-%m-%dT%H:%M:%SZ") filter = f'{filter} timestamp>="{start_time}"' if project: self.client.project = project entries_iterator = self.client.list_entries(filter_=filter, page_size=page_size) entries = [] for entry in entries_iterator: if raw: entries.append(entry) else: try: log_entry = LogEntry( timestamp=entry.timestamp, severity=entry.severity, payload=entry.payload, http_request=entry.http_request, payload_exists=entry.payload is not None, http_request_exists=entry.http_request is not None, ) entries.append(log_entry) except Exception: self.logger.error("Error parsing log entry") continue self.logger.info(f"Found {len(entries)} entries") return entries @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: incident = event.get("incident", {}) description = incident.pop("summary", "") status = GcpmonitoringProvider.STATUS_MAP.get( incident.pop("state", "").upper(), AlertStatus.FIRING ) url = incident.pop("url", "") documentation = incident.pop("documentation", {}) if isinstance(documentation, dict): name = ( documentation.get("subject", description) or "GCPMontirong Alert (No subject)" ) content = documentation.get("content", "") else: name = "Test notification" content = documentation incident_id = incident.get("incident_id", "") # Get the severity if "severity" in incident: severity = GcpmonitoringProvider.SEVERITIES_MAP.get( incident.pop("severity").upper(), AlertSeverity.INFO ) # In some cases (this is from the terraform provider) the severity is in the policy_user_labels else: severity = 
if __name__ == "__main__":
    # Manual smoke test: build the provider from a local service account
    # file and print the entries returned for a sample filter.
    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(tenant_id="singletenant", workflow_id="test")

    # Get these from a secure source or environment variables
    with open("sa.json") as sa_file:
        service_account_data = sa_file.read()

    config = {"authentication": {"service_account_json": service_account_data}}
    provider = ProvidersFactory.get_provider(
        context_manager,
        provider_id="gcp-demo",
        provider_type="gcpmonitoring",
        provider_config=config,
    )
    entries = provider._query(
        filter='resource.type = "cloud_run_revision"',
        raw=False,
    )
    print(entries)
def _query(
    self,
    prompt,
    model="gemini-pro",
    max_tokens=1024,
    structured_output_format=None,
):
    """
    Send a prompt to Gemini and return the generated content.

    Args:
        prompt: The user prompt.
        model (str): Name of the Gemini model to use.
        max_tokens (int): Upper bound for generated tokens.
        structured_output_format (dict | None): When given, its
            "json_schema" entry is embedded in the prompt and the model is
            instructed to answer with JSON only.

    Returns:
        dict: ``{"response": content}``. ``content`` is the decoded JSON
        value when the reply is valid JSON, otherwise the reply text as is.
    """
    genai.configure(api_key=self.authentication_config.api_key)
    # Separate name so the `model` argument is not overwritten by the object.
    generative_model = genai.GenerativeModel(model)

    # Prepare system prompt for structured output if needed
    if structured_output_format:
        schema = structured_output_format.get("json_schema", {})
        prompt = (
            f"You must respond with valid JSON that matches this schema: {json.dumps(schema)}\n"
            f"Your response must be parseable JSON and nothing else.\n\n"
            f"User query: {prompt}"
        )

    response = generative_model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            max_output_tokens=max_tokens,
        ),
    )
    content = response.text

    # The reply itself is always tried first (unchanged behaviour). When
    # structured output was requested, a reply wrapped in a Markdown code
    # fence (```json ... ```) is additionally tried without the fence.
    candidates = [content]
    if structured_output_format and isinstance(content, str):
        unfenced = content.strip()
        if (
            len(unfenced) >= 6
            and unfenced.startswith("```")
            and unfenced.endswith("```")
        ):
            unfenced = unfenced[3:-3]
            tag, newline, remainder = unfenced.partition("\n")
            # Drop an optional language tag such as "json" on the first line.
            if newline and (not tag.strip() or tag.strip().isalnum()):
                unfenced = remainder
            candidates.append(unfenced.strip())

    for candidate in candidates:
        try:
            content = json.loads(candidate)
        except Exception:
            continue
        break

    return {
        "response": content,
    }
@pydantic.dataclasses.dataclass
class GithubProviderAuthConfig:
    """
    GithubProviderAuthConfig is a class that represents the authentication configuration for the GithubProvider.
    """

    # GitHub access token handed to the `Github` client constructor.
    # The annotation allows None and GithubProvider falls back to an
    # unauthenticated `Github()` client when the value is falsy, yet the
    # field has no default and its metadata marks it as required.
    # NOTE(review): confirm whether an anonymous client is meant to be
    # reachable from configuration.
    access_token: str | None = dataclasses.field(
        metadata={
            "required": True,
            "description": "GitHub Access Token",
            "sensitive": True,
        }
    )
""" PROVIDER_DISPLAY_NAME = "GitHub" PROVIDER_CATEGORY = ["Developer Tools"] PROVIDER_METHODS = [ ProviderMethod( name="get_last_commits", func_name="get_last_commits", description="Get the N last commits from a GitHub repository", type="view", ), ProviderMethod( name="get_last_releases", func_name="get_last_releases", description="Get the N last releases and their changelog from a GitHub repository", type="view", ), ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.client = self.__generate_client() def get_last_commits(self, repository: str, n: int = 10): """ Get the last N commits from a GitHub repository. Args: repository (str): The GitHub repository to get the commits from. n (int): The number of commits to get. """ self.logger.info(f"Getting last {n} commits from {repository}") # get only the name so if the repo is # https://github.com/keephq/keep -> keephq/keep if repository.startswith("https://github.com"): repository = repository.split("https://github.com/")[1] repo = self.client.get_repo(repository) commits = repo.get_commits() self.logger.info(f"Found {commits.totalCount} commits") commits = [commit.raw_data for commit in commits[:n]] return commits def get_last_releases(self, repository: str, n: int = 10): """ Get the last N releases from a GitHub repository. Args: repository (str): The GitHub repository to get the releases from. n (int): The number of releases to get. 
""" self.logger.info(f"Getting last {n} releases from {repository}") repo = self.client.get_repo(repository) releases = repo.get_releases() self.logger.info(f"Found {releases.totalCount} releases") return [release.raw_data for release in releases[:n]] def __generate_client(self): # Should get an access token once we have a real use case for GitHub provider if self.authentication_config.access_token: client = Github(self.authentication_config.access_token) else: client = Github() return client def dispose(self): """ Dispose of the provider. """ pass def validate_config(self): self.authentication_config = GithubProviderAuthConfig( **self.config.authentication ) def _notify(self, **kwargs): """ Notify the provider. Args: run_action (str): The action to run. workflow (str): The workflow to run. repo_name (str): The repository name. repo_owner (str): The repository owner. ref (str): The ref to use. inputs (dict): The inputs to use. """ if "run_action" in kwargs: workflow_name = kwargs.get("workflow") repo_name = kwargs.get("repo_name") repo_owner = kwargs.get("repo_owner") ref = kwargs.get("ref", "main") inputs = kwargs.get("inputs", {}) # Initialize the GitHub client github_client = self.__generate_client() # Get the repository repo = github_client.get_repo(f"{repo_owner}/{repo_name}") # Trigger the workflow workflow = repo.get_workflow(workflow_name) run = workflow.create_dispatch(ref, inputs) return run class GithubStarsProvider(GithubProvider): """ GithubStarsProvider is a class that provides a way to read stars from a GitHub repository. 
""" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def _query( self, repository: str, previous_stars_count: int = 0, last_stargazer: str = "", **kwargs: dict, ) -> dict: repo = self.client.get_repo(repository) stars_count = repo.stargazers_count new_stargazers = [] if not previous_stars_count: previous_stars_count = 0 self.logger.debug(f"Previous stargazers: {previous_stars_count}") self.logger.debug(f"New stargazers: {stars_count - int(previous_stars_count)}") stargazers_with_dates = [] # If we have the last stargazer login name, use it as index if last_stargazer: stargazers_with_dates = list(repo.get_stargazers_with_dates()) last_stargazer_index = next( ( i for i, item in enumerate(stargazers_with_dates) if item.user.login == last_stargazer ), -1, ) if last_stargazer_index == -1: stargazers_with_dates = [] else: stargazers_with_dates = stargazers_with_dates[ last_stargazer_index + 1 : ] # If we dont, use the previous stars count as an index elif previous_stars_count and int(previous_stars_count) > 0: stargazers_with_dates = list(repo.get_stargazers_with_dates())[ int(previous_stars_count) : ] # Iterate new stargazers if there are any for stargazer in stargazers_with_dates: new_stargazers.append( { "username": stargazer.user.login, "starred_at": str(stargazer.starred_at), } ) self.logger.debug(f"New stargazer: {stargazer.user.login}") # Save last stargazer name so we can use it next iteration last_stargazer = ( new_stargazers[-1]["username"] if len(new_stargazers) >= 1 else last_stargazer ) return { "stars": stars_count, "new_stargazers": new_stargazers, "new_stargazers_count": len(new_stargazers), "last_stargazer": last_stargazer, } if __name__ == "__main__": import os context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) github_provider = GithubProvider( context_manager, "test", ProviderConfig(authentication={"access_token": 
@pydantic.dataclasses.dataclass
class GithubWorkflowsProviderAuthConfig:
    """
    GithubWorkflowsProviderAuthConfig is a class that represents the authentication configuration for the GithubWorkflowsProvider.
    """

    # Personal access token; GithubWorkflowsProvider._query copies this
    # value into the "Authorization" request header.
    # Flagged as sensitive in the field metadata.
    personal_access_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Github Personal Access Token",
            "sensitive": True,
        }
    )
""" pass def _notify( self, github_url: str = "", github_method: str = "", **kwargs ): url = github_url method = github_method.upper() result = self.query(url=url, method=method, **kwargs) response_status = result["status"] self.logger.debug( f"Sent {method} request to {url} with status {response_status}", extra={ "body": result["body"], "headers": result["headers"], "status_code": result["status"], }, ) return result def _query(self, url: str, method: str, **kwargs: dict): headers = { "Accept": "application/vnd.github+json", "Authorization": self.authentication_config.personal_access_token, "X--GitHub-Api-Version": "2022-11-28", } if method == "GET": response = requests.get(url, headers=headers, **kwargs) elif method == "POST": response = requests.post(url, headers=headers, **kwargs) elif method == "PUT": response = requests.put(url, headers=headers, **kwargs) elif method == "DELETE": response = requests.delete(url, headers=headers, **kwargs) else: raise Exception(f"Unsupported HTTP method: {method}") result = { "status": response.ok, "status_code": response.status_code, "method": method, "url": url, "headers": headers, } print(result) try: body = response.json() except JSONDecodeError: body = response.text result["body"] = body return result if __name__ == "__main__": import os github_personal_access_token = os.environ.get("GITHUB_TOKEN") or "" context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) github_workflows_provider = GithubWorkflowsProvider( context_manager, "test", ProviderConfig( authentication={"personal_access_token": github_personal_access_token} ), ) result = github_workflows_provider.notify( github_url="https://api.github.com/repos/TakshPanchal/keep/actions/workflows", github_method="get", ) print(result) ================================================ FILE: keep/providers/gitlab_provider/__init__.py ================================================ ================================================ FILE: 
@pydantic.dataclasses.dataclass
class GitlabProviderAuthConfig:
    """GitLab authentication configuration."""

    # Base URL of the GitLab instance. GitlabProvider.gitlab_host strips a
    # trailing "/" from it and uses the result as the prefix of every
    # "/api/v4/..." request.
    host: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "GitLab Host",
            "sensitive": False,
            "hint": "http://example.gitlab.com",
            "validation": "any_http_url"
        }
    )

    # Personal access token, sent by the provider as
    # "Authorization: Bearer <token>". The provider's scope validation
    # expects the token to carry the "api" scope.
    personal_access_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "GitLab Personal Access Token",
            "sensitive": True,
            "documentation_url": "https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html",
        }
    )
""" headers = { "Accept": "application/json", "Authorization": f"Bearer {self.authentication_config.personal_access_token}", } # first, validate user/api token are correct: resp = requests.get( f"{self.gitlab_host}/api/v4/personal_access_tokens/self", headers=headers, verify=False, ) try: resp.raise_for_status() scopes = { "api": ("Missing api scope", True)["api" in resp.json()["scopes"]] } except HTTPError as e: scopes = {"api": str(e)} return scopes def validate_config(self): self.authentication_config = GitlabProviderAuthConfig( **self.config.authentication ) @property def gitlab_host(self): # if not the first time, return the cached host if self._host: return self._host.rstrip("/") # if the user explicitly supplied a host with http/https, use it if self.authentication_config.host.startswith( "http://" ) or self.authentication_config.host.startswith("https://"): self._host = self.authentication_config.host return self.authentication_config.host.rstrip("/") # otherwise, try to use https: try: requests.get( f"https://{self.authentication_config.host}", verify=False, ) self.logger.debug("Using https") self._host = f"https://{self.authentication_config.host}" return self._host.rstrip("/") except requests.exceptions.SSLError: self.logger.debug("Using http") self._host = f"http://{self.authentication_config.host}" return self._host.rstrip("/") # should happen only if the user supplied invalid host, so just let validate_config fail except Exception: return self.authentication_config.host.rstrip("/") def dispose(self): """ No need to dispose of anything, so just do nothing. """ pass def __get_auth_header(self): """ Helper method to build the auth payload for gitlab api requests. 
""" return { "Authorization": f"Bearer {self.authentication_config.personal_access_token}" } # @staticmethod def __build_params_from_kwargs(self, kwargs: dict): params = dict() for param in kwargs: if isinstance(kwargs[param], list): params[param] = ",".join(kwargs[param]) else: params[param] = kwargs[param] return params def _notify( self, id: str, title: str, description: str = "", labels: str = "", issue_type: str = "issue", **kwargs: dict, ): id = urllib.parse.quote(id, safe="") print(id) params = self.__build_params_from_kwargs( kwargs={ **kwargs, "title": title, "description": description, "labels": labels, "issue_type": issue_type, } ) print(self.gitlab_host) resp = requests.post( f"{self.gitlab_host}/api/v4/projects/{id}/issues", headers=self.__get_auth_header(), params=params, ) try: resp.raise_for_status() except HTTPError as e: raise Exception(f"Failed to create issue: {str(e)}") return resp.json() if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os gitlab_pat = os.environ.get("GITLAB_PAT") gitlab_host = os.environ.get("GITLAB_HOST") # Initalize the provider and provider config config = ProviderConfig( description="GitLab Provider", authentication={ "personal_access_token": gitlab_pat, "host": gitlab_host, }, ) provider = GitlabProvider(context_manager, provider_id="gitlab", config=config) scopes = provider.validate_scopes() # Create ticket provider.notify( board_name="KEEP board", issue_type="Task", summary="Test Alert", description="Test Alert Description", ) ================================================ FILE: keep/providers/gitlabpipelines_provider/__init__.py ================================================ ================================================ FILE: keep/providers/gitlabpipelines_provider/gitlabpipelines_provider.py 
@pydantic.dataclasses.dataclass
class GitlabpipelinesProviderAuthConfig:
    """
    GitlabpipelinesProviderAuthConfig is a class that represents the authentication configuration for the GitlabPipelinesProvider.
    """

    # GitLab access token; GitlabpipelinesProvider._query sends it in the
    # "PRIVATE-TOKEN" request header. Flagged as sensitive in the metadata.
    access_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "GitLab Access Token",
            "sensitive": True,
        }
    )
""" pass def _notify(self, gitlab_url: str = "", gitlab_method: str = "", **kwargs): url = gitlab_url method = gitlab_method.upper() result = self.query(url=url, method=method, **kwargs) response_status = result["status"] print(f"Sent {method} request to {url} with status {response_status}") self.logger.debug( f"Sent {method} request to {url} with status {response_status}", extra={ "body": result["body"], "headers": result["headers"], "status_code": result["status"], }, ) return result def _query(self, url: str, method: str, **kwargs: dict): headers = {"PRIVATE-TOKEN": self.authentication_config.access_token} if method == "GET": response = requests.get(url, headers=headers, **kwargs) elif method == "POST": response = requests.post(url, headers=headers, **kwargs) elif method == "PUT": response = requests.put(url, headers=headers, **kwargs) elif method == "DELETE": response = requests.delete(url, headers=headers, **kwargs) else: raise Exception(f"Unsupported HTTP method: {method}") result = { "status": response.ok, "status_code": response.status_code, "method": method, "url": url, "headers": headers, } try: body = response.json() except JSONDecodeError: body = response.text result["body"] = body return result if __name__ == "__main__": import os gitlab_private_access_token = os.environ.get("GITLAB_PAT") or "" context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) gitlab_pipelines_provider = GitlabpipelinesProvider( context_manager, "test", ProviderConfig(authentication={"access_token": gitlab_private_access_token}), ) result = gitlab_pipelines_provider.notify() print(result) ================================================ FILE: keep/providers/gke_provider/__init__.py ================================================ ================================================ FILE: keep/providers/gke_provider/gke_provider.py ================================================ import dataclasses import json import logging import pydantic from 
@pydantic.dataclasses.dataclass
class GkeProviderAuthConfig:
    """GKE authentication configuration."""

    # Service account key as a JSON document. GkeProvider parses it with
    # json.loads and reads "project_id" from it; a value that cannot be
    # parsed is reported later by validate_scopes.
    service_account_json: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "The service account JSON with container.viewer role",
            "sensitive": True,
            "type": "file",
            "name": "service_account_json",
            "file_type": "application/json",
        }
    )
    # Cluster name; last segment of the resource path
    # "projects/<project>/locations/<region>/clusters/<cluster_name>"
    # that the provider passes to the GKE API.
    cluster_name: str = dataclasses.field(
        metadata={"required": True, "description": "The name of the cluster"}
    )
    # Used as the "locations/<region>" segment of the same resource path.
    # Optional; defaults to "us-central1".
    region: str = dataclasses.field(
        default="us-central1",
        metadata={
            "required": False,
            "description": "The GKE cluster region",
            "hint": "us-central1",
        },
    )
ProviderScope( name="pods:list", description="Required to list pods", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="List Pods", ), ProviderScope( name="pods:get", description="Required to get pod details", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Get Pod Details", ), ProviderScope( name="pods:logs", description="Required to get pod logs", documentation_url="https://kubernetes.io/docs/reference/access-authn-authz/rbac/", mandatory=False, alias="Get Pod Logs", ), ] PROVIDER_METHODS = [ ProviderMethod( name="List Pods", func_name="get_pods", scopes=["pods:list", "pods:get"], description="List all pods in a namespace or across all namespaces", type="view", ), ProviderMethod( name="List Persistent Volume Claims", func_name="get_pvc", scopes=["pods:list"], description="List all PVCs in a namespace or across all namespaces", type="view", ), ProviderMethod( name="Get Node Pressure", func_name="get_node_pressure", scopes=["pods:list"], description="Get pressure metrics for all nodes", type="view", ), ProviderMethod( name="Execute Command", func_name="exec_command", scopes=["pods:exec"], description="Execute a command in a pod", type="action", ), ProviderMethod( name="Restart Pod", func_name="restart_pod", scopes=["pods:delete"], description="Restart a pod by deleting it", type="action", ), ProviderMethod( name="Get Deployment", func_name="get_deployment", scopes=["pods:list"], description="Get deployment information", type="view", ), ProviderMethod( name="Scale Deployment", func_name="scale_deployment", scopes=["deployments:scale"], description="Scale a deployment to specified replicas", type="action", ), ProviderMethod( name="Get Pod Logs", func_name="get_pod_logs", scopes=["pods:logs"], description="Get logs from a pod", type="view", ), ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): 
super().__init__(context_manager, provider_id, config) try: self._service_account_data = json.loads( self.authentication_config.service_account_json ) self._project_id = self._service_account_data.get("project_id") except Exception: self._service_account_data = None self._project_id = None self._region = self.authentication_config.region self._cluster_name = self.authentication_config.cluster_name self._client = None def dispose(self): """Clean up any resources.""" if self._client: self._client.api_client.rest_client.pool_manager.clear() def validate_config(self): """Validate the provided configuration.""" self.authentication_config = GkeProviderAuthConfig(**self.config.authentication) def validate_scopes(self) -> dict[str, bool | str]: """Validate if the service account has the required permissions.""" if not self._service_account_data or not self._project_id: return {"roles/container.viewer": "Service account JSON is invalid"} scopes = {scope.name: False for scope in self.PROVIDER_SCOPES} try: # Test GKE API permissions credentials = service_account.Credentials.from_service_account_info( self._service_account_data, scopes=["https://www.googleapis.com/auth/cloud-platform"], ) auth_request = requests.Request() credentials.refresh(auth_request) gke_client = ClusterManagerClient(credentials=credentials) try: cluster_name = f"projects/{self._project_id}/locations/{self._region}/clusters/{self._cluster_name}" gke_client.get_cluster(name=cluster_name) scopes["roles/container.viewer"] = True except Exception as e: if "404" in str(e): scopes["roles/container.viewer"] = ( "Cluster not found (404 from GKE), please check the cluster name and region" ) elif "403" in str(e): scopes["roles/container.viewer"] = ( "Permission denied (403 from GKE)" ) else: scopes["roles/container.viewer"] = str(e) # Test Kubernetes API permissions try: k8s_client = self.client # Test pods:list and pods:get try: k8s_client.list_pod_for_all_namespaces(limit=1) scopes["pods:list"] = True 
scopes["pods:get"] = True except Exception as e: scopes["pods:list"] = str(e) scopes["pods:get"] = str(e) # Test pods:logs try: pods = k8s_client.list_pod_for_all_namespaces(limit=1) if pods.items: pod = pods.items[0] k8s_client.read_namespaced_pod_log( name=pod.metadata.name, namespace=pod.metadata.namespace, container=pod.spec.containers[0].name, limit_bytes=100, ) scopes["pods:logs"] = True except Exception as e: scopes["pods:logs"] = str(e) # Test pods:delete try: if pods.items: pod = pods.items[0] k8s_client.delete_namespaced_pod.__doc__ scopes["pods:delete"] = True except Exception as e: scopes["pods:delete"] = str(e) # Test deployments:scale apps_v1 = client.AppsV1Api() try: deployments = apps_v1.list_deployment_for_all_namespaces(limit=1) if deployments.items: apps_v1.patch_namespaced_deployment_scale.__doc__ scopes["deployments:scale"] = True except Exception as e: scopes["deployments:scale"] = str(e) except Exception as e: for scope in scopes: if scope != "roles/container.viewer": scopes[scope] = str(e) except Exception as e: for scope in scopes: scopes[scope] = str(e) return scopes @property def client(self): """Get or create the Kubernetes client for GKE.""" if self._client is None: self._client = self.__generate_client() return self._client def get_pods(self, namespace: str = None) -> list: """List all pods in a namespace or across all namespaces.""" if namespace: self.logger.info(f"Listing pods in namespace {namespace}") pods = self.client.list_namespaced_pod(namespace=namespace) else: self.logger.info("Listing pods across all namespaces") pods = self.client.list_pod_for_all_namespaces() return [pod.to_dict() for pod in pods.items] def get_pvc(self, namespace: str = None) -> list: """List all PVCs in a namespace or across all namespaces.""" if namespace: self.logger.info(f"Listing PVCs in namespace {namespace}") pvcs = self.client.list_namespaced_persistent_volume_claim( namespace=namespace ) else: self.logger.info("Listing PVCs across all 
def get_node_pressure(self) -> list:
    """
    Get pressure metrics for all nodes.

    Returns:
        list: One dict per node with its ``name`` and the ``conditions``
        (as dicts) whose type is MemoryPressure, DiskPressure or
        PIDPressure. A node that reports no conditions gets an empty list.
    """
    self.logger.info("Getting node pressure metrics")
    nodes = self.client.list_node()

    pressure_types = ("MemoryPressure", "DiskPressure", "PIDPressure")
    node_pressures = []
    for node in nodes.items:
        # status / conditions can be None for a node that has not
        # reported yet; treat that as "no conditions" instead of failing.
        status = getattr(node, "status", None)
        conditions = (getattr(status, "conditions", None) or []) if status else []
        node_pressures.append(
            {
                "name": node.metadata.name,
                "conditions": [
                    condition.to_dict()
                    for condition in conditions
                    if condition.type in pressure_types
                ],
            }
        )
    return node_pressures
def scale_deployment(self, namespace: str, deployment_name: str, replicas: int):
    """
    Scale a deployment to the requested number of replicas.

    Args:
        namespace: Namespace that contains the deployment.
        deployment_name: Name of the deployment to scale.
        replicas: Desired replica count. ``0`` is accepted; only ``None`` is
            rejected.

    Returns:
        The response of ``patch_namespaced_deployment_scale``.

    Raises:
        ProviderException: If any of the three arguments is missing.
    """
    if not all([namespace, deployment_name, replicas is not None]):
        raise ProviderException(
            "namespace, deployment_name and replicas are required for scale_deployment"
        )

    # Reuse the ApiClient behind self.client. Accessing self.client builds the
    # GKE-configured client on first use, so the apps API talks to the GKE
    # cluster even when scaling is the first call made on this provider
    # (a bare AppsV1Api() would use whatever default configuration happens to
    # be loaded in the process at that moment).
    apps_v1 = client.AppsV1Api(api_client=self.client.api_client)

    self.logger.info(
        f"Scaling deployment {deployment_name} in namespace {namespace} to {replicas} replicas"
    )
    return apps_v1.patch_namespaced_deployment_scale(
        name=deployment_name,
        namespace=namespace,
        body={"spec": {"replicas": replicas}},
    )
gke_client.get_cluster(name=cluster_name) # Generate kubeconfig kubeconfig = { "apiVersion": "v1", "clusters": [ { "cluster": { "certificate-authority-data": cluster.master_auth.cluster_ca_certificate, "server": f"https://{cluster.endpoint}", }, "name": "gke_cluster", } ], "contexts": [ { "context": {"cluster": "gke_cluster", "user": "gke_user"}, "name": "gke_context", } ], "current-context": "gke_context", "kind": "Config", "users": [ { "name": "gke_user", "user": { "auth-provider": { "config": { "access-token": credentials.token, "cmd-args": "config config-helper --format=json", "cmd-path": "gcloud", "expiry-key": "token_expiry", "token-key": "access_token", }, "name": "gcp", } }, } ], } # Load kubeconfig config.load_kube_config_from_dict(config_dict=kubeconfig) return client.CoreV1Api() except Exception as e: raise ProviderException(f"Failed to generate GKE client: {e}") def _query(self, command_type: str, **kwargs: dict): """Query GKE cluster resources. Args: command_type: Type of query to execute **kwargs: Additional arguments will be passed to the query method Returns: Query results based on command type Raises: NotImplementedError: If command type is not implemented """ # Map command types to provider methods command_map = { "get_pods": self.get_pods, "get_pvc": self.get_pvc, "get_node_pressure": self.get_node_pressure, "exec_command": self.exec_command, "restart_pod": self.restart_pod, "get_deployment": self.get_deployment, "scale_deployment": self.scale_deployment, "get_pod_logs": self.get_pod_logs, } if command_type not in command_map: raise NotImplementedError(f"Command type '{command_type}' not implemented") method = command_map[command_type] return method(**kwargs) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Get service account JSON from file with open("sa.json") as f: service_account_data = json.load(f) config 
@pydantic.dataclasses.dataclass
class GoogleChatProviderAuthConfig:
    """
    Authentication settings for the Google Chat provider.

    The only setting is the webhook URL; the provider posts its messages to it.
    """

    # Typed as HttpsUrl and tagged with the "https_url" validation; the field
    # metadata also marks it as required and sensitive.
    webhook_url: HttpsUrl = dataclasses.field(
        metadata={
            "name": "webhook_url",
            "description": "Google Chat Webhook Url",
            "required": True,
            "sensitive": True,
            "validation": "https_url",
        },
    )
provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): self.authentication_config = GoogleChatProviderAuthConfig( **self.config.authentication ) def dispose(self): """ No need to dispose of anything, so just do nothing. """ pass def _notify(self, message="", **kwargs: dict): """ Notify a message to a Google Chat room using a webhook URL. Args: message (str): The text message to send. Raises: ProviderException: If the message could not be sent successfully. """ self.logger.debug("Notifying message to Google Chat") webhook_url = self.authentication_config.webhook_url if not message: raise ProviderException("Message is required") def __send_message(url, body, headers, retries=3): for attempt in range(retries): try: resp = requests.post(url, json=body, headers=headers) if resp.status_code == http.HTTPStatus.OK: return resp self.logger.warning( f"Attempt {attempt + 1} failed with status code {resp.status_code}" ) except requests.exceptions.RequestException as e: self.logger.error(f"Attempt {attempt + 1} failed: {e}") if attempt < retries - 1: time.sleep(1) raise requests.exceptions.RequestException( f"Failed to notify message after {retries} attempts" ) payload = { "text": message, } request_headers = {"Content-Type": "application/json; charset=UTF-8"} response = __send_message(webhook_url, body=payload, headers=request_headers) if response.status_code != http.HTTPStatus.OK: raise ProviderException( f"Failed to notify message to Google Chat: {response.text}" ) self.logger.debug("Alert message sent to Google Chat successfully") return "Alert message sent to Google Chat successfully" if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables google_chat_webhook_url = os.environ.get("GOOGLE_CHAT_WEBHOOK_URL") # 
@pydantic.dataclasses.dataclass
class GrafanaIncidentProviderAuthConfig:
    """
    Connection settings for the Grafana Incident provider.

    Holds the Grafana host URL and the service account token used to
    authenticate against it.
    """

    # Base URL of the Grafana instance, validated as an http(s) URL.
    host_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "Grafana Host URL",
            "hint": "e.g. https://keephq.grafana.net",
            "sensitive": False,
            "validation": "any_http_url",
        },
    )

    # NOTE(review): annotated as ``str`` and marked required in the metadata,
    # yet the dataclass default is None - confirm whether a missing token is
    # meant to be accepted at construction time.
    service_account_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Service Account Token",
            "sensitive": True,
        },
        default=None,
    )
""" PROVIDER_DISPLAY_NAME = "Grafana Incident" PROVIDER_TAGS = ["alert"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="User is Authenticated", ), ] PROVIDER_CATEGORY = ["Incident Management"] SEVERITIES_MAP = { "Pending": IncidentSeverity.INFO, "Critical": IncidentSeverity.CRITICAL, "Major": IncidentSeverity.HIGH, "Minor": IncidentSeverity.LOW, "Moderate": IncidentSeverity.WARNING, "Cosmetic": IncidentSeverity.INFO } STATUS_MAP = {"active": IncidentStatus.FIRING, "resolved": IncidentStatus.RESOLVED} def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def validate_config(self): """ Validate the configuration of the provider. """ self.authentication_config = GrafanaIncidentProviderAuthConfig( **self.config.authentication ) def __get_headers(self): """ Get the headers for the request. """ return { "Authorization": f"Bearer {self.authentication_config.service_account_token}", "Content-Type": "application/json", } def validate_scopes(self) -> dict[str, bool | str]: """ Validate the scopes of the provider. """ try: response = requests.post( urljoin( self.authentication_config.host_url, "/api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.QueryIncidentPreviews", ), headers=self.__get_headers(), json={ "query": { "limit": 1, "orderDirection": "DESC", "orderField": "createdTime", } }, ) if response.status_code == 200: return {"authenticated": True} else: self.logger.error( f"Failed to validate scopes: {response.status_code}") scopes = { "authenticated": f"Unable to query incidents: {response.status_code}" } except Exception as e: self.logger.error(f"Failed to validate scopes: {e}") scopes = {"authenticated": f"Unable to query incidents: {e}"} return scopes @staticmethod def _get_incident_id(incident_id: str) -> str: """ Create a UUID from the incident id. 
def _get_incidents(self) -> list[IncidentDto]:
    """
    Fetch all incident previews from Grafana Incident and convert them to DTOs.

    Pages through ``IncidentsService.QueryIncidentPreviews`` 50 items at a
    time, sending the cursor of the previous response back with each request,
    until the response no longer reports ``hasMore``.

    Returns:
        list[IncidentDto]: One DTO per incident preview returned by the API.

    Raises:
        Exception: If a request fails or returns a non-OK status.
    """
    self.logger.info("Getting incidents from Grafana Incident")

    incidents = []
    payload = {
        "query": {
            "limit": 50,
            "orderDirection": "DESC",
            "orderField": "createdTime",
        },
    }

    while True:
        self.logger.info("Getting incidents from Grafana Incident")
        try:
            response = requests.post(
                urljoin(
                    self.authentication_config.host_url,
                    "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.QueryIncidentPreviews",
                ),
                headers=self.__get_headers(),
                json=payload,
            )

            if not response.ok:
                self.logger.error(
                    f"Failed to get incidents from Grafana Incident: {response.status_code}"
                )
                raise Exception(
                    f"Failed to get incidents from Grafana Incident: {response.status_code} - {response.text}"
                )

            data = response.json()
            incidents.extend(data.get("incidentPreviews") or [])

            # A missing/empty cursor or a missing "hasMore" flag means there is
            # nothing further to fetch. Treating only an explicit False as the
            # stop signal would crash on a missing cursor and loop forever on a
            # missing flag.
            cursor = data.get("cursor") or {}
            if not cursor.get("hasMore"):
                break
            payload["cursor"] = cursor

        except Exception as e:
            self.logger.exception("Failed to get incidents from Grafana Incident")
            raise Exception(
                f"Failed to get incidents from Grafana Incident: {e}"
            ) from e

    self.logger.info(f"Total incidents: {len(incidents)}")

    def parse_grafana_timestamp(timestamp):
        """Parse a 'Z'-suffixed timestamp; return None when it is empty or absent."""
        if not timestamp:
            return None
        try:
            # Try parsing with fractional seconds
            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            # Fallback if fractional seconds are not present
            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")

    incident_dtos = []
    for incident in incidents:
        dto_id = self._get_incident_id(incident.get("incidentID"))

        start_time = parse_grafana_timestamp(incident.get("incidentStart"))
        end_time = parse_grafana_timestamp(incident.get("incidentEnd"))
        created_time = parse_grafana_timestamp(incident.get("createdTime"))

        severity_label = GrafanaIncidentProvider.SEVERITIES_MAP.get(
            incident.get("severityLabel"), IncidentSeverity.INFO
        )
        status = GrafanaIncidentProvider.STATUS_MAP.get(
            incident.get("status"), IncidentStatus.FIRING
        )

        # NOTE(review): this is the total number of fetched incidents, not a
        # per-incident alert count - kept as-is, confirm the intended value.
        alerts_count = len(incidents)

        incident_dtos.append(
            IncidentDto(
                id=dto_id,
                incident_id=incident.get("incidentID"),
                severity_id=incident.get("severityID"),
                severity=severity_label,
                incident_type=incident.get("incidentType"),
                labels=incident.get("labels", []),
                is_drill=incident.get("isDrill"),
                start_time=start_time,
                end_time=end_time,
                created_time=created_time,
                modified_time=incident.get("modifiedTime"),
                closed_time=incident.get("closedTime"),
                created_by_user=incident.get("createdByUser", {}),
                title=incident.get("title"),
                description=incident.get("description"),
                summary=incident.get("summary"),
                hero_image_path=incident.get("heroImagePath"),
                status=status,
                slug=incident.get("slug"),
                incident_start=incident.get("incidentStart"),
                incident_end=incident.get("incidentEnd"),
                field_values=incident.get("fieldValues", []),
                incident_membership_preview=incident.get(
                    "incidentMembershipPreview", {}
                ),
                version=incident.get("version"),
                is_predicted=False,
                is_candidate=False,
                services=["incidentPreviews"],
                alert_sources=["grafana_incident"],
                alerts_count=alerts_count,
                fingerprint=incident.get("incidentID"),
            )
        )

    return incident_dtos
# https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#removelabel
def _remove_label(self, incident_id: str, label: str) -> dict:
    """
    Remove a label from an incident via ``IncidentsService.RemoveLabel``.

    Args:
        incident_id: Sent as ``incidentID``.
        label: Sent as ``label``.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the response status is not OK.
    """
    self.logger.info("Removing incident label in Grafana Incident")
    try:
        request_body = {"incidentID": incident_id, "label": label}
        endpoint = urljoin(
            self.authentication_config.host_url,
            "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.RemoveLabel",
        )
        result = requests.post(
            endpoint, headers=self.__get_headers(), json=request_body
        )

        if result.ok:
            return result.json()

        self.logger.error(
            f"Failed to remove incident label in Grafana Incident: {result.status_code}"
        )
        # Raised inside the try on purpose: the handler below wraps it again.
        raise Exception(
            f"Failed to remove incident label in Grafana Incident: {result.status_code} - {result.text}"
        )
    except Exception as e:
        self.logger.exception("Failed to remove incident label in Grafana Incident")
        raise Exception(f"Failed to remove incident label in Grafana Incident: {e}")
# https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#unassignlabelbyuuid
def _unassign_label_by_uuid(
    self, incident_id: str, key_uuid: str, value_uuid: str
) -> dict:
    """
    Unassign a label, identified by key/value UUIDs, from an incident.

    Calls ``IncidentsService.UnassignLabelByUUID``.

    Args:
        incident_id: Sent as ``incidentID``.
        key_uuid: Sent as ``keyUUID``.
        value_uuid: Sent as ``valueUUID``.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the response status is not OK.
    """
    self.logger.info("Unassigning label by UUID in Grafana Incident")
    try:
        request_body = dict(
            incidentID=incident_id,
            keyUUID=key_uuid,
            valueUUID=value_uuid,
        )
        endpoint = urljoin(
            self.authentication_config.host_url,
            "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.UnassignLabelByUUID",
        )
        result = requests.post(
            endpoint, headers=self.__get_headers(), json=request_body
        )

        if result.ok:
            return result.json()

        self.logger.error(
            f"Failed to unassign label by UUID in Grafana Incident: {result.status_code}"
        )
        # Raised inside the try on purpose: the handler below wraps it again.
        raise Exception(
            f"Failed to unassign label by UUID in Grafana Incident: {result.status_code} - {result.text}"
        )
    except Exception as e:
        self.logger.exception("Failed to unassign label by UUID in Grafana Incident")
        raise Exception(f"Failed to unassign label by UUID in Grafana Incident: {e}")
# https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#updateincidenteventtime
def _update_incident_event_time(
    self, incident_id: str, event_time: str, event_name: str
) -> dict:
    """
    Update the time of an incident event via ``UpdateIncidentEventTime``.

    Args:
        incident_id: Sent as ``incidentID``.
        event_time: Sent as ``eventTime``.
        event_name: Sent as both ``eventName`` and ``activityItemKind``.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the response status is not OK.
    """
    self.logger.info("Updating incident event time in Grafana Incident")
    try:
        request_body = {"incidentID": incident_id, "eventTime": event_time}
        # The same value is sent under both keys.
        request_body["eventName"] = event_name
        request_body["activityItemKind"] = event_name

        endpoint = urljoin(
            self.authentication_config.host_url,
            "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.UpdateIncidentEventTime",
        )
        result = requests.post(
            endpoint, headers=self.__get_headers(), json=request_body
        )

        if result.ok:
            return result.json()

        self.logger.error(
            f"Failed to update incident event time in Grafana Incident: {result.status_code}"
        )
        # Raised inside the try on purpose: the handler below wraps it again.
        raise Exception(
            f"Failed to update incident event time in Grafana Incident: {result.status_code} - {result.text}"
        )
    except Exception as e:
        self.logger.exception(
            "Failed to update incident event time in Grafana Incident"
        )
        raise Exception(
            f"Failed to update incident event time in Grafana Incident: {e}"
        )
# https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#updateseverity
def _update_incident_severity(self, incident_id: str, severity: str) -> dict:
    """
    Change the severity of an incident via ``IncidentsService.UpdateSeverity``.

    Args:
        incident_id: Sent as ``incidentID``.
        severity: Sent as ``severity``.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or the response status is not OK.
    """
    self.logger.info("Updating incident severity in Grafana Incident")
    try:
        request_body = {"incidentID": incident_id, "severity": severity}
        endpoint = urljoin(
            self.authentication_config.host_url,
            "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.UpdateSeverity",
        )
        result = requests.post(
            endpoint, headers=self.__get_headers(), json=request_body
        )

        if result.ok:
            return result.json()

        self.logger.error(
            f"Failed to update incident severity in Grafana Incident: {result.status_code}"
        )
        # Raised inside the try on purpose: the handler below wraps it again.
        raise Exception(
            f"Failed to update incident severity in Grafana Incident: {result.status_code} - {result.text}"
        )
    except Exception as e:
        self.logger.exception("Failed to update incident severity in Grafana Incident")
        raise Exception(f"Failed to update incident severity in Grafana Incident: {e}")
""" self.logger.info("Updating incident status in Grafana Incident") try: payload = { "incidentID": incident_id, "status": status, } response = requests.post( urljoin( self.authentication_config.host_url, "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.UpdateStatus", ), headers=self.__get_headers(), json=payload, ) if not response.ok: self.logger.error( f"Failed to update incident status in Grafana Incident: {response.status_code}" ) raise Exception( f"Failed to update incident status in Grafana Incident: {response.status_code} - {response.text}" ) return response.json() except Exception as e: self.logger.exception( "Failed to update incident status in Grafana Incident") raise Exception( f"Failed to update incident status in Grafana Incident: {e}") # https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#updatetitle def _update_incident_title( self, incident_id: str, title: str ) -> dict: """ Update the incident title in Grafana Incident with the given parameters. 
""" self.logger.info("Updating incident title in Grafana Incident") try: payload = { "incidentID": incident_id, "title": title, } response = requests.post( urljoin( self.authentication_config.host_url, "api/plugins/grafana-incident-app/resources/api/v1/IncidentsService.UpdateTitle", ), headers=self.__get_headers(), json=payload, ) if not response.ok: self.logger.error( f"Failed to update incident title in Grafana Incident: {response.status_code}" ) raise Exception( f"Failed to update incident title in Grafana Incident: {response.status_code} - {response.text}" ) return response.json() except Exception as e: self.logger.exception( "Failed to update incident title in Grafana Incident") raise Exception( f"Failed to update incident title in Grafana Incident: {e}") def _notify(self, operationType: str = "", updateType: str = "", **kwargs): if operationType == "create": return self._create_incident(**kwargs) elif operationType == "update": return self._update_incident(updateType, **kwargs) def _update_incident(self, updateType: str, **kwargs): if updateType == "removeLabel": return self._remove_label(**kwargs) elif updateType == "unassignLabel": return self._unassign_label(**kwargs) elif updateType == "unassignLabelByUUID": return self._unassign_label_by_uuid(**kwargs) elif updateType == "unassignRole": return self._unassign_role(**kwargs) elif updateType == "updateIncidentEventTime": return self._update_incident_event_time(**kwargs) elif updateType == "updateIncidentIsDrill": return self._update_incident_isDrill(**kwargs) elif updateType == "updateIncidentSeverity": return self._update_incident_severity(**kwargs) elif updateType == "updateIncidentStatus": return self._update_incident_status(**kwargs) elif updateType == "updateIncidentTitle": return self._update_incident_title(**kwargs) if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", 
workflow_id="test", ) import os host_url = os.getenv("GRAFANA_HOST_URL") api_token = os.getenv("GRAFANA_SERVICE_ACCOUNT_TOKEN") if host_url is None or api_token is None: raise Exception( "GRAFANA_HOST_URL and GRAFANA_SERVICE_ACCOUNT_TOKEN environment variables are required" ) config = ProviderConfig( description="Grafana Incident Provider", authentication={ "host_url": host_url, "service_account_token": api_token, }, ) provider = GrafanaIncidentProvider( context_manager, provider_id="grafana_incident", config=config, ) provider._get_incidents() ================================================ FILE: keep/providers/grafana_loki_provider/README.md ================================================ ## Grafana Loki Setup using Docker 1. Create a directory called loki. Make loki your current working directory. ```bash mkdir loki cd loki ``` 2. Copy and paste the following command into your command line to download the docker-compose file. ```bash wget https://raw.githubusercontent.com/grafana/loki/v3.4.1/production/docker-compose.yaml -O docker-compose.yaml ``` 3. With loki as the current working directory, run the following `docker-compose` command. ```bash docker-compose -f docker-compose.yaml up ``` 4. Verify that Loki is up and running by visiting [http://localhost:3100/ready](http://localhost:3100/ready) in your browser. Note: If the above setup does not work, please refer to the official [Grafana Loki documentation](https://grafana.com/docs/loki/latest/setup/install/docker/#install-with-docker-compose) for the latest instructions. ## Grafana Loki Setup using Docker (Basic HTTP Auth) 1. Create a directory called loki. Make loki your current working directory. ```bash mkdir loki cd loki ``` 2. Fetch the `docker-compose.auth.yml` file ```bash wget https://raw.githubusercontent.com/keephq/keep/refs/heads/main/keep/providers/grafana_loki_provider/docker-compose.auth.yml ``` 3. Create a file called `loki-basic-auth.yml` with the following content in the loki directory.
```yaml server: http_listen_port: 9080 grpc_listen_port: 0 positions: filename: /tmp/positions.yaml clients: - url: http://loki:3100/loki/api/v1/push basic_auth: username: admin password: admin scrape_configs: - job_name: system static_configs: - targets: - localhost labels: job: varlogs __path__: /var/log/*log ``` 4. Start the Loki server with Basic HTTP Auth ```bash docker compose -f docker-compose.auth.yml up ``` ================================================ FILE: keep/providers/grafana_loki_provider/__init__.py ================================================ ================================================ FILE: keep/providers/grafana_loki_provider/docker-compose.auth.yml ================================================ version: "3.3" networks: loki: services: loki: image: grafana/loki:latest ports: - "3100:3100" command: -config.file=/etc/loki/local-config.yaml networks: - loki nginx: image: laurentbel/nginx-basic-auth ports: - "80:80" depends_on: - loki environment: - FORWARD_HOST=loki - FORWARD_PORT=3100 - BASIC_USERNAME=admin - BASIC_PASSWORD=admin networks: - loki promtail: image: grafana/promtail:latest volumes: - /var/log:/var/log - ./loki-basic-auth.yml:/etc/promtail/loki-basic-auth.yml command: -config.file=/etc/promtail/loki-basic-auth.yml networks: - loki grafana: environment: - GF_PATHS_PROVISIONING=/etc/grafana/provisioning - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_FEATURE_TOGGLES_ENABLE=alertingSimplifiedRouting,alertingQueryAndExpressionsStepMode entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh image: grafana/grafana:latest ports: - "3200:3000" networks: - loki ================================================ FILE:
keep/providers/grafana_loki_provider/grafana_loki_provider.py ================================================ """ GrafanaLokiProvider is a class that allows you to query logs from Grafana Loki. """ import base64 import dataclasses import typing from urllib.parse import urljoin import pydantic import requests from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class GrafanaLokiProviderAuthConfig: """ GrafanaLokiProviderAuthConfig is a class that allows you to authenticate in Grafana Loki. """ host_url: pydantic.AnyHttpUrl = dataclasses.field( metadata={ "required": True, "description": "Grafana Loki Host URL", "hint": "e.g. https://keephq.grafana.net", "sensitive": False, "validation": "any_http_url", } ) verify: bool = dataclasses.field( metadata={ "description": "Enable SSL verification", "hint": "SSL verification is enabled by default", "type": "switch", "config_main_group": "authentication", "config_sub_group": "basic_authentication", }, default=True, ) authentication_type: typing.Literal["NoAuth", "Basic", "X-Scope-OrgID"] = ( dataclasses.field( default=typing.cast( typing.Literal["NoAuth", "Basic", "X-Scope-OrgID"], "NoAuth" ), metadata={ "required": True, "description": "Authentication Type", "type": "select", "options": ["NoAuth", "Basic", "X-Scope-OrgID"], }, ) ) username: typing.Optional[str] = dataclasses.field( default=None, metadata={ "required": False, "description": "HTTP basic authentication - Username", "sensitive": False, "config_sub_group": "basic_authentication", "config_main_group": "authentication", }, ) password: typing.Optional[str] = dataclasses.field( default=None, metadata={ "required": False, "description": "HTTP basic authentication - Password", "sensitive": True, "config_sub_group": "basic_authentication", "config_main_group": "authentication", }, ) 
class GrafanaLokiProvider(BaseProvider):
    # Provider that queries logs from Grafana Loki through its HTTP API
    # (`/loki/api/v1/query` and `/loki/api/v1/query_range`).
    PROVIDER_DISPLAY_NAME = "Grafana Loki"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="Instance is valid and user is authenticated",
        ),
    ]
    PROVIDER_CATEGORY = ["Monitoring"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        No-op: nothing is disposed by this provider.
        """
        pass

    def validate_config(self):
        """
        Validate the configuration of the provider.
        """
        self.authentication_config = GrafanaLokiProviderAuthConfig(
            **self.config.authentication
        )

    def _build_url(self, path: str) -> str:
        """
        Join the configured host URL with an API path.

        A trailing slash on the host and a leading slash on the path are
        normalised so the result never contains `//` in the path, and any path
        prefix in the host URL (e.g. a reverse-proxy mount point) is preserved.

        Args:
            path (str): API path, e.g. "/loki/api/v1/query".

        Returns:
            str: The absolute URL to request.
        """
        base = str(self.authentication_config.host_url).rstrip("/")
        return urljoin(f"{base}/", path.lstrip("/"))

    def generate_auth_headers(self):
        """
        Generate the authentication headers.

        Returns:
            dict: An `Authorization` header for "Basic", an `X-Scope-OrgID`
            header for "X-Scope-OrgID" (when a value is configured), or an
            empty dict for "NoAuth".
        """
        auth = self.authentication_config
        credentials = {}

        if auth.authentication_type == "Basic":
            # Unset credentials are encoded as empty strings instead of the
            # literal text "None".
            username = auth.username or ""
            password = auth.password or ""
            encoded_credentials = base64.b64encode(
                f"{username}:{password}".encode("utf-8")
            ).decode("utf-8")
            credentials["Authorization"] = f"Basic {encoded_credentials}"
        elif (
            auth.authentication_type == "X-Scope-OrgID"
            and auth.x_scope_orgid is not None
        ):
            credentials["X-Scope-OrgID"] = auth.x_scope_orgid

        return credentials

    def validate_scopes(self):
        """
        Validate the scopes of the provider.

        Requests the Loki build-info endpoint with the configured credentials,
        using the same URL construction as `_query` so that validation reflects
        what queries will actually hit.

        Returns:
            dict: {"authenticated": True} on success, otherwise
            {"authenticated": "<error message>"}.
        """
        try:
            response = requests.get(
                self._build_url("/loki/api/v1/status/buildinfo"),
                headers=self.generate_auth_headers(),
                timeout=5,
                verify=self.authentication_config.verify,
            )
            response.raise_for_status()
            self.logger.info(
                "Successfully validated scopes", extra={"response": response.json()}
            )
            return {"authenticated": True}
        except Exception as e:
            self.logger.exception("Failed to validate scopes", extra={"error": e})
            return {"authenticated": str(e)}

    def _query(
        self,
        query="",
        limit="",
        time="",
        direction="",
        start="",
        end="",
        since="",
        step="",
        interval="",
        queryType="",
        **kwargs: dict,
    ):
        """
        Query logs from Grafana Loki.

        Args:
            query: The LogQL query.
            limit, time, direction: Optional parameters sent for "query".
            limit, start, end, since, step, interval, direction: Optional
                parameters sent for "query_range".
            queryType: Either "query" or "query_range"; selects the endpoint.

        Returns:
            The decoded JSON body of the Loki response.

        Raises:
            Exception: If `queryType` is not supported or Loki answers with an
                error status / a body that is not JSON.
        """
        # Parameters accepted by each endpoint, in the order they are sent.
        params_by_type = {
            "query": {
                "query": query,
                "limit": limit,
                "time": time,
                "direction": direction,
            },
            "query_range": {
                "query": query,
                "limit": limit,
                "start": start,
                "end": end,
                "since": since,
                "step": step,
                "interval": interval,
                "direction": direction,
            },
        }

        candidates = (
            params_by_type.get(queryType) if isinstance(queryType, str) else None
        )
        if candidates is None:
            self.logger.error("Invalid query type")
            raise Exception("Invalid query type")

        # Only send the parameters the caller actually provided.
        params = {k: v for k, v in candidates.items() if v}

        response = requests.get(
            self._build_url(f"/loki/api/v1/{queryType}"),
            headers=self.generate_auth_headers(),
            params=params,
            verify=self.authentication_config.verify,
        )

        try:
            response.raise_for_status()
            return response.json()
        except Exception as e:
            self.logger.error(
                "Failed to query logs from Grafana Loki", extra={"error": e}
            )
            # Chain the original error so the HTTP status / decode failure is
            # not lost in the traceback.
            raise Exception(
                f"Could not query logs from Grafana Loki with {queryType}"
            ) from e
""" PROVIDER_DISPLAY_NAME = "Grafana OnCall" PROVIDER_CATEGORY = ["Incident Management"] API_URI = "api/v1" provider_description = "Grafana OnCall is an oncall management solution." def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Grafana provider. """ self.authentication_config = GrafanaOncallProviderAuthConfig( **self.config.authentication ) def clean_url(self, url): parsed = urlparse(url) normalized_path = '/'.join(part for part in parsed.path.split('/') if part) _clean_url = urlunparse(parsed._replace(path=f'/{normalized_path}')) return _clean_url def __init__(self, context_manager: ContextManager, provider_id: str, config: ProviderConfig): super().__init__(context_manager, provider_id, config) KEEP_INTEGRATION_NAME = "Keep Integration" if self.config.authentication.get("oncall_integration_link") is not None: return None # Create Grafana OnCall integration if the integration link is not saved headers = { "Authorization": f"{config.authentication['token']}", "Content-Type": "application/json", } response = requests.post( url=self.clean_url(f"{config.authentication['host']}/{self.API_URI}/integrations/"), headers=headers, json={ "name": KEEP_INTEGRATION_NAME, "type":"webhook" }, ) existing_integration_link = None if response.status_code == 400: # If integration already exists, get the link if response.json().get("detail") == "An integration with this name already exists for this team": response = requests.get( url=self.clean_url(f"{config.authentication['host']}/{self.API_URI}/integrations/"), headers=headers, ) response.raise_for_status() for integration in response.json()['results']: if integration.get("name") == KEEP_INTEGRATION_NAME: existing_integration_link = integration.get("link") break elif response.status_code in [200, 201]: 
response_json = response.json() existing_integration_link = response_json.get("link") else: logger.error(f"Error installing the provider: {response.status_code}") raise Exception(f"Error installing the provider: {response.status_code}") if "integrations/v1/" in urlsplit(existing_integration_link).path: self.config.authentication["oncall_integration_link"] = existing_integration_link else: Exception("Error creating the integration link, the URL is not OnCall formatted.") def _notify( self, title: str, alert_uid: str | None = None, message: str = "", image_url: str = "", state: Literal["alerting", "resolved"] = "alerting", link_to_upstream_details: str = "", **kwargs, ): headers = { "Content-Type": "application/json", } response = requests.post( url=self.config.authentication["oncall_integration_link"], headers=headers, json={ "title": title, "message": message, "alert_uid": alert_uid, "image_url": image_url, "state": state, "link_to_upstream_details": link_to_upstream_details, }, ) response.raise_for_status() return response.json() if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) # Load environment variables import os host = os.environ.get("GRAFANA_ON_CALL_HOST") token = os.environ.get("GRAFANA_ON_CALL_TOKEN") context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = { "authentication": {"host": host, "token": token}, } provider: GrafanaOncallProvider = ProvidersFactory.get_provider( context_manager, provider_id="grafana-oncall-keephq", provider_type="oncall", provider_config=config, ) alert = provider.notify("Test Alert") print(alert) ================================================ FILE: keep/providers/grafana_provider/README.md ================================================ ## How to debug with local grafana ### version 9.3.2(with the bug) docker run -d --name=grafana -p 3001:3000 grafana/grafana-enterprise:9.3.2 ### version > 9.4.7 
(latest) docker run -d --name=grafana -p 3001:3000 grafana/grafana-enterprise ### Version 10.4 with legacy alerting Create a custom config file # Create a custom config file cat << EOF > grafana.ini [alerting] enabled = true [unified_alerting] enabled = false EOF Run Grafana with legacy alerting enabled ``` docker run -d \ --name=grafana-legacy \ -p 3001:3000 \ -v $(pwd)/grafana.ini:/etc/grafana/grafana.ini \ grafana/grafana-enterprise:10.4.0 ``` Default login credentials: username: admin password: admin The only part that needs to be done manually: ``` curl -X POST -H "Content-Type: application/json" \ -u admin:admin \ http://localhost:3001/api/serviceaccounts \ -d '{"name":"keep-service-account","role":"Admin"}' # should get something like: {"id":2,"name":"keep-service-account","login":"sa-keep-service-account","orgId":1,"isDisabled":false,"role":"Admin","tokens":0,"avatarUrl":""}% # then take the id and: curl -X POST -H "Content-Type: application/json" \ -u admin:admin \ http://localhost:3001/api/serviceaccounts/2/tokens \ -d '{"name":"keep-token"}' # and get {"id":1,"name":"keep-token","key":"glsa_XXXXXX"}% ``` ### For Topology Quickstart Follow this guide: https://grafana.com/docs/tempo/latest/getting-started/docker-example/ ================================================ FILE: keep/providers/grafana_provider/__init__.py ================================================ ================================================ FILE: keep/providers/grafana_provider/alerts_mock.py ================================================ ALERTS = { "HighMemoryConsumption": { "service": "api", "payload": { "condition": "B", "data": [ { "datasourceUid": "datasource2", "model": { "conditions": [ { "evaluator": {"params": [80], "type": "gt"}, "operator": {"type": "or"}, "query": {"params": ["B", "10m", "now"]}, "reducer": {"params": [], "type": "avg"}, "type": "query", } ], "datasource": {"type": "grafana", "uid": "datasource2"}, "expression": "", "hide": False, "intervalMs": 2000, 
"maxDataPoints": 50, "refId": "B", "type": "classic_conditions", }, "queryType": "", "refId": "B", "relativeTimeRange": {"from": 600, "to": 0}, } ], "execErrState": "Alerting", "folderUID": "keep_alerts", "for_": "10m", "isPaused": False, "labels": {"severity": "warning", "monitor": "memory"}, "noDataState": "NoData", "orgID": 1, "ruleGroup": "keep_group_2", "title": "High Memory Usage", "annotations": { "summary": "Memory Usage High on {{ host.name }}", }, }, "parameters": { "labels.monitor": ["server1", "server2", "server3"], "for_": ["10m", "30m", "1h"], }, "renders": { "host.name": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, }, "NetworkLatencyIsHigh": { "service": "db", "payload": { "condition": "C", "data": [ { "datasourceUid": "datasource3", "model": { "conditions": [ { "evaluator": {"params": [100], "type": "gt"}, "operator": {"type": "and"}, "query": {"params": ["C", "15m", "now"]}, "reducer": {"params": [], "type": "max"}, "type": "query", } ], "datasource": {"type": "grafana", "uid": "datasource3"}, "expression": "", "hide": False, "intervalMs": 3000, "maxDataPoints": 30, "refId": "C", "type": "classic_conditions", }, "queryType": "", "refId": "C", "relativeTimeRange": {"from": 900, "to": 0}, } ], "execErrState": "Alerting", "folderUID": "keep_alerts", "for_": "15m", "isPaused": False, "labels": {"severity": "info", "monitor": "network"}, "noDataState": "NoData", "orgID": 1, "ruleGroup": "keep_group_3", "title": "Network Latency High", "annotations": { "summary": "Network Latency High on {{ host.name }}", }, }, "parameters": { "labels.monitor": ["router1", "router2", "router3"], "for_": ["15m", "45m", "1h"], }, "renders": { "host.name": [ "srv1-us1-prod", "srv2-us1-prod", "srv1-eu1-prod", "srv3-us1-prod", "srv2-eu1-prod", "srv1-ap1-prod", "srv2-ap1-prod", "srv1-us2-prod", ], }, }, } ================================================ FILE: 
keep/providers/grafana_provider/docker-compose.yml ================================================ version: "3.8" services: grafana: image: grafana/grafana-enterprise:10.4.0 user: "472" # Grafana's default user ID ports: - "3001:3000" volumes: - ./grafana/provisioning:/etc/grafana/provisioning:ro - ./grafana/grafana.ini:/etc/grafana/grafana.ini:ro - ./grafana/png:/var/lib/grafana/png - grafana-storage:/var/lib/grafana environment: - GF_SECURITY_ADMIN_PASSWORD=admin # Add renderer configurations - GF_RENDERING_SERVER_URL=http://renderer:8081/render - GF_RENDERING_CALLBACK_URL=http://grafana:3000/ depends_on: - prometheus - node-exporter-1 - node-exporter-2 - renderer # Add dependency on renderer # Add the renderer service renderer: image: grafana/grafana-image-renderer:latest ports: - "8081:8081" environment: - ENABLE_METRICS=true prometheus: image: prom/prometheus:latest ports: - "9090:9090" volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - "--web.console.libraries=/etc/prometheus/console_libraries" - "--web.console.templates=/etc/prometheus/consoles" node-exporter-1: image: prom/node-exporter:latest container_name: node-exporter-1 ports: - "9100:9100" volumes: - /proc:/host/proc:ro - /sys:/host/sys:ro - /:/host/rootfs:ro command: - "--path.procfs=/host/proc" - "--path.sysfs=/host/sys" - "--path.rootfs=/host/rootfs" - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" node-exporter-2: image: prom/node-exporter:latest container_name: node-exporter-2 ports: - "9101:9100" volumes: - /proc:/host/proc:ro - /sys:/host/sys:ro - /:/host/rootfs:ro command: - "--path.procfs=/host/proc" - "--path.sysfs=/host/sys" - "--path.rootfs=/host/rootfs" - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" volumes: grafana-storage: {} ================================================ FILE: 
keep/providers/grafana_provider/grafana/grafana.ini ================================================ [log] filters = rendering:debug,ngalert:debug,ngalert.image:debug [alerting] enabled = false # Keep this disabled for unified alerting [unified_alerting] enabled = true [unified_alerting.screenshots] capture = true upload_external_image_storage = true max_concurrent = 5 capture_timeout = 10s [external_image_storage] provider = local path = /var/lib/grafana/png [server] root_url = http://localhost:3001 protocol = http domain = localhost:3001 [database] wal = true url = sqlite3:///var/lib/grafana/grafana.db?_busy_timeout=500 [service_accounts] enabled = true [rendering] server_url = http://renderer:8081/render callback_url = http://grafana:3000/ mode = server ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/access_control/custom_roles.yml ================================================ apiVersion: 1 roles: - version: 1 uid: keep_service_role name: Keep Service Role description: Role for Keep integration orgId: 1 global: false permissions: - action: "alert.rules:read" scope: "alerts:*" - action: "alert.provisioning:read" scope: "alerts:*" - action: "alert.provisioning:write" scope: "alerts:*" ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/alerting/alerts.yml ================================================ apiVersion: 1 groups: - orgId: 1 name: System Alerts folder: System interval: 10s rules: - uid: high_cpu_alert title: High CPU Usage condition: B data: - refId: A relativeTimeRange: from: 300 to: 0 datasourceUid: PBFA97CFB590B2093 model: editorMode: code expr: sum by(instance) (rate(node_cpu_seconds_total{mode="user"}[5m])) * 100 hide: false intervalMs: 1000 maxDataPoints: 43200 range: true refId: A - refId: B datasourceUid: __expr__ model: conditions: - evaluator: params: [1] type: gt operator: type: and query: params: [A] reducer: type: last 
params: [] type: query expression: A intervalMs: 1000 reducer: last type: reduce refId: B dashboardUid: system panelId: 1 noDataState: NoData execErrState: Alerting for: 30s annotations: description: "CPU usage is above threshold for instance {{ $labels.instance }}" labels: severity: warning isPaused: false - uid: high_memory_alert title: High Memory Usage condition: B data: - refId: A relativeTimeRange: from: 300 to: 0 datasourceUid: PBFA97CFB590B2093 model: editorMode: code expr: ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) hide: false intervalMs: 1000 maxDataPoints: 43200 range: true refId: A - refId: B datasourceUid: __expr__ model: conditions: - evaluator: params: [90] type: gt operator: type: and query: params: [A] reducer: type: last params: [] type: query expression: A intervalMs: 1000 reducer: last type: reduce refId: B dashboardUid: main panelId: 2 noDataState: NoData execErrState: Alerting for: 30s annotations: description: "Memory usage is above 90% for instance {{ $labels.instance }}" labels: severity: warning isPaused: false ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/alerting/contact_points.yml ================================================ apiVersion: 1 contactPoints: - name: "api-notifications" # This name is what policies refer to receivers: - uid: "api-notifications" # This is internal uid type: "webhook" settings: url: "https://3c56569dbd81.ngrok.app/alerts/event/grafana?api_key=1234567890" httpMethod: "POST" ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/alerting/notification_policies.yml ================================================ apiVersion: 1 policies: - orgId: 1 receiver: "api-notifications" # Changed from api-webhook to api-notifications group_by: ["alertname"] routes: - receiver: "api-notifications" # Changed this too group_by: ["..."] matchers: - severity 
=~ "warning|critical" group_wait: 30s group_interval: 5m repeat_interval: 4h ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/dashboards/dashboards.yml ================================================ apiVersion: 1 providers: - name: "default" orgId: 1 folder: "" type: file disableDeletion: false updateIntervalSeconds: 10 allowUiUpdates: true options: path: /etc/grafana/provisioning/dashboards ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/dashboards/system.json ================================================ { "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "panels": [ { "alert": { "alertRuleTags": { "severity": "critical" }, "conditions": [ { "evaluator": { "params": [0], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": ["A", "5m", "now"] }, "reducer": { "params": [], "type": "last" }, "type": "query" } ], "executionErrorState": "alerting", "for": "30s", "frequency": "10s", "handler": 1, "message": "Critical: High CPU Usage on instance ${instance}: ${value}%", "name": "Critical CPU Alert", "noDataState": "no_data", "notifications": [ { "uid": "email-notifier" } ] }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "id": 1, "options": { "alertThreshold": true }, "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "sum(rate(node_cpu_seconds_total{mode=\"user\"}[5m])) by (instance) * 100", "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0, "visible": true } ], "title": "CPU Usage 
(Critical Alert)", "type": "graph" }, { "alert": { "alertRuleTags": { "severity": "warning" }, "conditions": [ { "evaluator": { "params": [60], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": ["A", "5m", "now"] }, "reducer": { "params": [], "type": "last" }, "type": "query" } ], "executionErrorState": "alerting", "for": "30s", "frequency": "10s", "handler": 1, "message": "Warning: Elevated CPU Usage on instance ${instance}: ${value}%", "name": "Warning CPU Alert", "noDataState": "no_data", "notifications": [ { "uid": "email-notifier" } ] }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "id": 2, "options": { "alertThreshold": true }, "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "sum(rate(node_cpu_seconds_total{mode=\"user\"}[5m])) by (instance) * 100", "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [ { "colorMode": "warning", "fill": true, "line": true, "op": "gt", "value": 60, "visible": true } ], "title": "CPU Usage (Warning Alert)", "type": "graph" } ], "refresh": "5s", "schemaVersion": 39, "tags": [], "title": "System Metrics", "uid": "system", "version": 1 } ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/datasources/datasource.yml ================================================ apiVersion: 1 datasources: - name: Prometheus type: prometheus access: proxy url: http://prometheus:9090 isDefault: true ================================================ FILE: keep/providers/grafana_provider/grafana/provisioning/notifiers/email.yml ================================================ apiVersion: 1 notifiers: - name: email-notifier type: email uid: email-notifier org_id: 1 is_default: true settings: addresses: alerts@example.com secure_settings: {} ================================================ FILE: 
# Pydantic model describing the alert-rule payload: the rule condition, its
# query/expression data, and the metadata fields listed below.
# Documented with comments rather than a class docstring so the generated
# schema is left untouched.
class GrafanaAlertFormatDescription(BaseModel):
    # refId (single character) of the entry in `data` that acts as the condition.
    condition: str = Field(
        ..., max_length=1, description="Must be one of the refId in data"
    )
    # Query / expression definitions the rule evaluates.
    data: List[Datum]
    # State the rule takes when evaluation fails.
    execErrState: Literal["OK", "Alerting", "Error"]
    # UID of the folder that holds the rule (1-30 characters).
    folderUID: str = Field(
        ...,
        min_length=1,
        max_length=30,
        description="Folder UID, cannot be empty",
        required=True,
    )
    # Exposed as "for" in the payload; `for` is a Python keyword, hence the alias.
    for_: str = Field(..., alias="for", description="For example: 5m/1h/1d")
    isPaused: bool
    labels: dict = Field(..., description="Key-value pairs, cannot be empty")
    # State the rule takes when the query returns no data.
    noDataState: Literal["NoData", "OK", "Alerting"]
    orgID: int
    ruleGroup: str = Field(
        ..., max_length=190, min_length=1, description="Rule group name"
    )
    title: str = Field(
        ..., max_length=190, min_length=1, description="Alert title", required=True
    )

    class Config:
        # Example values attached to the model configuration
        # (presumably surfaced in the generated schema — confirm against the
        # pydantic version in use).
        schema_extra = {
            "example": {
                "condition": "A",
                "folderUID": "keep_alerts",
                "labels": {"team": "sre-team-1"},
                "ruleGroup": "keep_group_1",
            },
        }
https://keephq.grafana.net", "validation": "any_http_url", }, ) datasource_uid: str = dataclasses.field( metadata={ "required": False, "description": "Datasource UID", "hint": "Provide if you want to pull topology data", }, default="", ) class GrafanaProvider(BaseTopologyProvider, ProviderHealthMixin): PROVIDER_DISPLAY_NAME = "Grafana" """Pull/Push alerts & Topology map from Grafana.""" PROVIDER_CATEGORY = ["Monitoring", "Developer Tools"] KEEP_GRAFANA_WEBHOOK_INTEGRATION_NAME = "keep-grafana-webhook-integration" FINGERPRINT_FIELDS = ["fingerprint"] webhook_description = "" webhook_template = "" webhook_markdown = """If your Grafana is unreachable from Keep, you can use the following webhook url to configure Grafana to send alerts to Keep: 1. In Grafana, go to the Alerting tab in the Grafana dashboard. 2. Click on Contact points in the left sidebar and create a new one. 3. Give it a name and select Webhook as kind of contact point with webhook url as {keep_webhook_api_url}. 4. Add 'X-API-KEY' as the request header {api_key}. 5. Save the webhook. 6. Click on Notification policies in the left sidebar 7. Click on "New child policy" under the "Default policy" 8. Remove all matchers until you see the following: "If no matchers are specified, this notification policy will handle all alert instances." 9. 
Chose the webhook contact point you have just created under Contact point and click "Save Policy" """ PROVIDER_SCOPES = [ ProviderScope( name="alert.rules:read", description="Read Grafana alert rules in a folder and its subfolders.", mandatory=True, mandatory_for_webhook=False, documentation_url="https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/", alias="Rules Reader", ), ProviderScope( name="alert.provisioning:read", description="Read all Grafana alert rules, notification policies, etc via provisioning API.", mandatory=False, mandatory_for_webhook=True, documentation_url="https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/", alias="Access to alert rules provisioning API", ), ProviderScope( name="alert.provisioning:write", description="Update all Grafana alert rules, notification policies, etc via provisioning API.", mandatory=False, mandatory_for_webhook=True, documentation_url="https://grafana.com/docs/grafana/latest/administration/roles-and-permissions/access-control/custom-role-actions-scopes/", alias="Access to alert rules provisioning API", ), ] SEVERITIES_MAP = { "critical": AlertSeverity.CRITICAL, "high": AlertSeverity.HIGH, "warning": AlertSeverity.WARNING, "info": AlertSeverity.INFO, } # https://grafana.com/docs/grafana/latest/alerting/manage-notifications/view-state-health/#alert-instance-state STATUS_MAP = { "ok": AlertStatus.RESOLVED, "resolved": AlertStatus.RESOLVED, "normal": AlertStatus.RESOLVED, "paused": AlertStatus.SUPPRESSED, "alerting": AlertStatus.FIRING, "pending": AlertStatus.PENDING, "no_data": AlertStatus.PENDING, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Grafana provider. 
""" self.authentication_config = GrafanaProviderAuthConfig( **self.config.authentication ) def validate_scopes(self) -> dict[str, bool | str]: headers = {"Authorization": f"Bearer {self.authentication_config.token}"} permissions_api = ( f"{self.authentication_config.host}/api/access-control/user/permissions" ) try: response = requests.get( permissions_api, headers=headers, timeout=5, verify=False ).json() except requests.exceptions.ConnectionError: self.logger.exception("Failed to connect to Grafana") validated_scopes = { scope.name: "Failed to connect to Grafana. Please check your host." for scope in self.PROVIDER_SCOPES } return validated_scopes except Exception: self.logger.exception("Failed to get permissions from Grafana") validated_scopes = { scope.name: "Failed to get permissions. Please check your token." for scope in self.PROVIDER_SCOPES } return validated_scopes validated_scopes = {} for scope in self.PROVIDER_SCOPES: if scope.name in response: validated_scopes[scope.name] = True else: validated_scopes[scope.name] = "Missing scope" return validated_scopes def get_provider_metadata(self) -> dict: version = self._get_grafana_version() return { "version": version, } def get_alerts_configuration(self, alert_id: str | None = None): api = f"{self.authentication_config.host}/api/v1/provisioning/alert-rules" headers = {"Authorization": f"Bearer {self.authentication_config.token}"} response = requests.get(api, verify=False, headers=headers) if not response.ok: self.logger.warning( "Could not get alerts", extra={"response": response.json()} ) error = response.json() if response.status_code == 403: error[ "message" ] += f"\nYou can test your permissions with \n\tcurl -H 'Authorization: Bearer {{token}}' -X GET '{self.authentication_config.host}/api/access-control/user/permissions' | jq \nDocs: https://grafana.com/docs/grafana/latest/administration/service-accounts/#debug-the-permissions-of-a-service-account-token" raise GetAlertException(message=error, 
@staticmethod
def calculate_fingerprint(alert: dict) -> str:
    """
    Return an identifier for a Grafana alert.

    Resolution order:
      1. ``alert["fingerprint"]`` when present and truthy (returned as-is).
      2. ``alert["labels"]["fingerprint"]`` when present and truthy (as-is).
      3. SHA-256 hex digest of ``json.dumps(labels)`` when labels exist.
      4. Legacy fallback: SHA-256 hex digest of ``alertname + service``.

    Args:
        alert: A single alert dictionary from a Grafana payload.

    Returns:
        The provided fingerprint, or a 64-character hex digest.
    """
    # First, try to get fingerprint from alert
    fingerprint = alert.get("fingerprint", "")
    if fingerprint:
        logger.debug("Fingerprint provided in alert")
        return fingerprint

    labels = alert.get("labels", {})
    fingerprint = labels.get("fingerprint", "")
    if fingerprint:
        logger.debug("Fingerprint provided in alert labels")
        return fingerprint

    fingerprint_string = None
    if not labels:
        logger.warning(
            "No labels found in alert will use old behaviour",
            extra={
                "labels": labels,
            },
        )
    else:
        try:
            logger.info("No fingerprint in alert, calculating fingerprint by labels")
            # NOTE: the serialization depends on label order. It is left
            # as-is because changing it would change the fingerprints of
            # alerts that were already ingested.
            fingerprint_string = json.dumps(labels)
        except Exception:
            logger.exception(
                "Failed to calculate fingerprint",
                extra={
                    "labels": labels,
                },
            )

    # for some reason, the fingerprint is not provided in the alert + no labels or failed to calculate
    if not fingerprint_string:
        # old behavior: alert name followed by the service name.
        # The alert-level fingerprint is known to be falsy at this point, so
        # it must not be consulted again: a present-but-None value used to
        # crash on ``.encode()`` and a present-but-empty value made every
        # such alert hash to the same digest.
        service = GrafanaProvider.get_service(alert)
        alert_name = alert.get("alertname") or ""
        fingerprint_string = f"{alert_name}{service}"

    return hashlib.sha256(fingerprint_string.encode()).hexdigest()
values: extra["values"] = values url = alert.get("generatorURL", None) image_url = alert.get("imageURL", None) # Always set these as "" when absent so workflow templates can # reference them safely without triggering render_context safe=True errors. dashboard_url = alert.get("dashboardURL", "") panel_url = alert.get("panelURL", "") silence_url = alert.get("silenceURL", "") description = alert.get("annotations", {}).get("description") or alert.get( "annotations", {} ).get("summary", "") valueString = alert.get("valueString", "") alert_dto = AlertDto( id=alert.get("fingerprint"), fingerprint=fingerprint, name=event.get("title"), status=status, severity=severity, environment=environment, lastReceived=datetime.datetime.now( tz=datetime.timezone.utc ).isoformat(), description=description, source=["grafana"], labels=labels, url=url or None, imageUrl=image_url or None, dashboardUrl=dashboard_url, panelUrl=panel_url, silenceURL=silence_url, valueString=valueString, value="", datasource="", **extra, # add annotations and values ) # enrich extra payload with labels for label in labels: if getattr(alert_dto, label, None) is None: setattr(alert_dto, label, labels[label]) formatted_alerts.append(alert_dto) return formatted_alerts @staticmethod def _format_legacy_alert(event: dict) -> AlertDto: # Legacy alerts have a different structure status = ( AlertStatus.FIRING if event.get("state") == "alerting" else AlertStatus.RESOLVED ) severity = GrafanaProvider.SEVERITIES_MAP.get("critical", AlertSeverity.INFO) alert_dto = AlertDto( id=str(event.get("ruleId", "")), fingerprint=str(event.get("ruleId", "")), name=event.get("ruleName", ""), status=status, severity=severity, lastReceived=datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), description=event.get("message", ""), source=["grafana"], labels={ "metric": event.get("metric", ""), "ruleId": str(event.get("ruleId", "")), "ruleName": event.get("ruleName", ""), "ruleUrl": event.get("ruleUrl", ""), "state": event.get("state", 
def setup_webhook(
    self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
):
    """
    Create or update the Keep webhook contact point in Grafana and,
    optionally, route alerts to it.

    Steps, in order:
      1. List the provisioned contact points and look for one whose name or
         uid equals ``<KEEP_GRAFANA_WEBHOOK_INTEGRATION_NAME>-<tenant_id>``.
      2. Decide how the API key is delivered: Grafana newer than 9.4.7 gets
         ``authorization_scheme`` / ``authorization_credentials`` settings;
         any other (or unparseable) version gets the key appended to the
         webhook URL as the ``api_key`` query parameter.
      3. Update the matching contact point (PUT) or create one (POST).
      4. When ``setup_alerts`` is true, append a notification-policy route
         for the webhook, preceded by a route for the current default
         receiver if there is one and it has no such route yet.
      5. Best effort: when legacy alerting is enabled, configure it as well;
         failures in this step are logged and not raised.

    Args:
        tenant_id: Tenant identifier, used as the webhook name suffix.
        keep_api_url: URL Grafana should post alerts to.
        api_key: Key Grafana uses to authenticate against Keep.
        setup_alerts: Whether to also update notification policies (and
            legacy dashboard alerts).

    Raises:
        Exception: When contact points cannot be listed, the version lookup
            raises, or Grafana rejects the creation of a new contact point.
    """
    self.logger.info("Setting up webhook")
    webhook_name = (
        f"{GrafanaProvider.KEEP_GRAFANA_WEBHOOK_INTEGRATION_NAME}-{tenant_id}"
    )
    headers = {"Authorization": f"Bearer {self.authentication_config.token}"}
    provisioning_headers = {**headers, "X-Disable-Provenance": "true"}
    contacts_api = (
        f"{self.authentication_config.host}/api/v1/provisioning/contact-points"
    )
    try:
        self.logger.info("Getting contact points")
        contact_points_response = requests.get(
            contacts_api, verify=False, headers=headers
        )
        contact_points_response.raise_for_status()
        all_contact_points = contact_points_response.json()
    except Exception:
        self.logger.exception("Failed to get contact points")
        raise

    # check if webhook already exists
    existing_webhooks = [
        contact_point
        for contact_point in all_contact_points
        if contact_point.get("name") == webhook_name
        or contact_point.get("uid") == webhook_name
    ]

    # grafana version lesser then 9.4.7 do not send their authentication correctly
    # therefor we need to add the api_key as a query param instead of the normal digest token
    self.logger.info("Getting Grafana version")
    try:
        grafana_version = self._get_grafana_version()
    except Exception:
        self.logger.exception("Failed to get Grafana version")
        raise
    self.logger.info(f"Grafana version is {grafana_version}")

    try:
        supports_auth_settings = Version(grafana_version) > Version("9.4.7")
    except ValueError:
        # The version lookup can return a non-version marker such as
        # "unknown"; packaging raises InvalidVersion (a ValueError subclass)
        # for it. Fall back to the query-parameter delivery instead of
        # aborting the whole installation.
        self.logger.warning(
            "Could not parse Grafana version, falling back to api_key query param",
            extra={"grafana_version": grafana_version},
        )
        supports_auth_settings = False

    if supports_auth_settings:
        # if grafana version is greater then 9.4.7 we can use the digest token
        self.logger.info("Installing Grafana version > 9.4.7")
        auth_settings = {
            "url": keep_api_url,
            "authorization_scheme": "digest",
            "authorization_credentials": api_key,
        }
    else:
        # if grafana version is lesser then 9.4.7 we need to add the api_key as a query param
        self.logger.info("Installing Grafana version < 9.4.7")
        # Use the separator that matches the URL: "&" when it already has a
        # query string, "?" otherwise (the update and create paths used to
        # hard-code different separators).
        separator = "&" if "?" in keep_api_url else "?"
        auth_settings = {"url": f"{keep_api_url}{separator}api_key={api_key}"}

    if existing_webhooks:
        webhook = existing_webhooks[0]
        webhook.setdefault("settings", {}).update(auth_settings)
        update_response = requests.put(
            f'{contacts_api}/{webhook["uid"]}',
            verify=False,
            json=webhook,
            headers=headers,
        )
        if update_response.ok:
            self.logger.info(f'Updated webhook {webhook["uid"]}')
        else:
            # Previously a rejected update was still logged as a success.
            self.logger.warning(
                f'Failed to update webhook {webhook["uid"]}',
                extra={
                    "status_code": update_response.status_code,
                    "response": update_response.text[:500],
                },
            )
    else:
        self.logger.info(f'Creating webhook with name "{webhook_name}"')
        webhook = {
            "name": webhook_name,
            "type": "webhook",
            "settings": {"httpMethod": "POST", **auth_settings},
        }
        response = requests.post(
            contacts_api,
            verify=False,
            json=webhook,
            headers=provisioning_headers,
        )
        if not response.ok:
            raise Exception(response.json())
        self.logger.info(f"Created webhook {webhook_name}")

    # Finally, we need to update the policies to match the webhook
    if setup_alerts:
        self.logger.info("Setting up alerts")
        policies_api = (
            f"{self.authentication_config.host}/api/v1/provisioning/policies"
        )
        all_policies = requests.get(
            policies_api, verify=False, headers=headers
        ).json()
        routes = all_policies.get("routes") or []
        policy_exists = any(
            route.get("receiver") == webhook_name for route in routes
        )
        if not policy_exists:
            # From here on the policy tree is modified, so make sure the
            # "routes" key exists and points at the list being appended to.
            all_policies["routes"] = routes
            default_receiver = all_policies.get("receiver")
            if default_receiver:
                default_policy = {
                    "receiver": default_receiver,
                    "continue": True,
                }
                if default_policy not in routes:
                    # This is so we won't override the default receiver if customer has one.
                    routes.append(default_policy)
            routes.append(
                {
                    "receiver": webhook_name,
                    "continue": True,
                }
            )
            policies_response = requests.put(
                policies_api,
                verify=False,
                json=all_policies,
                headers=provisioning_headers,
            )
            if policies_response.ok:
                self.logger.info("Updated policices to match alerts to webhook")
            else:
                self.logger.warning(
                    "Failed to update policies to match alerts to webhook",
                    extra={
                        "status_code": policies_response.status_code,
                        "response": policies_response.text[:500],
                    },
                )
        else:
            self.logger.info("Policies already match alerts to webhook")

    # After setting up unified alerting, check and setup legacy alerting if enabled
    try:
        self.logger.info("Checking legacy alerting")
        if self._is_legacy_alerting_enabled():
            self.logger.info("Legacy alerting is enabled")
            self._setup_legacy_alerting_webhook(
                webhook_name, keep_api_url, api_key, setup_alerts
            )
            self.logger.info("Legacy alerting setup successful")
    except Exception:
        self.logger.warning(
            "Failed to check or setup legacy alerting", exc_info=True
        )

    self.logger.info("Webhook successfuly setup")
def _update_dashboard_alert(
    self, dashboard_uid: str, panel_id: int, notification_uid: str, headers: dict
) -> bool:
    """
    Attach a legacy notification channel to one dashboard panel alert.

    Fetches the dashboard by uid, finds panels whose ``id`` equals
    ``panel_id`` and that carry an ``alert`` definition, and appends
    ``{"uid": notification_uid}`` to the alert's ``notifications`` unless it
    is already listed. The dashboard is saved back only when something
    changed.

    Args:
        dashboard_uid: UID of the dashboard that owns the alert.
        panel_id: ``id`` of the panel that owns the alert.
        notification_uid: UID of the notification channel to attach.
        headers: HTTP headers (authorization) for the Grafana API calls.

    Returns:
        True when the dashboard was modified and saved; False when nothing
        had to change, the response had no dashboard, or a request failed.
    """
    try:
        # Get the dashboard
        dashboard_api = (
            f"{self.authentication_config.host}/api/dashboards/uid/{dashboard_uid}"
        )
        dashboard_response = requests.get(
            dashboard_api, verify=False, headers=headers, timeout=30
        )
        dashboard_response.raise_for_status()
        payload = dashboard_response.json()
        dashboard = payload.get("dashboard") if isinstance(payload, dict) else None
        if not dashboard:
            # A missing "dashboard" key used to raise KeyError, which is not
            # a RequestException and therefore escaped this helper.
            self.logger.warning(
                f"Failed to update dashboard {dashboard_uid}",
                extra={"error": "dashboard missing from response"},
            )
            return False

        # Row panels can carry their child panels under their own "panels"
        # key, so look one level down as well as at the top level.
        candidate_panels = []
        for panel in dashboard.get("panels", []):
            candidate_panels.append(panel)
            candidate_panels.extend(panel.get("panels") or [])

        updated = False
        # Find the panel and update its alert
        for panel in candidate_panels:
            if panel.get("id") != panel_id or "alert" not in panel:
                continue
            notifications = panel["alert"].setdefault("notifications", [])
            # Check if notification already exists
            if not any(
                notif.get("uid") == notification_uid for notif in notifications
            ):
                notifications.append({"uid": notification_uid})
                updated = True

        if not updated:
            return False

        save_payload = {"dashboard": dashboard, "overwrite": True}
        # NOTE(review): the save endpoint takes the target folder from the
        # request body; the folder reported by the GET call is echoed back so
        # the dashboard is saved where it already lives - confirm against the
        # Grafana dashboard HTTP API for the deployed version.
        meta = payload.get("meta") or {}
        if meta.get("folderId") is not None:
            save_payload["folderId"] = meta["folderId"]
        if meta.get("folderUid"):
            save_payload["folderUid"] = meta["folderUid"]

        # Update the dashboard
        update_dashboard_api = f"{self.authentication_config.host}/api/dashboards/db"
        update_response = requests.post(
            update_dashboard_api,
            verify=False,
            json=save_payload,
            headers=headers,
            timeout=30,
        )
        update_response.raise_for_status()
        return True

    except requests.exceptions.RequestException as e:
        self.logger.warning(
            f"Failed to update dashboard {dashboard_uid}", extra={"error": str(e)}
        )
        return False
def __extract_rules(self, alerts: dict, source: list) -> list[AlertDto]:
    """
    Convert a rules payload (``data.groups[].rules[].alerts[]``) into
    ``AlertDto`` objects.

    At most one alert is emitted per rule: the identifier is the rule's
    ``id`` (or its name, lower-cased with spaces replaced by underscores) and
    later alerts with an identifier already seen are skipped.

    Label and annotation keys are lower-cased and passed to ``AlertDto`` as
    extra fields. Explicit fields (id, name, description, status,
    lastReceived, source) take precedence over them, and labels take
    precedence over annotations.

    Args:
        alerts: Parsed response containing ``data.groups``.
        source: Value passed as ``source`` to every ``AlertDto``.

    Returns:
        The alerts that could be parsed; alerts that fail are logged and
        skipped.
    """
    alert_ids = []
    alert_dtos = []
    for group in alerts.get("data", {}).get("groups", []):
        for rule in group.get("rules", []):
            for alert in rule.get("alerts", []):
                alert_id = rule.get(
                    "id", (rule.get("name") or "").replace(" ", "_").lower()
                )
                if alert_id in alert_ids:
                    # de duplicate alerts
                    continue

                # Read the annotations without popping from the caller's
                # payload (the previous implementation removed
                # "description" from the input dictionary).
                raw_annotations = alert.get("annotations", {})
                description = raw_annotations.get(
                    "description"
                ) or raw_annotations.get("summary", rule.get("name"))

                labels = {k.lower(): v for k, v in alert.get("labels", {}).items()}
                annotations = {k.lower(): v for k, v in raw_annotations.items()}
                try:
                    status = GrafanaProvider.STATUS_MAP.get(
                        alert.get("state", rule.get("state")), AlertStatus.FIRING
                    )
                    # Merge into one mapping so a label or annotation that
                    # shares a key with another one, or with an explicit
                    # field (e.g. a "name" label), no longer raises
                    # "multiple values for keyword argument" and silently
                    # drops the alert.
                    dto_fields = {
                        **annotations,
                        **labels,
                        "id": alert_id,
                        "name": rule.get("name"),
                        "description": description,
                        "status": status,
                        "lastReceived": alert.get("activeAt"),
                        "source": source,
                    }
                    alert_dto = AlertDto(**dto_fields)
                    alert_ids.append(alert_id)
                    alert_dtos.append(alert_dto)
                except Exception:
                    self.logger.warning(
                        "Failed to parse alert",
                        extra={
                            "alert_id": alert_id,
                            "alert_name": rule.get("name"),
                        },
                        exc_info=True,
                    )
                    continue
    return alert_dtos
""" self.logger.info("Starting to fetch alerts from Grafana datasources") headers = {"Authorization": f"Bearer {self.authentication_config.token}"} all_alerts = [] # Step 1: Get all datasources try: self.logger.info("Fetching list of datasources") datasources_url = f"{self.authentication_config.host}/api/datasources" datasources_resp = requests.get( datasources_url, headers=headers, timeout=5, verify=False ) if datasources_resp.status_code != 200: self.logger.error( f"Failed to get datasources: {datasources_resp.status_code}", extra={"response_text": datasources_resp.text[:500]}, ) return [] self.logger.info( f"Successfully fetched datasources, got {len(datasources_resp.json())} datasources" ) except Exception as e: self.logger.error(f"Error fetching datasources list: {str(e)}") return [] # Step 2: Extract relevant datasources (Prometheus, Loki, Mimir) alert_datasources = [] try: for ds in datasources_resp.json(): if ( ds.get("type") in ["prometheus", "loki"] or "mimir" in ds.get("name", "").lower() ): alert_datasources.append( { "uid": ds.get("uid"), "name": ds.get("name"), "type": ds.get("type"), } ) self.logger.info( f"Found {len(alert_datasources)} alert-capable datasources" ) except Exception as e: self.logger.error(f"Error parsing datasources: {str(e)}") return [] # Step 3: Query alerts from each datasource for ds in alert_datasources: try: # Log the datasource we're about to query self.logger.info( f"Querying alerts for datasource: {ds.get('name')}", extra={"datasource": ds}, ) # Different endpoint based on datasource type if ds.get("type") == "loki": # For Loki, use the Prometheus-compatible alerts endpoint alert_url = f"{self.authentication_config.host}/api/datasources/proxy/uid/{ds.get('uid')}/prometheus/api/v1/alerts" else: # For Prometheus/Mimir, use the standard alerts endpoint alert_url = f"{self.authentication_config.host}/api/datasources/proxy/uid/{ds.get('uid')}/api/v1/alerts" # Query the alerts endpoint self.logger.info(f"Querying {ds.get('name')} 
alerts at: {alert_url}") resp = requests.get(alert_url, headers=headers, timeout=8, verify=False) if resp.status_code == 200: data = resp.json() if data.get("status") == "success" and "alerts" in data.get( "data", {} ): ds_alerts = data["data"]["alerts"] if ds_alerts: # Only process non-empty alert lists self.logger.info( f"Found {len(ds_alerts)} alerts in {ds.get('name')}" ) for alert in ds_alerts: # Tag with source name and type alert["datasource"] = ds.get("name") alert["datasource_type"] = ds.get("type") all_alerts.extend(ds_alerts) else: self.logger.info(f"No alerts found for {ds.get('name')}") else: self.logger.info( f"No alerts data found in response from {ds.get('name')}", extra={ "status": data.get("status"), "has_data": "data" in data, "has_alerts": "data" in data and "alerts" in data.get("data", {}), }, ) else: self.logger.warning( f"Failed to get alerts for {ds.get('name')}: {resp.status_code}", extra={"response": resp.text[:500]}, # Limit response log size ) except Exception as e: self.logger.error( f"Error querying alerts for {ds.get('name')}: {str(e)}", exc_info=True, ) # Continue to the next datasource continue # Step 4: Process and format the alerts formatted_alerts = [] for alert in all_alerts: try: # Format the alert using the existing method alertname = alert.get( "name", alert.get("alertname", alert.get("labels", {}).get("alertname")), ) if not alertname: logger.warning( "Alert name not found, using default", extra={ "alert": alert, }, ) alertname = "Grafana Alert [Unknown]" severity = alert.get( "severity", alert.get("labels", {}).get("severity") ) if not severity: logger.warning( "Alert severity not found, using default", extra={ "alert": alert, }, ) severity = "info" severity = GrafanaProvider.SEVERITIES_MAP.get( severity, AlertSeverity.INFO ) status = alert.get("state") if not status: logger.warning( "Alert status not found, using default", extra={ "alert": alert, }, ) status = "firing" status = GrafanaProvider.STATUS_MAP.get(status, 
AlertStatus.FIRING) labels = alert.get("labels", {}) # pop severity from labels to avoid duplication labels.pop("severity", None) annotations = alert.get("annotations", {}) description = annotations.get("description", annotations.get("summary")) try: alert_dto = AlertDto( name=alertname, status=status, severity=severity, source=["grafana"], labels=labels, annotations=annotations, datasource=alert.get("datasource") or "", datasource_type=alert.get("datasource_type"), value=str(alert.get("value") or ""), # Always set these so workflow templates can reference # them safely regardless of which alert path fired. panelUrl="", dashboardUrl="", silenceURL="", valueString="", ) if description: alert_dto.description = description formatted_alerts.append(alert_dto) except Exception: self.logger.exception( "Failed to format datasoruce alert", extra={ "alert": alert, }, ) continue except Exception as e: self.logger.error( f"Error formatting alert: {str(e)}", extra={"alert": alert} ) self.logger.info( f"Total alerts found across all datasources: {len(formatted_alerts)}" ) return formatted_alerts def _get_alerts(self) -> list[AlertDto]: self.logger.info("Starting to fetch alerts from Grafana") # First get alerts from datasources directly datasource_alerts = self._get_alerts_datasource() self.logger.info(f"Found {len(datasource_alerts)} alerts from datasources") # Get Grafana version to determine best approach for history API grafana_version = self._get_grafana_version() self.logger.info(f"Detected Grafana version: {grafana_version}") history_alerts = [] # Calculate time range (7 days ago to now) week_ago = int( (datetime.datetime.now() - datetime.timedelta(days=7)).timestamp() ) now = int(datetime.datetime.now().timestamp()) self.logger.info( f"Using time range for alerts: from={week_ago} to={now}", extra={"from_timestamp": week_ago, "to_timestamp": now}, ) headers = {"Authorization": f"Bearer {self.authentication_config.token}"} # First try the general history API (works in 
older Grafana versions) try: api_endpoint = f"{self.authentication_config.host}/api/v1/rules/history?from={week_ago}&to={now}&limit=0" self.logger.info(f"Querying Grafana history API endpoint: {api_endpoint}") response = requests.get( api_endpoint, verify=False, headers=headers, timeout=5 ) self.logger.info( f"Received response from Grafana history API with status code: {response.status_code}" ) if response.ok: # Process the response events_history = response.json() events_data = events_history.get("data", {}) if events_data and "values" in events_data: events_data_values = events_data.get("values") if events_data_values and len(events_data_values) >= 2: # If we have values, extract the events and timestamps events = events_data_values[1] events_time = events_data_values[0] self.logger.info(f"Found {len(events)} events in history API") for i in range(0, len(events)): event = events[i] try: event_labels = event.get("labels", {}) alert_name = event_labels.get("alertname") alert_status = event_labels.get( "alertstate", event.get("current") ) # Map status to Keep format alert_status = GrafanaProvider.STATUS_MAP.get( alert_status, AlertStatus.FIRING ) # Extract other fields alert_severity = event_labels.get("severity") alert_severity = GrafanaProvider.SEVERITIES_MAP.get( alert_severity, AlertSeverity.INFO ) environment = event_labels.get("environment", "unknown") fingerprint = event_labels.get("fingerprint") description = event.get("error", "") rule_id = event.get("ruleUID") condition = event.get("condition") # Convert timestamp timestamp = datetime.datetime.fromtimestamp( events_time[i] / 1000 ).isoformat() # Create AlertDto alert_dto = AlertDto( id=str(i), fingerprint=fingerprint, name=alert_name, status=alert_status, severity=alert_severity, environment=environment, description=description, lastReceived=timestamp, rule_id=rule_id, condition=condition, labels=event_labels, source=["grafana"], ) history_alerts.append(alert_dto) except Exception as e: self.logger.error( 
f"Error processing event {i+1}", extra={"event": event, "error": str(e)}, ) self.logger.info( f"Successfully processed {len(history_alerts)} alerts from Grafana history API" ) else: # If general API fails with 'ruleUID is required' error in newer Grafana versions if "ruleUID is required" in response.text: self.logger.info( "Grafana version requires ruleUID parameter, trying per-rule approach" ) # Get all rules first rules_endpoint = ( f"{self.authentication_config.host}/api/alerting/rules" ) self.logger.info(f"Fetching alert rules from: {rules_endpoint}") rules_response = requests.get( rules_endpoint, verify=False, headers=headers, timeout=5 ) if rules_response.ok: rules_data = rules_response.json() rule_uids = [] # Extract all rule UIDs for group in rules_data.get("data", {}).get("groups", []): for rule in group.get("rules", []): if "uid" in rule: rule_uids.append(rule["uid"]) self.logger.info(f"Found {len(rule_uids)} rule UIDs") # For each rule UID, get its history for rule_uid in rule_uids: rule_history_url = f"{self.authentication_config.host}/api/v1/rules/history?from={week_ago}&to={now}&limit=100&ruleUID={rule_uid}" try: rule_resp = requests.get( rule_history_url, verify=False, headers=headers, timeout=5, ) if rule_resp.ok: rule_history = rule_resp.json() rule_data = rule_history.get("data", {}) if rule_data and "values" in rule_data: rule_values = rule_data.get("values") if rule_values and len(rule_values) >= 2: rule_events = rule_values[1] rule_times = rule_values[0] self.logger.info( f"Found {len(rule_events)} events for rule {rule_uid}" ) for i in range(0, len(rule_events)): event = rule_events[i] try: event_labels = event.get( "labels", {} ) alert_name = event_labels.get( "alertname", f"Rule {rule_uid}" ) alert_status = event_labels.get( "alertstate", event.get("current"), ) alert_status = ( GrafanaProvider.STATUS_MAP.get( alert_status, AlertStatus.FIRING, ) ) alert_severity = event_labels.get( "severity" ) alert_severity = 
GrafanaProvider.SEVERITIES_MAP.get( alert_severity, AlertSeverity.INFO, ) environment = event_labels.get( "environment", "unknown" ) fingerprint = event_labels.get( "fingerprint", rule_uid ) description = event.get("error", "") condition = event.get("condition") # Convert timestamp timestamp = ( datetime.datetime.fromtimestamp( rule_times[i] / 1000 ).isoformat() ) alert_dto = AlertDto( id=f"{rule_uid}_{i}", fingerprint=fingerprint, name=alert_name, status=alert_status, severity=alert_severity, environment=environment, description=description, lastReceived=timestamp, rule_id=rule_uid, condition=condition, labels=event_labels, source=["grafana"], ) history_alerts.append(alert_dto) except Exception as e: self.logger.error( f"Error processing event for rule {rule_uid}", extra={ "event": event, "error": str(e), }, ) except Exception as e: self.logger.error( f"Error processing history for rule {rule_uid}", extra={"error": str(e)}, ) # if response is 404, it means the API is not available elif rules_response.status_code == 404: # if legacy alerting is not enabled, we can assume the API is not available self.logger.error("Grafana history API not available") else: self.logger.error( "Failed to get alerts from Grafana history API", extra={ "status_code": response.status_code, "response_text": response.text, "api_endpoint": api_endpoint, }, ) self.logger.info( f"Processed {len(history_alerts)} alerts from per-rule history API" ) else: self.logger.error( "Failed to get alerts from Grafana history API", extra={ "status_code": response.status_code, "response_text": response.text, "api_endpoint": api_endpoint, }, ) except Exception as e: self.logger.error( "Error querying Grafana history API", extra={"error": str(e)} ) # Also try to get alerts from Alertmanager alertmanager_alerts = [] try: alertmanager_url = f"{self.authentication_config.host}/api/alertmanager/grafana/api/v2/alerts" self.logger.info(f"Querying Alertmanager at: {alertmanager_url}") am_resp = requests.get( 
@classmethod
def simulate_alert(cls, **kwargs) -> dict:
    """
    Build a randomized mock Grafana alert event from the ``ALERTS`` templates.

    Keyword Args:
        alert_type: Key of the template in ``ALERTS``; a random one is chosen
            when missing or falsy.
        to_wrap_with_provider_type: When truthy, the event is wrapped as
            ``{"keep_source_type": "grafana", "event": <event>}``.

    Returns:
        A dict with ``alerts`` (a single alert), ``severity`` (taken from the
        alert's ``labels``) and ``title`` (the alert type), optionally
        wrapped as described above.
    """
    import copy
    import hashlib
    import json
    import random

    from keep.providers.grafana_provider.alerts_mock import ALERTS

    alert_type = kwargs.get("alert_type")
    if not alert_type:
        alert_type = random.choice(list(ALERTS.keys()))

    to_wrap_with_provider_type = kwargs.get("to_wrap_with_provider_type")

    template = ALERTS[alert_type]
    if "payload" in template:
        template_payload = template["payload"]
    else:
        template_payload = template["alerts"][0]
    # Work on a private copy: the code below writes into the payload, and
    # writing into the shared module-level template would leak rendered
    # placeholders, state, evalMatches and the fingerprint into every later
    # simulation of the same alert type.
    alert_payload = copy.deepcopy(template_payload)

    alert_parameters = template.get("parameters", {})
    alert_renders = template.get("renders", {})

    # Generate random data for parameters
    for parameter, parameter_options in alert_parameters.items():
        if "." in parameter:
            # "outer.inner" addresses a key inside a nested dictionary
            parameter = parameter.split(".")
            if parameter[0] not in alert_payload:
                alert_payload[parameter[0]] = {}
            alert_payload[parameter[0]][parameter[1]] = random.choice(
                parameter_options
            )
        else:
            alert_payload[parameter] = random.choice(parameter_options)

    # Apply renders
    for param, choices in alert_renders.items():
        # replace annotations
        # HACK
        param_to_replace = "{{ " + param + " }}"
        alert_payload["annotations"]["summary"] = alert_payload["annotations"][
            "summary"
        ].replace(param_to_replace, random.choice(choices))

    # Implement specific Grafana alert structure here
    # For example:
    alert_payload["state"] = AlertStatus.FIRING.value
    alert_payload["evalMatches"] = [
        {
            "value": random.randint(0, 100),
            "metric": "some_metric",
            "tags": alert_payload.get("labels", {}),
        }
    ]

    # Generate fingerprint
    fingerprint_src = json.dumps(alert_payload, sort_keys=True)
    fingerprint = hashlib.md5(fingerprint_src.encode()).hexdigest()
    alert_payload["fingerprint"] = fingerprint

    final_payload = {
        "alerts": [alert_payload],
        "severity": alert_payload.get("labels", {}).get("severity"),
        "title": alert_type,
    }
    if to_wrap_with_provider_type:
        return {"keep_source_type": "grafana", "event": final_payload}
    return final_payload
f"Bearer {self.authentication_config.token}", "Content-Type": "application/json", } json_data = { "queries": [ { "format": "table", "refId": "traces_service_graph_request_total", "expr": "sum by (client, server) (rate(traces_service_graph_request_total[3600s]))", "instant": True, "exemplar": False, "requestId": "service_map_request", "utcOffsetSec": 19800, "interval": "", "legendFormat": "", "datasource": { "uid": self.authentication_config.datasource_uid, }, "datasourceId": 1, "intervalMs": 5000, "maxDataPoints": 954, }, { "format": "table", "refId": "traces_service_graph_request_server_seconds_sum", "expr": "sum by (client, server) (rate(traces_service_graph_request_server_seconds_sum[3600s]))", "instant": True, "exemplar": False, "requestId": "service_map_request_avg", "utcOffsetSec": 19800, "interval": "", "legendFormat": "", "datasource": { "uid": self.authentication_config.datasource_uid, }, "datasourceId": 1, "intervalMs": 5000, "maxDataPoints": 954, }, ], "to": "now", } try: response = requests.post( f"{self.authentication_config.host}/api/ds/query", verify=False, headers=headers, json=json_data, timeout=10, ) if response.status_code != 200: raise Exception(response.text) return response.json() except Exception as e: self.logger.error( "Error while querying datasource for topology map", extra={"exception": str(e)}, ) @staticmethod def __extract_schema_value_pair(results, query: str): client_server_data = {} for frames in results.get(query, {}).get("frames", []): value_index = 0 for fields in frames.get("schema", {}).get("fields", []): if ( "labels" in fields and "client" in fields["labels"] and "server" in fields["labels"] ): client_server_data[ (fields["labels"]["client"], fields["labels"]["server"]) ] = float(frames["data"]["values"][value_index][0]) break value_index += 1 return client_server_data def pull_topology(self): self.logger.info("Pulling Topology data from Grafana...") if not self.authentication_config.datasource_uid: self.logger.debug("No 
datasource uid found, skipping topology pull") return [], {} try: service_topology = {} results = self.query_datasource_for_topology().get("results", {}) self.logger.info( "Scraping traces_service_graph_request_total data from the response" ) requests_per_second_data = GrafanaProvider.__extract_schema_value_pair( results=results, query="traces_service_graph_request_total" ) self.logger.info( "Scraping traces_service_graph_request_server_seconds_sum data from the response" ) total_response_times_data = GrafanaProvider.__extract_schema_value_pair( results=results, query="traces_service_graph_request_server_seconds_sum" ) self.logger.info("Building Topology map.") for client_server in requests_per_second_data: client, server = client_server requests_per_second = requests_per_second_data[client_server] total_response_time = total_response_times_data.get(client_server, None) if client not in service_topology: service_topology[client] = TopologyServiceInDto( source_provider_id=self.provider_id, service=client, display_name=client, ) if server not in service_topology: service_topology[server] = TopologyServiceInDto( source_provider_id=self.provider_id, service=server, display_name=server, ) service_topology[client].dependencies[server] = ( "unknown" if total_response_time is None else f"{round(requests_per_second, 2)}r/sec || {round((total_response_time / requests_per_second) * 1000, 2)}ms/r" ) self.logger.info("Successfully pulled Topology data from Grafana...") return list(service_topology.values()), {} except Exception as e: self.logger.error( "Error while pulling topology data from Grafana", extra={"exception": str(e)}, ) raise e if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) # Load environment variables import os host = os.environ.get("GRAFANA_HOST") token = os.environ.get("GRAFANA_TOKEN") context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) 
config = { "authentication": {"host": host, "token": token}, } provider = ProvidersFactory.get_provider( context_manager, provider_id="grafana-keephq", provider_type="grafana", provider_config=config, ) version = provider.get_provider_metadata() alerts = provider.get_alerts() alerts = provider.setup_webhook( "test", "http://localhost:3000/alerts/event/grafana", "some-api-key", True ) print(alerts) ================================================ FILE: keep/providers/grafana_provider/prometheus/prometheus.yml ================================================ global: scrape_interval: 15s evaluation_interval: 15s scrape_configs: - job_name: "node" static_configs: - targets: - "node-exporter-1:9100" - "node-exporter-2:9100" relabel_configs: - source_labels: [__address__] target_label: instance regex: "(.*):.*" replacement: "${1}" ================================================ FILE: keep/providers/graylog_provider/README.md ================================================ # Instructions for a quick setup ## Setting up Graylog (v6) ### Installation 1. Spin up Graylog, [docs](https://go2docs.graylog.org/6-0/downloading_and_installing_graylog/docker_installation.htm) ```bash cd keep/providers/graylog_provider docker compose up ``` 2. Once the containers are up and running, go to [http://localhost:9000](http://localhost:9000) and sign in with username `admin` & password `admin`. ### Getting Access Token 1. Navigate to System > Users and Teams to view the Users Overview page. 2. For the user `Admin`, select Edit tokens from the More drop-down menu. 3. Enter a token name, then click Create Token. 
### Setting up Inputs and Event Definition ```python import requests auth = ("YOUR_ACCESS_TOKEN", "token") # from the previous step headers = { "Accept": "application/json", "X-Requested-By": "Keep", "Content-Type": "application/json", } input_data = { 'type': 'org.graylog2.inputs.raw.tcp.RawTCPInput', 'configuration': { 'bind_address': '0.0.0.0', 'port': 5044, 'recv_buffer_size': 1048576, 'number_worker_threads': 3, 'tls_cert_file': '', 'tls_key_file': '', 'tls_enable': False, 'tls_key_password': '', 'tls_client_auth': 'disabled', 'tls_client_auth_cert_file': '', 'tcp_keepalive': False, 'use_null_delimiter': False, 'max_message_size': 2097152, 'override_source': None, 'charset_name': 'UTF-8', }, 'title': 'Keep-Input', 'global': True, } input_response = requests.post( url="http://127.0.0.1:9000/api/system/inputs", headers=headers, json=input_data, auth=auth, ) print(input_response.text) event_data = { 'title': 'Keep-Event', 'description': 'This is an event for Keep', 'priority': 3, 'config': { 'query': 'source:*', 'query_parameters': [], 'streams': [], 'filters': [], 'search_within_ms': 86400000, 'execute_every_ms': 60000, 'event_limit': 100, 'group_by': [], 'series': [], 'conditions': {}, 'type': 'aggregation-v1', }, 'field_spec': {}, 'key_spec': [], 'notification_settings': { 'grace_period_ms': 300000, 'backlog_size': None, }, 'notifications': [], 'alert': True, } event_response = requests.post( url="http://127.0.0.1:9000/api/events/definitions", headers=headers, json=event_data, auth=auth, ) print(event_response.text) ``` ### Sending a log 1. After that you can send a plain text message to the Graylog raw/plaintext TCP input running on port 5044 using the following command: ```bash echo 'First log message' | nc localhost 5044 # @tb: it used to be 5555 but what worked for me was 5044 ``` ## Setup Keep to receive from Graylog --- ### **Note** 1. Run without `NGROK` 2. 
After Step 2, do this: - Go to Alerts > Notifications - Click the `title` of the newly created notification > `Edit Notification` > Replace `0.0.0.0` with your IP address > Click `Add to URL whitelist` > Fill in the `Title` > `Update Configuration` > `Update Notification`
"backlog": [], } ================================================ FILE: keep/providers/graylog_provider/docker-compose-v4.yml ================================================ version: '3' services: # MongoDB: https://hub.docker.com/_/mongo/ mongo: image: mongo:4.2 networks: - graylog # Elasticsearch: https://www.elastic.co/guide/en/elasticsearch/reference/7.10/docker.html elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2 environment: - http.host=0.0.0.0 - transport.host=localhost - network.host=0.0.0.0 - "ES_JAVA_OPTS=-Xms512m -Xmx512m" ulimits: memlock: soft: -1 hard: -1 deploy: resources: limits: memory: 1g networks: - graylog ports: - "9200:9200" - "9300:9300" # Graylog: https://hub.docker.com/r/graylog/graylog/ graylog: image: graylog/graylog:4.0 environment: - GRAYLOG_PASSWORD_SECRET=somepasswordpepper - GRAYLOG_ROOT_PASSWORD_SHA2=8c6976e5b5410415bde908bd4dee15dfb167a9c873fc4bb8a81f6f2ab448a918 - GRAYLOG_HTTP_EXTERNAL_URI=http://127.0.0.1:9000/ - GRAYLOG_ELASTICSEARCH_HOSTS=http://elasticsearch:9200 entrypoint: /usr/bin/tini -- wait-for-it elasticsearch:9200 -t 60 -- /docker-entrypoint.sh networks: - graylog restart: always depends_on: - mongo - elasticsearch ports: - "9000:9000" - "1514:1514" - "1514:1514/udp" - "12201:12201" - "12201:12201/udp" networks: graylog: driver: bridge ================================================ FILE: keep/providers/graylog_provider/docker-compose.yml ================================================ version: '3' services: # MongoDB: https://hub.docker.com/_/mongo/ mongodb: image: "mongo:6.0.18" ports: - "27017:27017" restart: "on-failure" networks: - graylog volumes: - "mongodb_data:/data/db" opensearch: image: "opensearchproject/opensearch:2.15.0" environment: - "OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g" - "bootstrap.memory_lock=true" - "discovery.type=single-node" - "action.auto_create_index=false" - "plugins.security.ssl.http.enabled=false" - "plugins.security.disabled=true" # Can generate a password 
for `OPENSEARCH_INITIAL_ADMIN_PASSWORD` using a linux device via: # tr -dc A-Z-a-z-0-9_@#%^-_=+ < /dev/urandom | head -c${1:-32} - "OPENSEARCH_INITIAL_ADMIN_PASSWORD=+_8r#wliY3Pv5-HMIf4qzXImYzZf-M=M" ulimits: memlock: hard: -1 soft: -1 nofile: soft: 65536 hard: 65536 ports: - "9203:9200" - "9303:9300" restart: "on-failure" networks: - graylog volumes: - "opensearch:/usr/share/opensearch/data" # Graylog: https://hub.docker.com/r/graylog/graylog/ graylog: hostname: "server" image: "graylog/graylog:6.0" # To install Graylog Open: "graylog/graylog:6.0" depends_on: mongodb: condition: "service_started" opensearch: condition: "service_started" entrypoint: "/usr/bin/tini -- wait-for-it opensearch:9200 -- /docker-entrypoint.sh" environment: GRAYLOG_NODE_ID_FILE: "/usr/share/graylog/data/config/node-id" GRAYLOG_HTTP_BIND_ADDRESS: "0.0.0.0:9000" GRAYLOG_ELASTICSEARCH_HOSTS: "http://opensearch:9200" GRAYLOG_MONGODB_URI: "mongodb://mongodb:27017/graylog" # To make reporting (headless_shell) work inside a Docker container GRAYLOG_REPORT_DISABLE_SANDBOX: "true" # CHANGE ME (must be at least 16 characters)! 
@pydantic.dataclasses.dataclass
class GraylogProviderAuthConfig:
    """
    Graylog authentication configuration.

    Each field's ``metadata`` dict carries form information (description, hint,
    required/sensitive flags, validation name); the values here are only
    declared, not interpreted, by this class.
    """

    # Graylog user the access token belongs to; validate_scopes() looks this
    # user up via the `users/<name>` endpoint to check its roles.
    graylog_user_name: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Username",
            "hint": "Your Username associated with the Access Token",
        },
    )
    # Access token; sent as the basic-auth username together with the literal
    # password "token" (see the provider's `_auth` property).
    graylog_access_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Graylog Access Token",
            "hint": "Graylog Access Token ",
            "sensitive": True,
        },
    )
    # Base URL of the Graylog deployment; API URLs are built as `<url>/api/...`.
    deployment_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "Deployment Url",
            "hint": "Example: http://127.0.0.1:9000",
            "validation": "any_http_url",
        },
    )
    # Passed as `verify=` to the provider's requests calls; defaults to True.
    verify: bool = dataclasses.field(
        metadata={
            "description": "Verify SSL certificates",
            "hint": "Set to false to allow self-signed certificates",
            "sensitive": False,
        },
        default=True,
    )
Click `Next` > `Update` event definition """ PROVIDER_DISPLAY_NAME = "Graylog" PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="Mandatory for all operations, ensures the user is authenticated.", mandatory=True, mandatory_for_webhook=True, alias="Rules Reader", ), ProviderScope( name="authorized", description="Mandatory for querying incidents and managing resources, ensures the user has `Admin` privileges.", mandatory=True, mandatory_for_webhook=True, alias="Rules Reader", ), ] PROVIDER_METHODS = [ ProviderMethod( name="Search", func_name="search", scopes=["authorized"], description="Search using elastic query language in Graylog", type="action", ), ] """ Graylog does not behave like Prometheus; it does not resend identical alerts. Once an alert is triggered, it is sent only once. The event_definition_id refers to the notification configuration, not the individual event. Using this as the deduplication key causes all alerts from the same definition to be suppressed—even if triggered on different days. Switching to the id field is preferable, as it uniquely identifies each alert instance. About alerts: https://go2docs.graylog.org/current/interacting_with_your_log_data/alerts.html About event definitions: https://go2docs.graylog.org/current/interacting_with_your_log_data/event_definitions.html """ FINGERPRINT_FIELDS = ["id"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._host = None self.is_v4 = self.__get_graylog_version().startswith("4") def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Graylog provider. 
""" self.logger.debug("Validating configuration for Graylog provider") self.authentication_config = GraylogProviderAuthConfig( **self.config.authentication ) def search( self, query: str, query_type: str, timerange_seconds: int, timerange_type: str, page: int, per_page: int, ): """ Search for logs in Graylog using the specified query. Args: query (str): The query string to search for. query_type (str): The type of query to use. Default is "elastic". timerange_seconds (int): The time range in seconds. Default is 300 seconds. timerange_type (str): The type of time range. Default is "relative". page (int): Page number, starting from 0. per_page (int): Number of results per page. """ self.logger.info(f"Searching in Graylog with query: {query}") # Calculate offset based on page and per_page offset = page * per_page if offset < 0: offset = 0 # Extra protection against negative offsets query_id = str(uuid.uuid4()) search_type_id = str(uuid.uuid4()) search_body = { "parameters": [], "queries": [ { "id": query_id, "query": {"type": query_type, "query_string": query}, "timerange": {"from": timerange_seconds, "type": timerange_type}, "search_types": [ { "timerange": None, "query": None, "streams": [], "type": "messages", "id": search_type_id, "name": None, "limit": per_page, "offset": offset, "sort": [{"field": "timestamp", "order": "DESC"}], "fields": [], "decorators": [], "filter": None, "filters": [], } ], } ], } search_response = requests.post( url=self.__get_url(paths=["views", "search","sync"]), headers=self._headers, auth=self._auth, json=search_body, verify=self.authentication_config.verify, ) search_response.raise_for_status() result = search_response.json() self.logger.info(f"Graylog sync search result: {result}") # Get results from Graylog results = next(iter(result["results"].values())) search_types = results.get("search_types", {}) search = search_types.get(search_type_id) messages = search.get("messages", []) for i, msg in enumerate(messages): 
self.logger.info(f"message[{i}] type: {type(msg)}, content: {msg}") return messages @property def graylog_host(self): self.logger.debug("Fetching Graylog host") if self._host: self.logger.debug("Returning cached Graylog host") return self._host # Handle host determination logic with logging if self.authentication_config.deployment_url.startswith( "http://" ) or self.authentication_config.deployment_url.startswith("https://"): self.logger.info("Using supplied Graylog host with protocol") self._host = self.authentication_config.deployment_url return self._host # Otherwise, attempt to use https try: self.logger.debug( f"Trying HTTPS for {self.authentication_config.deployment_url}" ) requests.get( f"https://{self.authentication_config.deployment_url}", verify=self.authentication_config.verify, ) self.logger.info("HTTPS protocol confirmed") self._host = f"https://{self.authentication_config.deployment_url}" except requests.exceptions.SSLError: self.logger.warning("SSL error encountered, falling back to HTTP") self._host = f"http://{self.authentication_config.deployment_url}" except Exception as e: self.logger.error( "Failed to determine Graylog host", extra={"exception": str(e)} ) self._host = self.authentication_config.deployment_url.rstrip("/") return self._host @property def _headers(self): return { "Accept": "application/json", "X-Requested-By": "Keep", } @property def _auth(self): return self.authentication_config.graylog_access_token, "token" def __get_url(self, paths: List[str] = [], query_params: dict = None, **kwargs): """ Helper method to build the url for Graylog api requests. 
""" host = self.graylog_host.rstrip("/").rstrip() + "/api/" self.logger.info(f"Building URL with host: {host}") url = urljoin( host, "/".join(str(path) for path in paths), ) # add query params if query_params: url = f"{url}?{urlencode(query_params)}" self.logger.debug(f"Constructed URL: {url}") return url def validate_scopes(self) -> dict[str, bool | str]: self.logger.info("Validating user scopes for Graylog provider") required_role = "Admin" try: user_response = requests.get( url=self.__get_url( paths=["users", self.authentication_config.graylog_user_name] ), headers=self._headers, auth=self._auth, verify=self.authentication_config.verify, ) self.logger.debug("User information request sent") if user_response.status_code != 200: raise Exception(user_response.text) authenticated = True user_response = user_response.json() if required_role in user_response["roles"]: self.logger.info("User has required admin privileges") authorized = True else: self.logger.warning("User lacks required admin privileges") authorized = "Missing admin Privileges" except Exception as e: self.logger.error( "Error while validating user scopes", extra={"exception": str(e)} ) authenticated = str(e) authorized = False return { "authenticated": authenticated, "authorized": authorized, } def __get_graylog_version(self) -> str: self.logger.info("Getting graylog version info") try: version_response = requests.get( url=self.__get_url(), headers=self._headers, verify=self.authentication_config.verify, ) if version_response.status_code != 200: raise Exception(version_response.text) version = version_response.json()["version"].strip() self.logger.info(f"We are working with Graylog version: {version}") return version except Exception as e: self.logger.error( "Error while getting Graylog Version", extra={"exception": str(e)} ) def __get_url_whitelist(self): try: self.logger.info("Fetching URL Whitelist") whitelist_response = requests.get( url=self.__get_url(paths=["system/urlwhitelist"]), 
headers=self._headers, auth=self._auth, timeout=10, verify=self.authentication_config.verify, ) if whitelist_response.status_code != 200: raise Exception(whitelist_response.text) self.logger.info("Successfully retrieved URL Whitelist") return whitelist_response.json() except Exception as e: self.logger.error( "Error while fetching URL whitelist", extra={"exception": str(e)} ) raise e def __update_url_whitelist(self, whitelist): try: self.logger.info("Updating URL whitelist") whitelist_response = requests.put( url=self.__get_url(paths=["system/urlwhitelist"]), headers=self._headers, auth=self._auth, json=whitelist, verify=self.authentication_config.verify, ) if whitelist_response.status_code != 204: raise Exception(whitelist_response.text) self.logger.info("Successfully updated URL whitelist") except Exception as e: self.logger.error( "Error while updating URL whitelist", extra={"exception": str(e)} ) raise e def __get_events(self, page: int, per_page: int): self.logger.info( f"Fetching events from Graylog (page: {page}, per_page: {per_page})" ) try: events_response = requests.get( url=self.__get_url(paths=["events", "definitions"]), headers=self._headers, auth=self._auth, params={"page": page, "per_page": per_page}, verify=self.authentication_config.verify, ) if events_response.status_code != 200: raise Exception(events_response.text) events_response = events_response.json() self.logger.info("Successfully fetched events from Graylog") return events_response except Exception as e: self.logger.error( "Error while fetching events", extra={"exception": str(e)} ) raise e def __update_event(self, event): try: self.logger.info(f"Updating event with ID: {event['id']}") event_update_response = requests.put( url=self.__get_url(paths=["events", "definitions", event["id"]]), timeout=10, json=event, auth=self._auth, headers=self._headers, verify=self.authentication_config.verify, ) if event_update_response.status_code != 200: raise Exception(event_update_response.text) 
self.logger.info(f"Successfully updated event with ID: {event['id']}") except Exception as e: self.logger.error( f"Error while updating event with ID: {event['id']}", extra={"exception": str(e)}, ) raise e def __get_notification(self, page: int, per_page: int, notification_name: str): try: self.logger.info(f"Fetching notification: {notification_name}") notifications_response = requests.get( url=self.__get_url(paths=["events", "notifications"]), params={ "page": page, "per_page": per_page, "query": f"title:{notification_name}", }, auth=self._auth, headers=self._headers, timeout=10, verify=self.authentication_config.verify, ) if notifications_response.status_code != 200: raise Exception(notifications_response.text) self.logger.info(f"Successfully fetched notification: {notification_name}") return notifications_response.json() except Exception as e: self.logger.error( f"Error while fetching notification {notification_name}", extra={"exception": str(e)}, ) raise e def __delete_notification(self, notification_id: str): try: self.logger.info( f"Attempting to delete notification with ID: {notification_id}" ) notification_delete_response = requests.delete( url=self.__get_url(paths=["events", "notifications", notification_id]), auth=self._auth, headers=self._headers, verify=self.authentication_config.verify, ) if notification_delete_response.status_code != 204: raise Exception(notification_delete_response.text) self.logger.info( f"Successfully deleted notification with ID: {notification_id}" ) except Exception as e: self.logger.error( f"Error while deleting notification with ID {notification_id}", extra={"exception": str(e)}, ) raise e def __create_notification(self, notification_name: str, notification_body): try: self.logger.info(f"Attempting to create notification: {notification_name}") notification_creation_response = requests.post( url=self.__get_url(paths=["events", "notifications"]), headers=self._headers, auth=self._auth, timeout=10, json=notification_body, 
verify=self.authentication_config.verify, ) if notification_creation_response.status_code != 200: raise Exception(notification_creation_response.text) self.logger.info(f"Successfully created notification: {notification_name}") return notification_creation_response.json() except Exception as e: self.logger.error( f"Error while creating notification {notification_name}", extra={"exception": str(e)}, ) raise e def __update_notification(self, notification_id: str, notification_body): try: self.logger.info( f"Attempting to update notification with ID: {notification_id}" ) notification_update_response = requests.put( url=self.__get_url(paths=["events", "notifications", notification_id]), headers=self._headers, auth=self._auth, timeout=10, json=notification_body, verify=self.authentication_config.verify, ) if notification_update_response.status_code != 200: raise Exception(notification_update_response.text) self.logger.info( f"Successfully updated notification with ID: {notification_id}" ) return notification_update_response.json() except Exception as e: self.logger.error( f"Error while updating notification with ID {notification_id}", extra={"exception": str(e)}, ) raise e def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): self.logger.info("Setting up webhook in Graylog") # Extracting provider_id from the keep_api_url parsed_url = urlparse(keep_api_url) query_params = parsed_url.query provider_id = query_params.split("provider_id=")[-1] notification_name = f"Keep-{provider_id}" if self.is_v4: keep_api_url = f"{keep_api_url}&api_key={api_key}" try: event_definitions = [] events_1 = self.__get_events(page=1, per_page=100) event_definitions.extend(events_1["event_definitions"]) total_pages = math.ceil(int(events_1["total"]) / 100) for page in range(2, total_pages): self.logger.debug(f"Fetching events page: {page}") event_definitions.extend( self.__get_events(page=page, per_page=100)["event_definitions"] ) # Whitelist URL 
url_whitelist = self.__get_url_whitelist() url_found = False for entry in url_whitelist["entries"]: if entry["value"] == keep_api_url: self.logger.info("URL already whitelisted") url_found = True break if not url_found: self.logger.info("Adding URL to whitelist") url_whitelist["entries"].append( { "id": str(uuid.uuid4()), "title": notification_name, "value": keep_api_url, "type": "literal", } ) self.__update_url_whitelist(url_whitelist) # Create notification notification = self.__get_notification( page=1, per_page=1, notification_name=notification_name ) existing_notification_id = None if int(notification["count"]) > 0: self.logger.info("Notification already exists, deleting it") # We need to clean up the previously installed notification existing_notification_id = notification["notifications"][0]["id"] self.__delete_notification(notification_id=existing_notification_id) self.logger.info("Creating new notification") if self.is_v4: config = {"type": "http-notification-v1", "url": keep_api_url} else: config = { "type": "http-notification-v2", "basic_auth": None, "api_key_as_header": False, "api_key": "", "api_secret": None, "url": keep_api_url, "skip_tls_verification": True, "method": "POST", "time_zone": "UTC", "content_type": "JSON", "headers": f"X-API-KEY:{api_key}", "body_template": "", } notification_body = { "title": notification_name, "description": "Hello, this Notification is created by Keep, please do not change the title.", "config": config, } new_notification = self.__create_notification( notification_name=notification_name, notification_body=notification_body ) for event_definition in event_definitions: if ( not self.is_v4 and event_definition["_scope"] == "SYSTEM_NOTIFICATION_EVENT" ): self.logger.info("Skipping SYSTEM_NOTIFICATION_EVENT") continue self.logger.info(f"Updating event with ID: {event_definition['id']}") # Attempting to clean up the deleted notification from the event, it is not handled well in Graylog v4. 
for ind, notification in enumerate(event_definition["notifications"]): if notification["notification_id"] == existing_notification_id: event_definition["notifications"].pop(ind) break event_definition["notifications"].append( {"notification_id": new_notification["id"]} ) self.__update_event(event=event_definition) self.logger.info("Webhook setup completed successfully") except Exception as e: self.logger.error( "Error while setting up webhook", extra={"exception": str(e)} ) raise e @staticmethod def __map_event_to_alert(event: dict) -> AlertDto: alert = AlertDto( id=event["event"]["id"], name=event.get("event_definition_title", event["event"]["message"]), severity=[AlertSeverity.LOW, AlertSeverity.WARNING, AlertSeverity.HIGH][ int(event["event"]["priority"]) - 1 ], description=event.get("event_definition_description", None), event_definition_id=event["event"]["event_definition_id"], origin_context=event["event"].get("origin_context", None), status=AlertStatus.FIRING, lastReceived=datetime.fromisoformat( event["event"]["timestamp"].replace("z", "") ) .replace(tzinfo=timezone.utc) .isoformat(), message=event["event"].get("message", None), source=["graylog"], ) alert.fingerprint = GraylogProvider.get_alert_fingerprint( alert, GraylogProvider.FINGERPRINT_FIELDS ) return alert @staticmethod def _format_alert( event: dict, provider_instance: BaseProvider | None = None ) -> AlertDto: return GraylogProvider.__map_event_to_alert(event=event) @classmethod def simulate_alert(cls) -> dict: import random import string from keep.providers.graylog_provider.alerts_mock import ALERTS # Use the provided ALERTS structure alert_data = ALERTS.copy() # Start with the base event payload simulated_alert = alert_data["event"] alert_data["event_definition_title"] = random.choice( [ "EventDefinition - 1", "EventDefinition - 2", "EventDefinition - 3", ] ) alert_data["event_definition_description"] = random.choice( [ "Description - add", "Description - commit", "Description - push", ] ) # 
Apply variability to the event message and priority simulated_alert["message"] = alert_data["event_definition_title"] simulated_alert["priority"] = random.choice([1, 2, 3]) chars = string.ascii_uppercase + string.digits # Generate a random ID of specified length random_id = "".join(random.choice(chars) for _ in range(25)) simulated_alert["id"] = random_id simulated_alert["event_definition_id"] = alert_data["event_definition_id"] = ( "".join( random.choice(string.ascii_lowercase + string.digits) for _ in range(24) ) ) # Set the current timestamp simulated_alert["timestamp"] = datetime.now().isoformat() # Apply variability to replay_info replay_info = simulated_alert.get("replay_info", {}) replay_info["timerange_start"] = ( datetime.now() - timedelta(hours=1) ).isoformat() replay_info["timerange_end"] = datetime.now().isoformat() simulated_alert["replay_info"] = replay_info return alert_data def __get_alerts(self, json_data: dict): try: self.logger.info( f"Fetching alerts (page: {json_data['page']}, per_page: {json_data['per_page']})" ) alert_response = requests.post( url=self.__get_url(paths=["events", "search"]), headers=self._headers, auth=self._auth, timeout=10, json=json_data, verify=self.authentication_config.verify, ) if alert_response.status_code != 200: raise Exception(alert_response.text) self.logger.info("Successfully fetched alerts") return alert_response.json() except Exception as e: self.logger.error( "Error while fetching alerts", extra={"exception": str(e)} ) raise e def _get_alerts(self) -> list[AlertDto]: self.logger.info("Getting alerts from Graylog") json_data = { "query": "", "page": 1, "per_page": 1000, "filter": { "alerts": "only", }, "timerange": { "range": 1 * 24 * 60 * 60, "type": "relative", }, } all_alerts = [] alerts_1 = self.__get_alerts(json_data=json_data) all_alerts.extend(alerts_1["events"]) total_events = max(10, math.ceil(alerts_1["total_events"] / 1000)) for page in range(2, total_events + 1): self.logger.debug(f"Fetching alerts 
page: {page}") json_data["page"] = page alerts = self.__get_alerts(json_data=json_data) all_alerts.extend(alerts["events"]) self.logger.info("Successfully fetched all alerts") return [ GraylogProvider.__map_event_to_alert(event=event) for event in all_alerts ] def _query(self, events_search_parameters: dict, **kwargs: dict): self.logger.info("Querying Graylog with specified parameters") # If there's a query, use the search method # Handle events_search_parameters to maintain compatibility query = kwargs.get("query") or events_search_parameters.get("query") if query: return self.search( query=query, query_type=kwargs.get("query_type", events_search_parameters.get("query_type", "elastic")), timerange_seconds=kwargs.get("timerange_seconds", events_search_parameters.get("timerange_seconds", 300)), timerange_type=kwargs.get("timerange_type", events_search_parameters.get("timerange_type", "relative")), page=kwargs.get("page", events_search_parameters.get("page", 0)), per_page=kwargs.get("per_page", events_search_parameters.get("per_page", 150)), ) # If no query specified, then run the get_alerts method alerts = self.__get_alerts(json_data=events_search_parameters)["events"] return [GraylogProvider.__map_event_to_alert(event=event) for event in alerts] if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os auth_token = os.environ.get("GRAYLOG_TOKEN") provider_config = { "authentication": { "graylog_access_token": auth_token, "graylog_user_name": "admin", "deployment_url": "http://localhost:9000", }, } provider: GraylogProvider = ProvidersFactory.get_provider( context_manager, provider_id="graylog", provider_type="graylog", provider_config=provider_config, ) logs = provider.search( query="first", timerange_seconds=3600, timerange_type="relative" ) print(logs) 
================================================ FILE: keep/providers/grok_provider/__init__.py ================================================ ================================================ FILE: keep/providers/grok_provider/grok_provider.py ================================================ import json import dataclasses import pydantic import requests from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig @pydantic.dataclasses.dataclass class GrokProviderAuthConfig: api_key: str = dataclasses.field( metadata={ "required": True, "description": "X.AI Grok API Key", "sensitive": True, }, ) class GrokProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Grok" PROVIDER_CATEGORY = ["AI"] API_BASE = "https://api.x.ai/v1" # Example API base URL def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): self.authentication_config = GrokProviderAuthConfig( **self.config.authentication ) def dispose(self): pass def validate_scopes(self) -> dict[str, bool | str]: scopes = {} return scopes def _query( self, prompt, model="grok-1", max_tokens=1024, structured_output_format=None, ): headers = { "Authorization": f"Bearer {self.authentication_config.api_key}", "Content-Type": "application/json" } # Prepare payload with structured output if needed payload = { "model": model, "messages": [{"role": "user", "content": prompt}], "max_tokens": max_tokens, } if structured_output_format: payload["response_format"] = structured_output_format try: response = requests.post( f"{self.API_BASE}/chat/completions", headers=headers, json=payload ) response.raise_for_status() content = response.json()["choices"][0]["message"]["content"] # Try to parse as JSON if structured output was 
requested if structured_output_format: try: content = json.loads(content) except Exception: pass return { "response": content, } except requests.exceptions.RequestException as e: raise ProviderException(f"Error calling Grok API: {str(e)}") if __name__ == "__main__": import os import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) api_key = os.environ.get("GROK_API_KEY") config = ProviderConfig( description="Grok Provider", authentication={ "api_key": api_key, }, ) provider = GrokProvider( context_manager=context_manager, provider_id="grok_provider", config=config, ) print( provider.query( prompt="Here is an alert, define environment for it: Clients are panicking, nothing works.", model="grok-1", structured_output_format={ "type": "json_schema", "json_schema": { "name": "environment_restoration", "schema": { "type": "object", "properties": { "environment": { "type": "string", "enum": ["production", "debug", "pre-prod"], }, }, "required": ["environment"], "additionalProperties": False, }, "strict": True, }, }, max_tokens=100, ) ) ================================================ FILE: keep/providers/http_provider/__init__.py ================================================ ================================================ FILE: keep/providers/http_provider/http_provider.py ================================================ """ HttpProvider is a class that provides a way to send HTTP requests. 
""" import copy import json import typing import requests from requests.exceptions import JSONDecodeError from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class HttpProvider(BaseProvider): """Enrich alerts with data from HTTP.""" BLACKLISTED_ENDPOINTS = [ "metadata.google.internal", "metadata.internal", "169.254.169.254", "localhost", "googleapis.com", ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def __validate_url(self, url: str): """ Validate that the url is not blacklisted. """ for endpoint in HttpProvider.BLACKLISTED_ENDPOINTS: if endpoint in url: raise Exception(f"URL {url} is blacklisted") def dispose(self): """ Nothing to do here. """ pass def validate_config(self): """ No configuration to validate here """ def _notify( self, url: str, method: typing.Literal["GET", "POST", "PUT", "DELETE"], headers: dict = None, body: dict = None, params: dict = None, proxies: dict = None, verify: bool = True, **kwargs, ): """ Send a HTTP request to the given url. """ return self.query( url=url, method=method, headers=headers, body=body, params=params, proxies=proxies, verify=verify, **kwargs, ) def _query( self, url: str, method: typing.Literal["GET", "POST", "PUT", "DELETE"], headers: dict = None, body: dict = None, params: dict = None, proxies: dict = None, fail_on_error: bool = True, verify: bool = True, **kwargs: dict, ) -> dict: """ Send a HTTP request to the given url. 
""" self.__validate_url(url) if headers is None: headers = {} if isinstance(headers, str): headers = json.loads(headers) if body is None: body = {} if params is None: params = {} extra_args = copy.deepcopy(kwargs) # todo: this might be problematic if params/body/headers contain sensitive data # think about changing those debug messages or adding a flag to enable/disable them self.logger.debug( f"Sending {method} request to {url}", extra={ "body": body, "headers": headers, "params": params, }, ) if method == "GET": response = requests.get( url, headers=headers, params=params, proxies=proxies, verify=verify, **extra_args, ) elif method == "POST": response = requests.post( url, headers=headers, json=body, proxies=proxies, verify=verify, **extra_args, ) elif method == "PUT": response = requests.put( url, headers=headers, json=body, proxies=proxies, verify=verify, **extra_args, ) elif method == "DELETE": response = requests.delete( url, headers=headers, json=body, proxies=proxies, verify=verify, **extra_args, ) else: raise Exception(f"Unsupported HTTP method: {method}") self.logger.debug( f"Sent {method} request to {url}", extra={ "body": body, "headers": headers, "params": params, "status_code": response.status_code, }, ) if fail_on_error: self.logger.info( f"HTTP response: {response.status_code} {response.reason}", extra={"body": body}, ) response.raise_for_status() result = {"status": response.ok, "status_code": response.status_code} try: body = response.json() except JSONDecodeError: body = response.text result["body"] = body return result ================================================ FILE: keep/providers/icinga2_provider/icinga2_provider.py ================================================ """ Icinga2 Provider is a class that provides a way to receive alerts from Icinga2 using webhooks. 
""" import dataclasses import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class Icinga2ProviderAuthConfig: """ Allows User Authentication with Icinga2 API. config params: - host_url: Base URL of Icinga2 instance - api_user: Username for API authentication - api_password: Password for API authentication """ host_url: pydantic.AnyHttpUrl = dataclasses.field( metadata={ "required": True, "description": "Icinga2 Host URL", "hint": "e.g. https://icinga2.example.com", "sensitive": False, "validation": "any_http_url", } ) api_user: str = dataclasses.field( metadata={ "required": True, "description": "Icinga2 API User", "sensitive": False, } ) api_password: str = dataclasses.field( metadata={ "required": True, "description": "Icinga2 API Password", "sensitive": True, } ) class Icinga2Provider(BaseProvider): """ Get alerts from Icinga2 into Keep primarily via webhooks. feat: - Fetching alerts from Icinga2 services & hosts - Mapping Icinga2 states to Keep alert status and severity - Formatting alerts according to Keep's alert model - Supporting webhook integration for real-time alerts """ webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from Icinga2 to Keep, configure a new notification command: 1. In Icinga2, create a new notification command 2. Set the webhook URL as: {keep_webhook_api_url} 3. Add header "X-API-KEY" with your Keep API key (webhook role) 4. Configure notification rules to use this command 5. 
For detailed setup instructions, see [Keep documentation](https://docs.keephq.dev/providers/documentation/icinga2-provider) """ PROVIDER_DISPLAY_NAME = "Icinga2" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] WEBHOOK_INSTALLATION_REQUIRED = True PROVIDER_ICON = "icinga2-icon.png" # Define provider scopes PROVIDER_SCOPES = [ ProviderScope( name="read_alerts", description="Read alerts from Icinga2", ), ] # Icinga2 states Mapping to Keep alert states ... STATUS_MAP = { "OK": AlertStatus.RESOLVED, "WARNING": AlertStatus.FIRING, "CRITICAL": AlertStatus.FIRING, "UNKNOWN": AlertStatus.FIRING, "UP": AlertStatus.RESOLVED, "DOWN": AlertStatus.FIRING, } # Mapping Icinga2 states to Keep alert severities SEVERITY_MAP = { "OK": AlertSeverity.INFO, "WARNING": AlertSeverity.WARNING, "CRITICAL": AlertSeverity.CRITICAL, "UNKNOWN": AlertSeverity.INFO, "UP": AlertSeverity.INFO, "DOWN": AlertSeverity.CRITICAL, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose of the provider. """ pass def validate_config(self): """ Validates required configuration for Icinga2 provider. Affirms all required authentication parameters are present. """ self.authentication_config = Icinga2ProviderAuthConfig( **self.config.authentication ) def validate_scopes(self): """ Validate provider scopes by testing API connectivity. Attempts to fetch Icinga2 status to verify credentials. 
""" self.logger.info("Validating Icinga2 provider") try: response = requests.get( url=f"{self.authentication_config.host_url}/v1/status", auth=( self.authentication_config.api_user, self.authentication_config.api_password, ), verify=True, ) if response.status_code != 200: response.raise_for_status() self.logger.info( "Scopes Validation is successful", extra={"response": response.json()} ) return {"read_alerts": True} except Exception as e: self.logger.exception("Failed to validate scopes", extra={"error": e}) return {"read_alerts": str(e)} def _get_alerts(self) -> list[AlertDto]: """ Get alerts from Icinga2 via API. Returns: list[AlertDto]: List of alerts in Keep format """ self.logger.info("Getting alerts from Icinga2") try: response = requests.get( url=f"{self.authentication_config.host_url}/v1/services?attrs=name,display_name,state,last_state_change", auth=( self.authentication_config.api_user, self.authentication_config.api_password, ), verify=True, ) if response.status_code != 200: response.raise_for_status() services = response.json()["results"] return [ AlertDto( id=service.get("name"), name=service.get("display_name"), status=self.STATUS_MAP.get( service.get("state"), AlertStatus.FIRING ), severity=self.SEVERITY_MAP.get( service.get("state"), AlertSeverity.INFO ), timestamp=service.get("last_state_change"), source=["icinga2"], ) for service in services ] except Exception as e: self.logger.exception("Failed to get alerts from Icinga2") raise Exception(f"Failed to get alerts from Icinga2: {str(e)}") @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto | list[AlertDto]: """ Format Icinga2 webhook payload into Keep alert format. 
Args: event (dict): Raw alert data from Icinga2 provider_instance (BaseProvider, optional): Provider instance Returns: AlertDto: Formatted alert in Keep format """ check_result = event.get("check_result", {}) service = event.get("service", {}) host = event.get("host", {}) status = check_result.get("exit_status", 0) state = check_result.get("state", "UNKNOWN") output = check_result.get("output", "No output provided") alert = AlertDto( id=service.get("name") or host.get("name"), name=service.get("display_name") or host.get("display_name"), status=Icinga2Provider.STATUS_MAP.get(state, AlertStatus.FIRING), severity=Icinga2Provider.SEVERITY_MAP.get(state, AlertSeverity.INFO), timestamp=check_result.get("execution_start"), lastReceived=check_result.get("execution_end"), description=output, source=["icinga2"], hostname=host.get("name"), service_name=service.get("name"), check_command=service.get("check_command") or host.get("check_command"), state=state, state_type=check_result.get("state_type"), attempt=check_result.get("attempt"), acknowledgement=service.get("acknowledgement") or host.get("acknowledgement"), downtime_depth=service.get("downtime_depth") or host.get("downtime_depth"), flapping=service.get("flapping") or host.get("flapping"), execution_time=check_result.get("execution_time"), latency=check_result.get("latency"), raw_output=output, exit_status=status, ) return alert if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os icinga2_api_user = os.getenv("ICINGA2_API_USER") icinga2_api_password = os.getenv("ICINGA2_API_PASSWORD") config = ProviderConfig( description="Icinga2 Provider", authentication={ "host_url": "https://icinga2.example.com", "api_user": icinga2_api_user, "api_password": icinga2_api_password, }, ) provider = Icinga2Provider(context_manager, "icinga2", config) 
================================================ FILE: keep/providers/ilert_provider/__init__.py ================================================ ================================================ FILE: keep/providers/ilert_provider/ilert_provider.py ================================================ """ ilert Provider is a class that allows to create/close incidents in ilert. """ import dataclasses import enum import json import os from typing import Literal import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory from keep.validation.fields import HttpsUrl class IlertIncidentStatus(str, enum.Enum): """ ilert incident status. """ INVESTIGATING = "INVESTIGATING" RESOLVED = "RESOLVED" MONITORING = "MONITORING" IDENTIFIED = "IDENTIFIED" @pydantic.dataclasses.dataclass class IlertProviderAuthConfig: """ ilert authentication configuration. 
""" ilert_token: str = dataclasses.field( metadata={ "required": True, "description": "ILert API token", "hint": "Bearer eyJhbGc...", "sensitive": True, } ) ilert_host: HttpsUrl = dataclasses.field( metadata={ "required": False, "description": "ILert API host", "hint": "https://api.ilert.com/api", "validation": "https_url", }, default="https://api.ilert.com/api", ) class IlertProvider(BaseProvider): """Create/Resolve incidents in ilert.""" PROVIDER_DISPLAY_NAME = "ilert" PROVIDER_SCOPES = [ ProviderScope( name="read_permission", description="Read permission", mandatory=True ), ProviderScope( name="write_permission", description="Write permission", mandatory=False ), ] PROVIDER_CATEGORY = ["Incident Management"] SEVERITIES_MAP = { "MAJOR_OUTAGE": AlertSeverity.CRITICAL, "PARTIAL_OUTAGE": AlertSeverity.HIGH, "DEGRADED": AlertSeverity.WARNING, "UNDER_MAINTENANCE": AlertSeverity.INFO, "OPERATIONAL": AlertSeverity.INFO, } STATUS_MAP = { "RESOLVED": AlertStatus.RESOLVED, "INVESTIGATING": AlertStatus.ACKNOWLEDGED, "MONITORING": AlertStatus.ACKNOWLEDGED, "IDENTIFIED": AlertStatus.ACKNOWLEDGED, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for ilert provider. 
""" self.authentication_config = IlertProviderAuthConfig( **self.config.authentication ) def validate_scopes(self): scopes = {} self.logger.info("Validating scopes") for scope in self.PROVIDER_SCOPES: try: if scope.name == "read_permission": res = requests.get( f"{self.authentication_config.ilert_host}/incidents", headers={ "Authorization": self.authentication_config.ilert_token }, ) res.raise_for_status() scopes[scope.name] = True elif scope.name == "write_permission": res = requests.get( f"{self.authentication_config.ilert_host}/users/current", headers={ "Authorization": self.authentication_config.ilert_token }, timeout=10, ) res.raise_for_status() data = res.json() if data["role"] not in ["USER", "ADMIN"]: warning_msg = ( f"User role '{data['role']}' has limited permissions" ) self.logger.warning(warning_msg) scopes[scope.name] = warning_msg else: self.logger.debug( f"Write permission validated successfully for role: {data['role']}" ) scopes[scope.name] = True except Exception as e: self.logger.warning( "Failed to validate scope", extra={"scope": scope.name}, ) scopes[scope.name] = str(e) self.logger.info("Scopes validated", extra=scopes) return scopes def _query(self, incident_id: str, **kwargs): """ Query ilert incident. """ self.logger.info( "Querying ilert incident", extra={ "incident_id": incident_id, **kwargs, }, ) headers = {"Authorization": self.authentication_config.ilert_token} response = requests.get( f"{self.authentication_config.ilert_host}/incidents/{incident_id}", headers=headers, ) if not response.ok: self.logger.error( "Failed to query ilert incident", extra={ "status_code": response.status_code, "response": response.text, }, ) raise Exception( f"Failed to query ilert incident: {response.status_code} {response.text}" ) self.logger.info( "ilert incident queried", extra={"status_code": response.status_code}, ) return response.json() def _get_alerts(self) -> list[AlertDto]: """ Get incidents from ilert. 
""" if not self.authentication_config.ilert_host.endswith("/api"): self.authentication_config.ilert_host = ( f"{self.authentication_config.ilert_host}/api" ) headers = {"Authorization": f"{self.authentication_config.ilert_token}"} response = requests.get( f"{self.authentication_config.ilert_host}/incidents", headers=headers, ) if not response.ok: self.logger.error( "Failed to get alerts", extra={ "status_code": response.status_code, "response": response.text, }, ) raise Exception( f"Failed to get alerts: {response.status_code} {response.text}" ) alerts = response.json() self.logger.info( "Got alerts from ilert", extra={"number_of_alerts": len(alerts)} ) alert_dtos = [] for alert in alerts: severity = IlertProvider.SEVERITIES_MAP.get( alert.get("affectedServices", [{}])[0].get("impact", "OPERATIONAL") ) status = IlertProvider.STATUS_MAP.get( alert.get("status"), AlertStatus.ACKNOWLEDGED ) alert_dto = AlertDto( id=alert["id"], name=alert["summary"], title=alert["summary"], description=alert["message"], status=status, severity=severity, sendNotification=alert["sendNotification"], createdAt=alert["createdAt"], updatedAt=alert["updatedAt"], affectedServices=alert["affectedServices"], createdBy=alert["createdBy"], lastHistory=alert["lastHistory"], lastHistoryCreatedAt=alert["lastHistoryCreatedAt"], lastHistoryUpdatedAt=alert["lastHistoryUpdatedAt"], lastReceived=alert["updatedAt"], ) alert_dtos.append(alert_dto) return alert_dtos def __create_or_update_incident( self, summary, status, message, affectedServices, id ): self.logger.info( "Creating/updating ilert incident", extra={ "summary": summary, "status": status, "incident_message": message, "affectedServices": affectedServices, "id": id, }, ) headers = {"Authorization": self.authentication_config.ilert_token} # Create or update incident payload = { "id": id, "status": str(status), "message": message, } # if id is set, we update the incident, otherwise we create a new one should_update = id and id != "0" if not 
should_update: try: payload["affectedServices"] = ( json.loads(affectedServices) if isinstance(affectedServices, str) else affectedServices ) except Exception: self.logger.warning( "Failed to parse affectedServices", extra={"affectedServices": affectedServices}, ) raise if not summary: raise Exception("summary is required") payload["summary"] = summary response = requests.post( f"{self.authentication_config.ilert_host}/incidents", headers=headers, json=payload, ) else: incident = requests.get( f"{self.authentication_config.ilert_host}/incidents/{id}", headers=headers, ).json() response = requests.put( f"{self.authentication_config.ilert_host}/incidents/{id}", headers=headers, json={**incident, **payload}, ) if not response.ok: self.logger.error( "Failed to create/update ilert incident", extra={ "status_code": response.status_code, "response": response.text, }, ) raise Exception( f"Failed to create/update ilert incident: {response.status_code} {response.text}" ) self.logger.info( "ilert incident created/updated", extra={"status_code": response.status_code}, ) return response.json() def __post_ilert_event( self, event_type: Literal["ALERT", "ACCEPT", "RESOLVE"] = "ALERT", summary: str = "", details: str = "", alert_key: str = "", priority: Literal["HIGH", "LOW"] = "HIGH", images: list = [], links: list = [], custom_details: dict = {}, routing_key: str = "", ): payload = { "eventType": event_type, "summary": summary, "details": details, "alertKey": alert_key, "priority": priority, "images": images, "links": links, "customDetails": custom_details, } self.logger.info("Posting ilert event", extra=payload) response = requests.post( f"{self.authentication_config.ilert_host}/events/keep/{self.authentication_config.ilert_token} ", json=payload, ) self.logger.info( "ilert event posted", extra={"status_code": response.status_code} ) return response.json() def _notify( self, _type: Literal["incident", "event"] = "event", summary: str = "", status: IlertIncidentStatus = 
IlertIncidentStatus.INVESTIGATING, message: str = "", affectedServices: str | list = "[]", id: str = "0", event_type: Literal["ALERT", "ACCEPT", "RESOLVE"] = "ALERT", details: str = "", alert_key: str = "", priority: Literal["HIGH", "LOW"] = "HIGH", images: list = [], links: list = [], custom_details: dict = {}, **kwargs: dict, ): """ Notify ilert about an incident or event. Args: _type: Type of notification ('incident' or 'event') - determines which endpoint is used summary: A brief summary of the incident (required for new incidents) status: Current status of the incident (INVESTIGATING, RESOLVED, MONITORING, IDENTIFIED) message: Detailed message describing the incident (default: empty string) affectedServices: JSON string of affected services and their statuses (default: "[]") id: ID of incident to update (use "0" to create a new incident) event_type: Type of event to post (ALERT, ACCEPT, RESOLVE) details: Detailed information about the event alert_key: Unique key for event deduplication priority: Priority level of the event (HIGH, LOW) images: List of image URLs to include with the event links: List of related links to include with the event custom_details: Custom key-value pairs for additional context """ self.logger.info("Notifying ilert", extra=locals()) if _type == "incident": return self.__create_or_update_incident( summary, status, message, affectedServices, id ) else: return self.__post_ilert_event( event_type, summary, details, alert_key, priority, images, links, custom_details, ) if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os api_key = os.environ.get("ILERT_API_TOKEN") host = os.environ.get("ILERT_API_HOST") provider_config = { "authentication": {"ilert_token": api_key, "ilert_host": host}, } provider: IlertProvider = 
ProvidersFactory.get_provider( context_manager=context_manager, provider_id="ilert", provider_type="ilert", provider_config=provider_config, ) """ result = provider._query( "Example", message="Lorem Ipsum", status="MONITORING", affectedServices=json.dumps( [ { "impact": "OPERATIONAL", "service": {"id": 339743}, } ] ), id="242530", ) print(result) """ alerts = provider._get_alerts() print(alerts) ================================================ FILE: keep/providers/incidentio_provider/__init__.py ================================================ ================================================ FILE: keep/providers/incidentio_provider/incidentio_provider.py ================================================ """ IncidentioProvider is a class that allows to get all incidents as well query specific incidents in Incidentio. """ import dataclasses from typing import List from urllib.parse import urlencode, urljoin import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin from keep.providers.models.provider_config import ProviderConfig, ProviderScope class ResourceAlreadyExists(Exception): def __init__(self, *args): super().__init__(*args) @pydantic.dataclasses.dataclass class IncidentioProviderAuthConfig: """ Incidentio authentication configuration. 
""" incidentIoApiKey: str = dataclasses.field( metadata={ "required": True, "description": "IncidentIO's API_KEY", "hint": "API KEY for incident.io", "sensitive": True, }, ) class IncidentioProvider(BaseProvider, ProviderHealthMixin): """Receive Incidents from Incidentio.""" PROVIDER_DISPLAY_NAME = "incident.io" PROVIDER_CATEGORY = ["Incident Management"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="User is Authenticated", mandatory=True, alias="authenticated", ), ProviderScope( name="read_access", description="User has read access", mandatory=True, alias="can_read", ), ] SEVERITIES_MAP = { "Warning": AlertSeverity.WARNING, "Major": AlertSeverity.HIGH, "Info": AlertSeverity.INFO, "Critical": AlertSeverity.CRITICAL, "Minor": AlertSeverity.LOW, } STATUS_MAP = { "triage": AlertStatus.ACKNOWLEDGED, "declined": AlertStatus.SUPPRESSED, "merged": AlertStatus.RESOLVED, "canceled": AlertStatus.SUPPRESSED, "live": AlertStatus.FIRING, "learning": AlertStatus.PENDING, "closed": AlertStatus.RESOLVED, "paused": AlertStatus.SUPPRESSED, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Incidentio provider. """ self.authentication_config = IncidentioProviderAuthConfig( **self.config.authentication ) def __get_url(self, paths: List[str] = [], query_params: dict = None, **kwargs): """ Helper method to build the url for Incidentio api requests. 
Example: paths = ["issue", "createmeta"] query_params = {"projectKeys": "key1"} url = __get_url("test", paths, query_params) # url = https://incidentio.com/api/2/issue/createmeta?projectKeys=key1 """ base_url = "https://api.incident.io/v2/" path_str = "/".join(str(path) for path in paths) url = urljoin(base_url, path_str) # add query params if query_params: url = f"{url}?{urlencode(query_params)}" return url def __get_headers(self): """ Building the headers for api requests """ return { "Authorization": f"Bearer {self.authentication_config.incidentIoApiKey}", } def validate_scopes(self) -> dict[str, bool | str]: self.logger.info("Validating IncidentIO scopes...") try: print(self.__get_url(paths=["incidents"])) response = requests.get( url=self.__get_url(paths=["incidents"]), timeout=10, headers=self.__get_headers(), ) if response.ok: return {"authenticated": True, "read_access": True} else: self.logger.error(f"Failed to validate scopes: {response.status_code}") scopes = { "authenticated": "Unable to query incidents: {response.status_code}", "read_access": False, } except Exception as e: self.logger.error( "Error getting IncidentIO scopes:", extra={"exception": str(e)} ) scopes = { "authenticated": "Unable to query incidents: {e}", "read_access": False, } return scopes def _query(self, incident_id, **kwargs) -> AlertDto: """query IncidentIO Incident""" self.logger.info( "Querying IncidentIO incident", extra={ "incident_id": incident_id, **kwargs, }, ) try: response = requests.get( url=self.__get_url(paths=["incidents", incident_id]), headers=self.__get_headers(), ) except Exception as e: self.logger.error( "Error while fetching Incident", extra={ "incident_id": incident_id, "kwargs": kwargs, "exception": str(e), }, ) raise e else: if response.ok: res = response.json() return self.__map_alert_to_AlertDTO({"event": res}) else: self.logger.error( "Error while fetching Incident", extra={ "incident_id": incident_id, "kwargs": kwargs, "res": response.text, }, ) def 
_get_alerts(self) -> list[AlertDto]: alerts = [] next_page = None while True: try: params = {"page_size": 100} if next_page: params["after"] = next_page response = requests.get( self.__get_url(paths=["incidents"]), headers=self.__get_headers(), params=params, timeout=15, ) response.raise_for_status() except requests.RequestException as e: self.logger.error( "Error getting IncidentIO scopes:", extra={"exception": str(e)} ) raise e else: data = response.json() try: for incident in data.get("incidents", []): alerts.append(self.__map_alert_to_AlertDTO(incident)) except Exception as e: self.logger.error( "Error while mapping incidents to AlertDTO", extra={"exception": str(e)}, ) raise e pagination_meta = data.get("pagination_meta", {}) next_page = pagination_meta.get("after") if not next_page: break return alerts def __map_alert_to_AlertDTO(self, incident) -> AlertDto: return AlertDto( id=incident["id"], fingerprint=incident["id"], name=incident["name"], status=IncidentioProvider.STATUS_MAP[ incident["incident_status"]["category"] ], severity=IncidentioProvider.SEVERITIES_MAP.get( incident.get("severity", {}).get("name", "minor"), AlertSeverity.WARNING ), lastReceived=incident.get("created_at"), description=incident.get("summary", ""), apiKeyRef=incident["creator"]["api_key"]["id"], assignee=", ".join( assignment["role"]["name"] for assignment in incident["incident_role_assignments"] ), url=incident.get("permalink", "https://app.incident.io/"), ) if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os api_key = os.getenv("INCIDENTIO_API_KEY") config = ProviderConfig( description="Incidentio Provider", authentication={"incidentIoApiKey": api_key}, ) provider = IncidentioProvider( context_manager, provider_id="incidentio_provider", config=config, ) print(provider.validate_scopes()) print(provider._get_alerts()) 
================================================ FILE: keep/providers/incidentmanager_provider/__init__.py ================================================ ================================================ FILE: keep/providers/incidentmanager_provider/incidentmanager_provider.py ================================================ """ IncidentManagerProvider is a class that provides a way to read data from AWS Incident Manager. """ import dataclasses import logging import os from urllib.parse import urlparse from uuid import uuid4 import boto3 import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class IncidentmanagerProviderAuthConfig: region: str = dataclasses.field( metadata={ "required": True, "description": "AWS region", "senstive": False, }, ) response_plan_arn: str = dataclasses.field( default=None, metadata={ "required": True, "description": "AWS Response Plan's arn", "hint": "Default response plan arn to use when interacting with incidents, if not provided, we won't be able to register web hook for the incidents", "sensitive": False, }, ) sns_topic_arn: str = dataclasses.field( default=None, metadata={ "required": True, "description": "AWS SNS Topic arn you want to be used/using in response plan", "hint": "Default sns topic to use when creating incidents, if not provided, we won't be able to register web hook for the incidents", "sensitive": False, }, ) access_key: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS access key (Leave empty if using IAM role at EC2)", "sensitive": True, }, ) access_key_secret: str = dataclasses.field( default=None, metadata={ "required": False, "description": "AWS access key secret (Leave empty if using IAM role at 
EC2)", "sensitive": True, }, ) class IncidentmanagerProvider(BaseProvider): """Push incidents from AWS IncidentManager to Keep.""" PROVIDER_CATEGORY = ["Incident Management"] PROVIDER_SCOPES = [ ProviderScope( name="ssm-incidents:ListIncidentRecords", description="Required to retrieve incidents.", documentation_url="https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html", mandatory=True, alias="Describe Incidents", ), # this is not needed until we figure out how to override dismiss call # ProviderScope( # name="ssm-incidents:UpdateIncidentRecord", # description="Required to update incidents, when you resolve them for example.", # documentation_url="https://docs.aws.amazon.com/incident-manager/latest/userguide/what-is-incident-manager.html#features", # mandatory=False, # alias="Update Incident Records", # ), ProviderScope( name="ssm-incidents:GetResponsePlan", description="Required to get response plan and register keep as webhook", documentation_url="https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html", mandatory=False, alias="Update Response Plan", ), ProviderScope( name="ssm-incidents:UpdateResponsePlan", description="Required to update response plan and register keep as webhook", documentation_url="https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html", mandatory=False, alias="Update Response Plan", ), ProviderScope( name="iam:SimulatePrincipalPolicy", description="Allow Keep to test the scopes of the current user/role without modifying any resource.", documentation_url="https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html", mandatory=False, alias="Simulate IAM Policy", ), ProviderScope( name="sns:ListSubscriptionsByTopic", description="Required to list all subscriptions of a topic, so Keep will be able to add itself as a subscription.", 
documentation_url="https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm-incidents.html", mandatory=False, alias="List Subscriptions", ), ] PROVIDER_DISPLAY_NAME = "Incident Manager" STATUS_MAP = { "OPEN": AlertStatus.FIRING, "RESOLVED": AlertStatus.RESOLVED, } SEVERITIES_MAP = { 1: AlertSeverity.CRITICAL, 2: AlertSeverity.HIGH, 3: AlertSeverity.LOW, 4: AlertSeverity.WARNING, 5: AlertSeverity.INFO, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.aws_client_type = None self._client = None def validate_scopes(self): # init the scopes as False scopes = {scope.name: False for scope in self.PROVIDER_SCOPES} # the scope name is the action actions = scopes.keys() # fetch the results try: sts_client = self.__generate_client("sts") identity = sts_client.get_caller_identity()["Arn"] iam_client = self.__generate_client("iam") except Exception as e: self.logger.exception("Error validating AWS IAM scopes") scopes = {s: str(e) for s in scopes.keys()} return scopes # 0. 
try to validate all scopes using simulate_principal_policy # if the user/role have permissions to simulate_principal_policy, we can validate the scopes easily try: iam_resp = iam_client.simulate_principal_policy( PolicySourceArn=identity, ActionNames=list(actions) ) scopes = { res.get("EvalActionName"): res.get("EvalDecision") == "allowed" for res in iam_resp.get("EvaluationResults") } scopes["iam:SimulatePrincipalPolicy"] = True if all(scopes.values()): self.logger.info( "All AWS IAM scopes are granted!", extra={"scopes": scopes} ) return scopes # if not all the scopes are granted, we need to test them one by one else: self.logger.warning( "Some of the AWS IAM scopes are not granted, testing them one by one...", extra={"scopes": scopes}, ) # otherwise, we need to test them one by one except Exception: self.logger.info("Error validating AWS IAM scopes") scopes["iam:SimulatePrincipalPolicy"] = ( "No permissions to simulate_principal_policy (but its cool, its not a must)" ) self.logger.info("Validating aws incident manager scopes") # 1. validate list incident records ssm_incident_client = self.__generate_client("ssm-incidents") results = None try: results = ssm_incident_client.list_incident_records()[ "incidentRecordSummaries" ] scopes["ssm-incidents:ListIncidentRecords"] = True except Exception: self.logger.exception( "Error starting AWS incident manager list_incident_records query - add ssm-incidents:ListIncidentRecords permissions", ) raise if results: if len(results) <= 0: scopes["ssm-incidents:UpdateIncidentRecord"] = ( "We need atleast on incident to test the update scope. Please create an incident manually and try again." ) raise try: # here using impact , because if we use status it won't be able to be updated again incase of resolved. 
ssm_incident_client.update_incident_record( arn=results[0]["arn"], impact=1 ) # restoring impact ssm_incident_client.update_incident_record( arn=results[0]["arn"], impact=results[0]["impact"] ) scopes["ssm-incidents:UpdateIncidentRecord"] = True except Exception: scopes["ssm-incidents:UpdateIncidentRecord"] = ( "No permissions to update incidents it seems" ) raise # 2 validate if we are already getting user's sns topic and able to fetch sns from aws, not mandatory though try: sns_topic = self.authentication_config.sns_topic_arn if not sns_topic.startswith("arn:aws:sns"): account_id = self._get_account_id() sns_topic = f"arn:aws:sns:{self.authentication_config.region}:{account_id}:{self.authentication_config.sns_topic_arn}" scopes["sns:ListSubscriptionsByTopic"] = True except Exception as e: self.logger.exception( "Error validating AWS sns:ListSubscriptionsByTopic scope" ) scopes["sns:ListSubscriptionsByTopic"] = str(e) # 3 validate get response plan response_plan = None try: response_plan = ssm_incident_client.get_response_plan( arn=self.authentication_config.response_plan_arn ) scopes["ssm-incidents:GetResponsePlan"] = True except Exception: scopes["ssm-incidents:GetResponsePlan"] = ( "No permissions to get response plan" ) raise # 4 validate update response plan try: if not response_plan: raise Exception("No response plan found") ssm_incident_client.update_response_plan( arn=self.authentication_config.response_plan_arn, displayName="test" ) ssm_incident_client.update_response_plan( arn=self.authentication_config.response_plan_arn, displayName=response_plan["displayName"], ) scopes["ssm-incidents:UpdateResponsePlan"] = True except Exception: scopes["ssm-incidents:UpdateResponsePlan"] = ( "No permissions to update response plan" ) raise return scopes @property def client(self): if self._client is None: self.client = self.__generate_client(self.aws_client_type) return self._client def _get_alerts(self) -> list[AlertDto]: all_alerts = [] for alert in self._query(): 
all_alerts.append(self._format_alert(alert, self)) return all_alerts def _query(self, **kwargs: dict) -> dict: """ Query AWS Incident Manager to get all incidents """ ssm_incident_client = self.__generate_client("ssm-incidents") all_records = [] try: all_records.extend( ssm_incident_client.list_incident_records()["incidentRecordSummaries"] ) except Exception: self.logger.exception( "Error starting AWS incident manager query - add logs:StartQuery permissions", extra={"kwargs": kwargs}, ) raise return all_records def _get_account_id(self): sts_client = self.__generate_client("sts") identity = sts_client.get_caller_identity() return identity["Account"] def __generate_client(self, aws_client_type: str): client = boto3.client( aws_client_type, aws_access_key_id=self.authentication_config.access_key, aws_secret_access_key=self.authentication_config.access_key_secret, region_name=self.authentication_config.region, ) return client def dispose(self): try: self.client.close() except Exception: self.logger.exception("Error closing boto3 connection") def validate_config(self): self.authentication_config = IncidentmanagerProviderAuthConfig( **self.config.authentication ) def add_hook_to_topic(self, topic: str, keep_api_url: str, api_key: str): sns_client = self.__generate_client("sns") subscriptions = [] try: subscriptions = sns_client.list_subscriptions_by_topic(TopicArn=topic).get( "Subscriptions", [] ) except Exception: self.logger.exception( "Error fetching subscriptions for the topic", extra={"topic": topic}, ) return False hostname = urlparse(keep_api_url).hostname already_subscribed = any( hostname in sub["Endpoint"] and not sub["SubscriptionArn"] == "PendingConfirmation" for sub in subscriptions ) if already_subscribed: self.logger.info("Already subscribed to topic %s", topic) return True url_with_api_key = keep_api_url.replace( "https://", f"https://api_key:{api_key}@" ) # print(url_with_api_key) self.logger.info("Subscribing to topic %s...", topic) 
sns_client.subscribe( TopicArn=topic, Protocol="https", Endpoint=url_with_api_key, ) self.logger.info("Subscribed to topic %s!", topic) return True def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): """ Steps: 1. Query the response plan 2. Add/Update given sns topic to add keep's webhook """ if not self.authentication_config.response_plan_arn: self.logger.warning( "No default response plan name provided, skipping webhook setup" ) return ssm_incident_client = self.__generate_client("ssm-incidents") response_plan = ssm_incident_client.get_response_plan( arn=self.authentication_config.response_plan_arn ) # print(response_plan) if self.authentication_config.sns_topic_arn: sns_topic = self.authentication_config.sns_topic_arn if not self.authentication_config.sns_topic_arn.startswith("arn:aws:sns"): account_id = self._get_account_id() sns_topic = f"arn:aws:sns:{self.authentication_config.region}:{account_id}:{self.authentication_config.sns_topic_arn}" if "notificationTargets" not in response_plan["incidentTemplate"]: ssm_incident_client.update_response_plan( arn=self.authentication_config.response_plan_arn, chatChannel={ "chatbotSns": [sns_topic], }, incidentTemplateNotificationTargets=[ {"snsTopicArn": sns_topic}, ], ) response_plan = ssm_incident_client.get_response_plan( arn=self.authentication_config.response_plan_arn ) notification_targets = response_plan["incidentTemplate"][ "notificationTargets" ] for topic in notification_targets: # print(topic) if topic["snsTopicArn"] == sns_topic: result = self.add_hook_to_topic( topic=sns_topic, keep_api_url=keep_api_url, api_key=api_key, ) if result: break self.logger.info("Webhook setup completed!") @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: logger = logging.getLogger(__name__) # if its confirmation event, we need to confirm the subscription if event.get("Type") == "SubscriptionConfirmation": logger.info("Confirming 
subscription...") subscribe_url = event.get("SubscribeURL") requests.get(subscribe_url) logger.info("Subscription confirmed!") # Done return alert = event # Map the status to Keep status status = IncidentmanagerProvider.STATUS_MAP.get( alert.get("status"), AlertStatus.FIRING ) del alert["status"] severity = IncidentmanagerProvider.SEVERITIES_MAP.get(alert.get("IMPACT"), 5) return AlertDto( id=alert.get("arn", str(uuid4())), name=alert.get("title", alert.get("alertname")), status=status, severity=severity, lastReceived=str(alert.get("creationTime")), description=alert.get("summary"), url=alert.pop("url", alert.get("generatorURL")), source=["incidentmanager"], **alert, ) if __name__ == "__main__": config = ProviderConfig( authentication={ "access_key": os.environ.get("AWS_ACCESS_KEY_ID"), "access_key_secret": os.environ.get("AWS_SECRET_ACCESS_KEY"), "region": os.environ.get("AWS_REGION"), "response_plan_arn": "arn:aws:ssm-incidents::085059502819:response-plan/ResponseEmail", "sns_topic_arn": "arn:aws:sns:ap-south-1:085059502819:Keep", } ) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) provider = IncidentmanagerProvider(context_manager, "asdasd", config) results = provider.validate_scopes() print(results) # provider.setup_webhook( # tenant_id="keep", # keep_api_url="https://1064-2401-4900-1c0f-ae0f-dbba-8aae-8a51-8d29.ngrok-free.app/alerts/event/incidentmanager", # api_key="localhost", # ) # results = provider.get_alerts() # print(results) ================================================ FILE: keep/providers/jira_provider/__init__.py ================================================ ================================================ FILE: keep/providers/jira_provider/jira_provider.py ================================================ """ JiracloudProvider is a class that implements the BaseProvider interface for Jira updates. 
""" import dataclasses import json from typing import List, Optional from urllib.parse import urlencode, urljoin import pydantic import requests from requests.auth import HTTPBasicAuth from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import HttpsUrl @pydantic.dataclasses.dataclass class JiraProviderAuthConfig: """Jira Cloud authentication configuration.""" email: str = dataclasses.field( metadata={ "required": True, "description": "Atlassian Jira Email", "sensitive": False, "documentation_url": "https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/#Create-an-API-token", } ) api_token: str = dataclasses.field( metadata={ "required": True, "description": "Atlassian Jira API Token", "sensitive": True, "documentation_url": "https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/#Create-an-API-token", } ) host: HttpsUrl = dataclasses.field( metadata={ "required": True, "description": "Atlassian Jira Host", "sensitive": False, "documentation_url": "https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/#Create-an-API-token", "hint": "https://keephq.atlassian.net", "validation": "https_url", } ) ticket_creation_url: str = dataclasses.field( metadata={ "required": False, "description": "URL for creating new tickets (optional, will use default if not provided)", "sensitive": False, "hint": "https://keephq.atlassian.net/secure/CreateIssue.jspa", }, default="", ) class JiraProvider(BaseProvider): """Enrich alerts with Jira tickets.""" PROVIDER_CATEGORY = ["Ticketing"] PROVIDER_SCOPES = [ ProviderScope( name="BROWSE_PROJECTS", description="Browse Jira Projects", mandatory=True, alias="Browse projects", ), 
ProviderScope( name="CREATE_ISSUES", description="Create Jira Issues", mandatory=True, alias="Create issue", ), ProviderScope( name="CLOSE_ISSUES", description="Close Jira Issues", mandatory=False, alias="Close issues", ), ProviderScope( name="EDIT_ISSUES", description="Edit Jira Issues", mandatory=False, alias="Edit issues", ), ProviderScope( name="DELETE_ISSUES", description="Delete Jira Issues", mandatory=False, alias="Delete issues", ), ProviderScope( name="MODIFY_REPORTER", description="Modify Jira Issue Reporter", mandatory=False, alias="Modidy issue reporter", ), ProviderScope( name="TRANSITION_ISSUES", description="Transition Jira Issues", mandatory=False, alias="Transition issues", ), ] PROVIDER_TAGS = ["ticketing"] PROVIDER_DISPLAY_NAME = "Jira Cloud" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._host = None def validate_scopes(self): """ Validate that the provider has the required scopes. 
""" headers = {"Accept": "application/json"} auth = requests.auth.HTTPBasicAuth( self.authentication_config.email, self.authentication_config.api_token ) # first, validate user/api token are correct: resp = requests.get( f"{self.jira_host}/rest/api/3/myself", headers={"Accept": "application/json"}, auth=auth, verify=False, ) try: resp.raise_for_status() except Exception: scopes = { scope.name: "Failed to authenticate with Jira - wrong credentials" for scope in JiraProvider.PROVIDER_SCOPES } return scopes params = { "permissions": ",".join( [scope.name for scope in JiraProvider.PROVIDER_SCOPES] ) } resp = requests.get( f"{self.jira_host}/rest/api/3/mypermissions", headers=headers, auth=auth, params=params, verify=False, ) try: resp.raise_for_status() except Exception as e: scopes = { scope.name: f"Failed to authenticate with Jira: {e}" for scope in JiraProvider.PROVIDER_SCOPES } return scopes permissions = resp.json().get("permissions", []) scopes = { scope: scope_result.get("havePermission", False) for scope, scope_result in permissions.items() } return scopes def validate_config(self): self.authentication_config = JiraProviderAuthConfig( **self.config.authentication ) @property def jira_host(self) -> str: if self._host is not None: return self._host host = ( self.authentication_config.host if self.authentication_config.host.startswith("https://") or self.authentication_config.host.startswith("http://") else f"https://{self.authentication_config.host}" ) self._host = host return self._host def dispose(self): """ No need to dispose of anything, so just do nothing. """ pass def __get_url(self, paths: List[str] = [], query_params: dict = None, **kwargs): """ Helper method to build the url for jira api requests. 
Example: paths = ["issue", "createmeta"] query_params = {"projectKeys": "key1"} url = __get_url("test", paths, query_params) # url = https://test.atlassian.net/rest/api/2/issue/createmeta?projectKeys=key1 """ # add url path url = urljoin( f"{self.jira_host}/rest/api/2/", "/".join(str(path) for path in paths), ) # add query params if query_params: url = f"{url}?{urlencode(query_params)}" return url def __get_auth(self): """ Helper method to build the auth payload for jira api requests. """ return HTTPBasicAuth( self.authentication_config.email, self.authentication_config.api_token ) def __get_createmeta(self, project_key: str): try: self.logger.info("Fetching create meta data...") url = self.__get_url( paths=["issue", "createmeta"], query_params={"projectKeys": project_key}, ) response = requests.get(url=url, auth=self.__get_auth(), verify=False) response.raise_for_status() self.logger.info("Fetched create meta data!") return response.json() except Exception as e: raise ProviderException(f"Failed to fetch createmeta: {e}") def __get_single_createmeta(self, project_key: str): """ Helper method to get single createmeta. As the original createmeta api returns multiple issue types and other config. """ try: self.logger.info("Fetching single createmeta...") createmeta = self.__get_createmeta(project_key) projects = createmeta.get("projects", []) project = projects[0] if len(project_key) > 0 else {} issuetypes = project.get("issuetypes", []) issuetype = issuetypes[0] if len(issuetypes) > 0 else {} issue_type_name = issuetype.get("name", "") if not issue_type_name: raise ProviderException("No issue types found!") self.logger.info("Fetched single createmeta!") return {"issue_type_name": issue_type_name} except Exception as e: raise ProviderException(f"Failed to fetch single createmeta: {e}") def __get_available_transitions(self, issue_id: str): """ Get available transitions for an issue. 
Args: issue_id: The Jira issue ID or key Returns: List of available transitions with their IDs and names """ try: self.logger.info(f"Fetching available transitions for issue {issue_id}...") url = self.__get_url(paths=["issue", issue_id, "transitions"]) response = requests.get(url=url, auth=self.__get_auth(), verify=False) response.raise_for_status() transitions = response.json().get("transitions", []) self.logger.info( f"Found {len(transitions)} available transitions for issue {issue_id}" ) return transitions except Exception as e: raise ProviderException( f"Failed to fetch transitions for issue {issue_id}: {e}" ) def __transition_issue( self, issue_id: str, transition_name: Optional[str] = None, transition_id: Optional[str] = None ): """ Transition an issue to a new status. Args: issue_id: The Jira issue ID or key transition_name: Name of the transition (e.g., "Done", "Resolved", "In Progress") transition_id: Direct transition ID (if known, skips lookup) Returns: dict with transition result """ try: self.logger.info(f"Transitioning issue {issue_id}...") # If transition_id is not provided, look it up by name if not transition_id: if not transition_name: raise ProviderException( "Either transition_name or transition_id must be provided" ) transitions = self.__get_available_transitions(issue_id) # Find transition by name (case-insensitive) transition_id = None for transition in transitions: if transition["name"].lower() == transition_name.lower(): transition_id = transition["id"] self.logger.info( f"Found transition '{transition_name}' with ID {transition_id}" ) break if not transition_id: available_names = [t["name"] for t in transitions] raise ProviderException( f"Transition '{transition_name}' not found. 
" f"Available transitions: {', '.join(available_names)}" ) # Execute the transition url = self.__get_url(paths=["issue", issue_id, "transitions"]) request_body = {"transition": {"id": transition_id}} response = requests.post( url=url, json=request_body, auth=self.__get_auth(), verify=False ) if response.status_code != 204: response.raise_for_status() self.logger.info(f"Successfully transitioned issue {issue_id}!") return { "issue_id": issue_id, "transition_id": transition_id, "transition_name": transition_name, "success": True, } except Exception as e: raise ProviderException(f"Failed to transition issue {issue_id}: {e}") def __create_issue( self, project_key: str, summary: str, description: str = "", issue_type: str = "", labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, **kwargs: dict, ): """ Helper method to create an issue in jira. """ try: self.logger.info("Creating an issue...") if not issue_type: create_meta = self.__get_single_createmeta(project_key=project_key) issue_type = create_meta.get("issue_type_name", "") url = self.__get_url(paths=["issue"]) fields = { "summary": summary, "description": description, "project": {"key": project_key}, "issuetype": {"name": issue_type}, } if labels: fields["labels"] = labels if components: fields["components"] = [{"name": component} for component in components] if custom_fields: # Filter out priority field if it's set to "none" or empty filtered_fields = {} for key, value in custom_fields.items(): if key == "priority" and (not value or str(value).lower() in ["none", "", "null"]): self.logger.info(f"Skipping priority field with value '{value}' as it may not be available on the issue screen") continue filtered_fields[key] = value fields.update(filtered_fields) # Also handle priority that might come through kwargs if kwargs: filtered_kwargs = {} for key, value in kwargs.items(): if key == "priority" and (not value or str(value).lower() in ["none", "", "null"]): self.logger.info(f"Skipping 
priority field from kwargs with value '{value}' as it may not be available on the issue screen") continue filtered_kwargs[key] = value fields.update(filtered_kwargs) request_body = {"fields": fields} response = requests.post( url=url, json=request_body, auth=self.__get_auth(), verify=False ) try: response.raise_for_status() except Exception: self.logger.exception( "Failed to create an issue", extra=response.json() ) raise ProviderException(f"Failed to create an issue: {response.json()}") self.logger.info("Created an issue!") return {"issue": response.json()} except Exception as e: raise ProviderException(f"Failed to create an issue: {e}") def __update_issue( self, issue_id: str, summary: str, description: str = "", labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, **kwargs: dict, ): """ Helper method to update an issue in jira. """ try: self.logger.info("Updating an issue...") url = self.__get_url(paths=["issue", issue_id]) update = {} if summary: update["summary"] = [{"set": summary}] if description: update["description"] = [{"set": description}] if components: update["components"] = [{"set": component} for component in components] if labels: update["labels"] = [{"set": label} for label in labels] if custom_fields: # Format custom fields properly for Jira API for field_name, field_value in custom_fields.items(): update[field_name] = [{"set": field_value}] request_body = {"update": update} response = requests.put( url=url, json=request_body, auth=self.__get_auth(), verify=False ) try: if response.status_code != 204: response.raise_for_status() except Exception: self.logger.exception("Failed to update an issue", extra=response.text) raise ProviderException("Failed to update an issue") self.logger.info("Updated an issue!") return { "issue": { "id": issue_id, "key": self._extract_issue_key_from_issue_id(issue_id), "self": self.__get_url(paths=["issue", issue_id]), } } except Exception as e: raise ProviderException(f"Failed to update 
an issue: {e}") def _extract_project_key_from_board_name(self, board_name: str): boards_response = requests.get( f"{self.jira_host}/rest/agile/1.0/board", auth=self.__get_auth(), headers={"Accept": "application/json"}, verify=False, ) if boards_response.status_code == 200: boards = boards_response.json()["values"] for board in boards: if board["name"].lower() == board_name.lower(): self.logger.info( f"Found board {board_name} with project key {board['location']['projectKey']}" ) return board["location"]["projectKey"] # if we got here, we didn't find the board name so let's throw an indicative exception board_names = [board["name"] for board in boards] raise Exception( f"Could not find board {board_name} - please verify your board name is in this list: {board_names}." ) else: raise Exception("Could not fetch boards: " + boards_response.text) def _extract_issue_key_from_issue_id(self, issue_id: str): issue_key = requests.get( f"{self.jira_host}/rest/api/2/issue/{issue_id}", auth=self.__get_auth(), headers={"Accept": "application/json"}, verify=False, ) if issue_key.status_code == 200: return issue_key.json()["key"] else: raise Exception("Could not fetch issue key: " + issue_key.text) def _notify( self, summary: str, description: str = "", issue_type: str = "", project_key: str = "", board_name: str = "", issue_id: str = None, labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, transition_to: Optional[str] = None, **kwargs: dict, ): """ Notify jira by creating an issue. Args: summary (str): The summary of the issue. description (str): The description of the issue. issue_type (str): The type of the issue. project_key (str): The project key of the issue. board_name (str): The board name of the issue. issue_id (str): The issue id of the issue. labels (List[str]): The labels of the issue. components (List[str]): The components of the issue. custom_fields (dict): The custom fields of the issue. 
transition_to (str): Optional transition name (e.g., "Done", "Resolved") to apply after update/create. """ issue_type = ( issue_type if issue_type else ( kwargs.get("issuetype", "Task") if isinstance(kwargs, dict) else "Task" ) ) if labels and isinstance(labels, str): labels = json.loads(labels.replace("'", '"')) try: self.logger.info("Notifying jira...") if issue_id: result = self.__update_issue( issue_id=issue_id, summary=summary, description=description, labels=labels, components=components, custom_fields=custom_fields, **kwargs, ) issue_key = self._extract_issue_key_from_issue_id(issue_id) result["ticket_url"] = f"{self.jira_host}/browse/{issue_key}" # Apply transition if requested if transition_to: self.logger.info(f"Applying transition '{transition_to}' to issue {issue_id}") transition_result = self.__transition_issue( issue_id=issue_id, transition_name=transition_to ) result["transition"] = transition_result self.logger.info("Updated a jira issue: " + str(result)) return result if not project_key: project_key = self._extract_project_key_from_board_name(board_name) if not project_key or not summary or not issue_type or not description: raise ProviderException( f"Project key and summary are required! 
- {project_key}, {summary}, {issue_type}, {description}" ) result = self.__create_issue( project_key=project_key, summary=summary, description=description, issue_type=issue_type, labels=labels, components=components, custom_fields=custom_fields, **kwargs, ) result["ticket_url"] = f"{self.jira_host}/browse/{result['issue']['key']}" # Apply transition if requested (on newly created issue) if transition_to: created_issue_id = result["issue"]["key"] self.logger.info(f"Applying transition '{transition_to}' to newly created issue {created_issue_id}") transition_result = self.__transition_issue( issue_id=created_issue_id, transition_name=transition_to ) result["transition"] = transition_result self.logger.info("Notified jira!") return result except Exception as e: context = { "summary": summary, "description": description, "issue_type": issue_type, "project_key": project_key, } raise ProviderException(f"Failed to notify jira: {e} - Params: {context}") def _query(self, ticket_id="", board_id="", **kwargs: dict): """ API for fetching issues: https://developer.atlassian.com/cloud/jira/software/rest/api-group-board/#api-rest-agile-1-0-board-boardid-issue-get Args: ticket_id (str): The ticket id of the issue, optional. board_id (str): The board id of the issue. 
""" if not ticket_id: request_url = f"{self.jira_host}/rest/agile/1.0/board/{board_id}/issue" response = requests.get(request_url, auth=self.__get_auth(), verify=False) if not response.ok: raise ProviderException( f"{self.__class__.__name__} failed to fetch data from Jira: {response.text}" ) issues = response.json() return {"number_of_issues": issues["total"]} else: request_url = self.__get_url(paths=["issue", ticket_id]) response = requests.get(request_url, auth=self.__get_auth(), verify=False) if not response.ok: raise ProviderException( f"{self.__class__.__name__} failed to fetch data from Jira: {response.text}" ) issue = response.json() return {"issue": issue} if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os jira_api_token = os.environ.get("JIRA_API_TOKEN") jira_email = os.environ.get("JIRA_EMAIL") jira_host = os.environ.get("JIRA_HOST") # Initalize the provider and provider config config = ProviderConfig( description="Jira Input Provider", authentication={ "api_token": jira_api_token, "email": jira_email, "host": jira_host, }, ) provider = JiraProvider(context_manager, provider_id="jira", config=config) scopes = provider.validate_scopes() # Example 1: Create ticket result = provider.notify( board_name="ALERTS", issue_type="Task", summary="Test", description="Test", ) # Example 2: Update ticket and transition to Done provider.notify( issue_id=result["issue"]["key"], summary="Test Alert - Updated", description="Alert has been resolved", transition_to="Done" ) ================================================ FILE: keep/providers/jiraonprem_provider/README.md ================================================ *Instructions for Jira On Prem* 1. Start Jira On Prem with docker - https://hub.docker.com/r/atlassian/jira-software/ 2. 
Create Personal Access Token (PAT) - https://confluence.atlassian.com/enterprise/using-personal-access-tokens-1026032365.html 3. Create some project/board 4. Profit :) ================================================ FILE: keep/providers/jiraonprem_provider/__init__.py ================================================ ================================================ FILE: keep/providers/jiraonprem_provider/jiraonprem_provider.py ================================================ """ JiraonpremProvider is a class that implements the BaseProvider interface for Jira updates. """ import dataclasses import json from typing import List from urllib.parse import urlencode, urljoin import pydantic import requests from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import HttpsUrl @pydantic.dataclasses.dataclass class JiraonpremProviderAuthConfig: """Jira On Prem authentication configuration.""" host: HttpsUrl = dataclasses.field( metadata={ "required": True, "description": "Jira Host", "sensitive": False, "hint": "jira.onprem.com", "validation": "any_http_url", } ) personal_access_token: str = dataclasses.field( metadata={ "required": True, "description": "Jira PAT", "sensitive": True, "documentation_url": "https://confluence.atlassian.com/enterprise/using-personal-access-tokens-1026032365.html", } ) ticket_creation_url: str = dataclasses.field( metadata={ "required": False, "description": "URL for creating new tickets", "sensitive": False, "hint": "https://jira.onprem.com/secure/CreateIssue.jspa", }, default="", ) class JiraonpremProvider(BaseProvider): """Enrich alerts with Jira tickets.""" PROVIDER_CATEGORY = ["Ticketing"] PROVIDER_SCOPES = [ ProviderScope( name="BROWSE_PROJECTS", description="Browse Jira Projects", mandatory=True, 
def validate_scopes(self):
    """
    Validate that the provider has the required scopes.

    Two requests are made with the configured personal access token: one to
    `/rest/api/2/myself` to check the credentials, and one to
    `/rest/api/2/mypermissions` to read the permissions.

    Returns:
        dict: scope name -> True/False, or scope name -> error message when
        one of the two requests is rejected. Every declared scope is always
        present in the result; a scope Jira does not report is False.
    """
    declared_scopes = [scope.name for scope in JiraonpremProvider.PROVIDER_SCOPES]
    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {self.authentication_config.personal_access_token}",
    }

    # first, validate user/api token are correct:
    # Note: Jira On Prem does not support api/3
    resp = requests.get(
        f"{self.jira_host}/rest/api/2/myself",
        headers=headers,
        verify=False,
        timeout=10,
    )
    try:
        resp.raise_for_status()
    except Exception:
        return {
            name: "Failed to authenticate with Jira - wrong credentials"
            for name in declared_scopes
        }

    resp = requests.get(
        f"{self.jira_host}/rest/api/2/mypermissions",
        headers=headers,
        params={"permissions": ",".join(declared_scopes)},
        verify=False,
        timeout=10,
    )
    try:
        resp.raise_for_status()
    except Exception as e:
        return {
            name: f"Failed to authenticate with Jira: {e}" for name in declared_scopes
        }

    # Default to an empty mapping: the previous `[]` default crashed on
    # `.items()` whenever the response carried no "permissions" key.
    permissions = resp.json().get("permissions") or {}

    # Start from "not granted" for every declared scope, then overlay whatever
    # Jira reported (including permissions that were not asked for).
    scopes = {name: False for name in declared_scopes}
    for scope, scope_result in permissions.items():
        scopes[scope] = scope_result.get("havePermission", False)
    return scopes
""" return { "Authorization": f"Bearer {self.authentication_config.personal_access_token}" } def __get_createmeta(self, project_key: str): try: self.logger.info("Fetching create meta data...") url = self.__get_url( paths=["issue", "createmeta"], query_params={"projectKeys": project_key}, ) headers = self.__get_auth_header() response = requests.get(url=url, headers=headers, verify=False, timeout=10) response.raise_for_status() self.logger.info("Fetched create meta data!") return response.json() except Exception as e: raise ProviderException(f"Failed to fetch createmeta: {e}") def __get_single_createmeta(self, project_key: str): """ Helper method to get single createmeta. As the original createmeta api returns multiple issue types and other config. """ try: self.logger.info("Fetching single createmeta...") createmeta = self.__get_createmeta(project_key) projects = createmeta.get("projects", []) project = projects[0] if len(project_key) > 0 else {} issuetypes = project.get("issuetypes", []) issuetype = issuetypes[0] if len(issuetypes) > 0 else {} issue_type_name = issuetype.get("name", "") if not issue_type_name: raise ProviderException("No issue types found!") self.logger.info("Fetched single createmeta!") return {"issue_type_name": issue_type_name} except Exception as e: raise ProviderException(f"Failed to fetch single createmeta: {e}") def __create_issue( self, project_key: str, summary: str, description: str = "", issue_type: str = "", labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, priority: str = "Medium", **kwargs: dict, ): """ Helper method to create an issue in jira. 
""" try: self.logger.info("Creating an issue...") if not issue_type: create_meta = self.__get_single_createmeta(project_key=project_key) issue_type = create_meta.get("issue_type_name", "") url = self.__get_url(paths=["issue"]) fields = { "summary": summary, "description": description, "project": {"key": project_key}, "issuetype": {"name": issue_type}, "priority": {"name": priority}, } if labels: fields["labels"] = labels if components: fields["components"] = [{"name": component} for component in components] if custom_fields: fields.update(custom_fields) request_body = {"fields": fields} response = requests.post( url=url, json=request_body, headers=self.__get_auth_header(), verify=False, timeout=10, ) try: response.raise_for_status() except Exception: self.logger.exception( "Failed to create an issue", extra=response.json() ) raise ProviderException(f"Failed to create an issue: {response.json()}") self.logger.info("Created an issue!") return {"issue": response.json()} except Exception as e: raise ProviderException(f"Failed to create an issue: {e}") def __update_issue( self, issue_id: str, summary: str = "", description: str = "", priority: str = "Medium", labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, **kwargs: dict, ): """ Helper method to update an issue in jira. 
""" try: self.logger.info("Updating an issue...") url = self.__get_url(paths=["issue", issue_id]) update = {} if summary: update["summary"] = [{"set": summary}] if description: update["description"] = [{"set": description}] if priority: update["priority"] = [{"set": {"name": priority}}] if components: update["components"] = [ {"set": [{"name": component} for component in components]} ] if labels: update["labels"] = [{"set": label} for label in labels] if custom_fields: # Format custom fields properly for Jira API for field_name, field_value in custom_fields.items(): update[field_name] = [{"set": field_value}] request_body = {"update": update} response = requests.put( url=url, json=request_body, headers=self.__get_auth_header(), verify=False, timeout=10, ) try: if response.status_code != 204: response.raise_for_status() except Exception: self.logger.exception("Failed to update an issue", extra=response.text) raise ProviderException("Failed to update an issue") result = { "issue": { "id": issue_id, "key": self._extract_issue_key_from_issue_id(issue_id), "self": self.__get_url(paths=["issue", issue_id]), } } self.logger.info("Updated an issue!") return result except Exception as e: raise ProviderException(f"Failed to update an issue: {e}") def _extract_project_key_from_board_name(self, board_name: str): headers = { "Accept": "application/json", } headers.update(self.__get_auth_header()) boards_response = requests.get( f"{self.jira_host}/rest/agile/1.0/board", headers=headers, verify=False, timeout=10, ) if boards_response.status_code == 200: boards = boards_response.json()["values"] for board in boards: if board["name"].lower() == board_name.lower(): # Jira On Prem does not have the "location" in its response so we need to figure it out board_id = board["id"] # get the filter board_configuration = requests.get( f"{self.jira_host}/rest/agile/1.0/board/{board_id}/configuration", headers=headers, verify=False, timeout=10, ) if board_configuration.status_code != 200: 
raise Exception( f"Could not fetch board configuration for board {board_name}" ) # get the filter id filter_id = board_configuration.json()["filter"]["id"] # get the filter filter_response = requests.get( f"{self.jira_host}/rest/api/2/filter/{filter_id}", headers=headers, verify=False, timeout=10, ) if filter_response.status_code != 200: raise Exception( f"Could not fetch filter for board {board_name}" ) # get the project key # todo: should be more robust way but that's enough for now. note that the user can use projectKey directly project_key = ( filter_response.json()["jql"] .split("project = ")[1] .split(" ")[0] ) self.logger.info( f"Found board {board_name} with project key {project_key}" ) return project_key # if we got here, we didn't find the board name so let's throw an indicative exception board_names = [board["name"] for board in boards] raise Exception( f"Could not find board {board_name} - please verify your board name is in this list: {board_names}." ) else: raise Exception("Could not fetch boards: " + boards_response.text) def _extract_issue_key_from_issue_id(self, issue_id: str): headers = { "Accept": "application/json", } headers.update(self.__get_auth_header()) issue_key = requests.get( f"{self.jira_host}/rest/api/2/issue/{issue_id}", headers=headers, verify=False, timeout=10, ) if issue_key.status_code == 200: return issue_key.json()["key"] else: raise Exception("Could not fetch issue key: " + issue_key.text) def _notify( self, summary: str, description: str = "", issue_type: str = "", project_key: str = "", board_name: str = "", issue_id: str = None, labels: List[str] = None, components: List[str] = None, custom_fields: dict = None, priority: str = "Medium", **kwargs: dict, ): """ Notify jira by creating an issue. 
""" # if the user didn't provider a project_key, try to extract it from the board name issue_type = ( issue_type if issue_type else ( kwargs.get("issuetype", "Task") if isinstance(kwargs, dict) else "Task" ) ) if labels and isinstance(labels, str): labels = json.loads(labels.replace("'", '"')) try: self.logger.info("Notifying jira...") if issue_id: result = self.__update_issue( issue_id=issue_id, summary=summary, description=description, labels=labels, components=components, custom_fields=custom_fields, priority=priority, **kwargs, ) issue_key = self._extract_issue_key_from_issue_id(issue_id) result["ticket_url"] = f"{self.jira_host}/browse/{issue_key}" self.logger.info("Updated a jira issue: " + str(result)) return result if not project_key: project_key = self._extract_project_key_from_board_name(board_name) if not project_key or not summary or not issue_type or not description: raise ProviderException( f"Project key and summary are required! - {project_key}, {summary}, {issue_type}, {description}" ) result = self.__create_issue( project_key=project_key, summary=summary, description=description, issue_type=issue_type, labels=labels, components=components, custom_fields=custom_fields, priority=priority, **kwargs, ) result["ticket_url"] = f"{self.jira_host}/browse/{result['issue']['key']}" self.logger.info("Notified jira!") return result except Exception as e: context = { "summary": summary, "description": description, "issue_type": issue_type, "project_key": project_key, } raise ProviderException(f"Failed to notify jira: {e} - Params: {context}") def _query(self, ticket_id="", board_id="", **kwargs: dict): """ API for fetching issues: https://developer.atlassian.com/cloud/jira/software/rest/api-group-board/#api-rest-agile-1-0-board-boardid-issue-get Args: ticket_id (str): The ticket id. board_id (str): The board id. 
""" if not ticket_id: request_url = ( f"https://{self.jira_host}/rest/agile/1.0/board/{board_id}/issue" ) response = requests.get( request_url, headers=self.__get_auth_header(), verify=False, timeout=10 ) if not response.ok: raise ProviderException( f"{self.__class__.__name__} failed to fetch data from Jira: {response.text}" ) issues = response.json() return {"number_of_issues": issues["total"]} else: request_url = self.__get_url(paths=["issue", ticket_id]) response = requests.get( request_url, headers=self.__get_auth_header(), verify=False, timeout=10 ) if not response.ok: raise ProviderException( f"{self.__class__.__name__} failed to fetch data from Jira: {response.text}" ) issue = response.json() return {"issue": issue} if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os jira_pat = os.environ.get("JIRA_PAT") jira_host = os.environ.get("JIRA_HOST") # Initalize the provider and provider config config = ProviderConfig( description="Jira On Prem Provider", authentication={ "personal_access_token": jira_pat, "host": jira_host, }, ) provider = JiraonpremProvider(context_manager, provider_id="jira", config=config) scopes = provider.validate_scopes() # Create ticket provider.notify( board_name="KEEP board", issue_type="Task", summary="Test Alert", description="Test Alert Description", ) ================================================ FILE: keep/providers/kafka_provider/README.md ================================================ # Run the docker-compose ```docker docker-compose up -d ``` # Create the topic ```bash docker-compose exec kafka /opt/kafka/bin/kafka-topics.sh --create --topic alert --partitions 1 --replication-factor 1 --zookeeper zookeeper:2181 ``` # Publish event ## With SASL ```bash echo '{"id": "1234","name": "Kafka Alert","status": "firing", 
"lastReceived": "2023-10-23T09:56:44.950Z","environment": "production","isDuplicate": false, "duplicateReason": null, "service": "backend","message": "Alert from Kafka", "description": "Alert kafka description", "severity": "critical", "pushed": true, "event_id": "1234", "url": "https://www.google.com/search?q=open+source+alert+management"}' | kafkacat -v -b kafka:9092 -t alert -P -X security.protocol=SASL_PLAINTEXT -X sasl.mechanisms=PLAIN -X sasl.username=admin -X sasl.password=admin-secret ``` ## Without SASL ```bash echo '{"id": "1234","name": "Kafka Alert","status": "firing", "lastReceived": "2023-10-23T09:56:44.950Z","environment": "production","isDuplicate": false, "duplicateReason": null, "service": "backend","message": "Alert from Kafka", "description": "Alert kafka description", "severity": "critical", "pushed": true, "event_id": "1234", "url": "https://www.google.com/search?q=open+source+alert+management"}' | kafkacat -v -b kafka:9092 -t alert -P ``` # Consume event ```bash kafkacat -v -b kafka:9092 -t alert -C -X security.protocol=SASL_PLAINTEXT -X sasl.mechanisms=PLAIN -X sasl.username=admin -X sasl.password=admin-secret ``` ================================================ FILE: keep/providers/kafka_provider/__init__.py ================================================ ================================================ FILE: keep/providers/kafka_provider/docker-compose-no-auth.yml ================================================ version: '2' services: zookeeper: image: wurstmeister/zookeeper ports: - "2181:2181" kafka: image: wurstmeister/kafka ports: - "9092:9093" environment: KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL KAFKA_LISTENERS: INTERNAL://:9092,EXTERNAL://:9093 KAFKA_ADVERTISED_LISTENERS: INTERNAL://kafka:9092,EXTERNAL://localhost:9093 KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT volumes: - /var/run/docker.sock:/var/run/docker.sock links: - zookeeper kafkacat: image: 
edenhill/kafkacat:1.6.0 depends_on: - kafka entrypoint: /bin/sh -c "apk add --no-cache curl && tail -f /dev/null" ================================================ FILE: keep/providers/kafka_provider/docker-compose.yml ================================================ version: '2' services: zookeeper: image: wurstmeister/zookeeper ports: - "2181:2181" kafka: image: wurstmeister/kafka ports: - "9092:9093" environment: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:SASL_PLAINTEXT,EXTERNAL:SASL_PLAINTEXT KAFKA_LISTENERS: INTERNAL://:9092,EXTERNAL://:9093 KAFKA_ADVERTISED_LISTENERS: INTERNAL://kafka:9092,EXTERNAL://localhost:9092 KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 KAFKA_OPTS: "-Djava.security.auth.login.config=/etc/kafka/kafka_server_jaas.conf" KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL KAFKA_SASL_ENABLED_MECHANISMS: PLAIN KAFKA_SASL_MECHANISM_INTER_BROKER_PROTOCOL: PLAIN volumes: - /var/run/docker.sock:/var/run/docker.sock - ./kafka_server_jaas.conf:/etc/kafka/kafka_server_jaas.conf links: - zookeeper kafkacat: image: edenhill/kafkacat:1.6.0 depends_on: - kafka entrypoint: /bin/sh -c "apk add --no-cache curl && tail -f /dev/null" ================================================ FILE: keep/providers/kafka_provider/kafka_provider.py ================================================ """ Kafka Provider is a class that allows to ingest/digest data from Grafana. 
""" import dataclasses import inspect import logging import pydantic # from confluent_kafka import Consumer, KafkaError, KafkaException from kafka import KafkaConsumer from kafka.errors import KafkaError, NoBrokersAvailable from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory from keep.validation.fields import NoSchemeMultiHostUrl @pydantic.dataclasses.dataclass class KafkaProviderAuthConfig: """ Kafka authentication configuration. """ host: NoSchemeMultiHostUrl = dataclasses.field( metadata={ "required": True, "description": "Kafka host", "hint": "e.g. localhost:9092 or localhost:9092,localhost:8093", "validation": "no_scheme_multihost_url" }, ) topic: str = dataclasses.field( metadata={ "required": True, "description": "The topic to subscribe to", "hint": "e.g. alerts-topic", }, ) username: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Username", "hint": "Kafka username (Optional for SASL authentication)", "sensitive": True, }, ) password: str = dataclasses.field( default=None, metadata={ "required": False, "description": "Password", "hint": "Kafka password (Optional for SASL authentication)", "sensitive": True, }, ) class ClientIdInjector(logging.Filter): def filter(self, record): # For this example, let's pretend we can obtain the client_id # by inspecting the caller or some context. Replace the next line # with the actual logic to get the client_id. client_id, provider_id = self.get_client_id_from_caller() if not hasattr(record, "extra"): record.extra = { "client_id": client_id, "provider_id": provider_id, } return True def get_client_id_from_caller(self): # Here, you should implement the logic to extract client_id based on the caller. # This can be tricky and might require you to traverse the call stack. 
# Return a default or None if you can't find it. frame = inspect.currentframe() client_id = None while frame: # Use dict() to convert frame.f_locals into a plain dict. # In Python 3.13+, frame.f_locals returns a FrameLocalsProxy # which cannot be copied via copy.copy() (pickle fails). local_vars = dict(frame.f_locals) for var_name, var_value in local_vars.items(): if isinstance(var_value, KafkaProvider): client_id = var_value.context_manager.tenant_id provider_id = var_value.provider_id break if client_id: return client_id, provider_id frame = frame.f_back return None, None class KafkaProvider(BaseProvider): """ Kafka provider class. """ PROVIDER_CATEGORY = ["Developer Tools", "Queues"] PROVIDER_DISPLAY_NAME = "Kafka" PROVIDER_SCOPES = [ ProviderScope( name="topic_read", description="The kafka user that have permissions to read the topic.", mandatory=True, alias="Topic Read", ) ] PROVIDER_TAGS = ["queue"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.consume = False self.consumer = None self.err = "" # patch all Kafka loggers to contain the tenant_id for logger_name in logging.Logger.manager.loggerDict: if logger_name.startswith("kafka"): logger = logging.getLogger(logger_name) if not any(isinstance(f, ClientIdInjector) for f in logger.filters): self.logger.info(f"Patching kafka logger {logger_name}") logger.addFilter(ClientIdInjector()) def validate_scopes(self): scopes = {"topic_read": False} self.logger.info("Validating kafka scopes") conf = self._get_conf() try: self.logger.info("Trying to connect to Kafka with SASL_SSL") consumer = KafkaConsumer(self.authentication_config.topic, **conf) except NoBrokersAvailable: # retry with SASL_PLAINTEXT try: conf["security_protocol"] = "SASL_PLAINTEXT" self.logger.info("Trying to connect to Kafka with SASL_PLAINTEXT") consumer = KafkaConsumer(self.authentication_config.topic, **conf) except NoBrokersAvailable: 
self.err = f"Auth/Network problem: could not connect to Kafka at {self.authentication_config.host}" self.logger.warning(self.err) scopes["topic_read"] = self.err return scopes except KafkaError as e: self.err = str(e) self.logger.warning(f"Error connecting to Kafka: {e}") scopes["topic_read"] = self.err or "Could not connect to Kafka " return scopes topics = consumer.topics() if self.authentication_config.topic in topics: self.logger.info(f"Topic {self.authentication_config.topic} exists") scopes["topic_read"] = True return scopes else: self.err = f"The user have permission to Kafka, but topic '{self.authentication_config.topic}' does not exist or the user does not have permissions to read it - available topics: {consumer.topics()}" self.logger.warning(self.err) scopes["topic_read"] = self.err return scopes def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Kafka provider. """ self.authentication_config = KafkaProviderAuthConfig( **self.config.authentication ) def _get_conf(self): basic_conf = { "bootstrap_servers": self.authentication_config.host, "group_id": "keephq-group", "auto_offset_reset": "earliest", "enable_auto_commit": True, # this is typically needed "reconnect_backoff_max_ms": 30000, # 30 seconds "client_id": self.context_manager.tenant_id, # add tenant id to the logs } if self.authentication_config.username and self.authentication_config.password: basic_conf.update( { "security_protocol": ( "SASL_SSL" if self.authentication_config.username else "PLAINTEXT" ), "sasl_mechanism": "PLAIN", "sasl_plain_username": self.authentication_config.username, "sasl_plain_password": self.authentication_config.password, } ) return basic_conf def status(self): """ Get the status of the provider. Returns: dict: The status of the provider. 
""" if not self.consumer: status = "not-initialized" else: try: status = { str(conn_id): conn.state for conn_id, conn in self.consumer._client._conns.items() } except Exception as e: status = str(e) return { "status": status, "error": self.err, } def start_consume(self): self.consume = True conf = self._get_conf() try: self.consumer = KafkaConsumer(self.authentication_config.topic, **conf) except NoBrokersAvailable: # retry with SASL_PLAINTEXT try: conf["security_protocol"] = "SASL_PLAINTEXT" self.consumer = KafkaConsumer(self.authentication_config.topic, **conf) except NoBrokersAvailable: self.logger.exception( f"Could not connect to Kafka at {self.authentication_config.host}" ) return while self.consume: try: topics = self.consumer.poll(timeout_ms=1000) if not topics: continue for tp, records in topics.items(): for record in records: self.logger.info( f"Received message {record.value} from topic {tp.topic} partition {tp.partition}" ) try: self._push_alert(record.value) except Exception: self.logger.warning("Error pushing alert to API") pass except Exception: self.logger.exception("Error consuming message from Kafka") break # finally, dispose if self.consumer: try: self.consumer.close() except Exception: self.logger.exception("Error closing Kafka connection") self.consumer = None self.logger.info("Consuming stopped") def stop_consume(self): self.consume = False if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()]) # Load environment variables import os os.environ["KEEP_API_URL"] = "http://localhost:8080" # Before the provider can be run, we need to docker-compose up the kafka container # check the docker-compose in this folder # Now start the container host = "localhost:9092" topic = "alert" username = "admin" password = "admin-secret" from keep.api.core.dependencies import SINGLE_TENANT_UUID context_manager = ContextManager(tenant_id=SINGLE_TENANT_UUID) config = { 
"authentication": { "host": host, "topic": topic, "username": username, "password": password, } } provider = ProvidersFactory.get_provider( context_manager, provider_id="kafka-keephq", provider_type="kafka", provider_config=config, ) provider.start_consume() ================================================ FILE: keep/providers/kafka_provider/kafka_server_jaas.conf ================================================ KafkaServer { org.apache.kafka.common.security.plain.PlainLoginModule required username="admin" password="admin-secret" user_admin="admin-secret" user_alice="alice-secret"; }; KafkaClient { org.apache.kafka.common.security.plain.PlainLoginModule required username="admin" password="admin-secret"; }; Client { }; ================================================ FILE: keep/providers/keep_provider/__init__.py ================================================ ================================================ FILE: keep/providers/keep_provider/keep_provider.py ================================================ """ Keep Provider is a class that allows to ingest/digest data from Keep. """ import copy import logging from datetime import datetime, timedelta, timezone from html import unescape import yaml from keep.api.core.db import get_alerts_with_filters from keep.api.models.alert import AlertDto, AlertStatus from keep.api.tasks.process_event_task import process_event from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.iohandler.iohandler import IOHandler from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig from keep.searchengine.searchengine import SearchEngine from keep.workflowmanager.workflowstore import WorkflowStore class KeepProvider(BaseProvider): """ Automation on your alerts with Keep. 
""" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): self.io_handler = IOHandler(context_manager) super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def _calculate_time_delta(self, timerange=None, default_time_range=1): """Calculate time delta in days from timerange dict.""" if not timerange or "from" not in timerange: return default_time_range # default value from_time_str = timerange["from"] to_time_str = timerange.get("to", "now") # Parse from_time and ensure it's timezone-aware from_time = datetime.fromisoformat(from_time_str.replace("Z", "+00:00")) if from_time.tzinfo is None: from_time = from_time.replace(tzinfo=timezone.utc) # Handle 'to' time if to_time_str == "now": to_time = datetime.now(timezone.utc) else: to_time = datetime.fromisoformat(to_time_str.replace("Z", "+00:00")) if to_time.tzinfo is None: to_time = to_time.replace(tzinfo=timezone.utc) # Calculate difference in days delta = (to_time - from_time).total_seconds() / (24 * 3600) # convert to days return delta def _query( self, filters=None, version=1, distinct=True, time_delta=1, timerange=None, filter=None, limit: int | None = None, **kwargs, ): """ Query Keep for alerts. 
def _build_alert(self, alert_data, fingerprint_fields=None, **kwargs):
    """
    Build an ``AlertDto`` from rendered workflow parameters.

    Args:
        alert_data: the raw result the alert is created from. When it is a
            dict, every key that is not already an attribute of the built
            alert is copied onto it.
        fingerprint_fields: optional list of field names used to compute the
            fingerprint. The list is never modified. When empty, the alert's
            labels (as ``labels.<name>``) are used instead.
        **kwargs: rendered alert attributes. ``name`` is required; the other
            recognised keys are read with ``kwargs.get``.

    Returns:
        The built alert. If ``fingerprint`` was given explicitly (in
        ``kwargs`` or in a dict ``alert_data``) the alert is returned as
        built; otherwise its fingerprint is computed with
        ``get_alert_fingerprint`` from the fingerprint fields plus
        ``workflowId``.

    Raises:
        KeyError: if ``name`` is missing from ``kwargs``.
    """
    labels = copy.copy(kwargs.get("labels", {}))
    alert = AlertDto(
        name=kwargs["name"],
        status=kwargs.get("status"),
        lastReceived=kwargs.get("lastReceived"),
        environment=kwargs.get("environment", "undefined"),
        duplicateReason=kwargs.get("duplicateReason"),
        service=kwargs.get("service"),
        message=kwargs.get("message"),
        description=kwargs.get("description"),
        severity=kwargs.get("severity"),
        pushed=True,
        url=kwargs.get("url"),
        labels=labels,
        ticket_url=kwargs.get("ticket_url"),
        fingerprint=kwargs.get("fingerprint"),
        annotations=kwargs.get("annotations"),
        workflowId=self.context_manager.workflow_id,
    )
    alert_data_is_dict = isinstance(alert_data, dict)
    # to avoid multiple keyword arguments, copy a key/value from the alert
    # data only if the alert doesn't already have that attribute
    if alert_data_is_dict:
        for key, val in alert_data.items():
            if not hasattr(alert, key):
                setattr(alert, key, val)

    # if fingerprint was explicitly mentioned in the workflow, keep it.
    # Only a dict can "mention" a key: a membership test on a string would
    # be a substring match and on other types it may raise TypeError.
    if "fingerprint" in kwargs or (
        alert_data_is_dict and "fingerprint" in alert_data
    ):
        return alert

    if fingerprint_fields:
        # Work on a copy: the caller reuses the same list for every alert
        # and must not see "workflowId" accumulate in it.
        fields = list(fingerprint_fields)
    else:
        # else, if fingerprint_fields are not provided, use labels
        fields = ["labels." + label for label in (labels or {})]

    # workflowId is used as the "rule id" - it's used to identify the rule
    # that created the alert
    fields.append("workflowId")
    alert.fingerprint = self.get_alert_fingerprint(alert, fields)
    return alert
Args: _for: timedelta indicating how long alert should be PENDING before FIRING state_alerts: list of new alerts from current evaluation keep_firing_for: (future use) how long to keep alerts FIRING after stopping matching (default 15m) Returns: list of alerts that need state updates """ self.logger.info( "Starting state alert handling", extra={"num_alerts": len(state_alerts)} ) alerts_to_notify = [] search_engine = SearchEngine(tenant_id=self.context_manager.tenant_id) curr_alerts = search_engine.search_alerts_by_cel( cel_query=f"providerId == '{self.context_manager.workflow_id}'" ) self.logger.debug( "Found existing alerts", extra={"num_curr_alerts": len(curr_alerts)} ) # Create lookup by fingerprint for efficient comparison curr_alerts_map = {alert.fingerprint: alert for alert in curr_alerts} state_alerts_map = {alert.fingerprint: alert for alert in state_alerts} self.logger.debug( "Created alert maps", extra={ "curr_alerts_count": len(curr_alerts_map), "state_alerts_count": len(state_alerts_map), }, ) # Handle existing alerts for fingerprint, curr_alert in curr_alerts_map.items(): now = datetime.now(timezone.utc) alert_still_exists = fingerprint in state_alerts_map self.logger.debug( "Processing existing alert", extra={ "fingerprint": fingerprint, "still_exists": alert_still_exists, "current_status": curr_alert.status, }, ) if curr_alert.status == AlertStatus.FIRING.value: if not alert_still_exists: # TODO: keep_firing_for logic # Alert no longer exists, transition to RESOLVED curr_alert.status = AlertStatus.RESOLVED curr_alert.lastReceived = datetime.now(timezone.utc).isoformat() alerts_to_notify.append(curr_alert) self.logger.info( "Alert resolved", extra={ "fingerprint": fingerprint, "last_received": curr_alert.lastReceived, }, ) # else: alert still exists, maintain FIRING state else: curr_alert.status = AlertStatus.FIRING alerts_to_notify.append(curr_alert) self.logger.debug( "Alert still firing", extra={"fingerprint": fingerprint} ) elif curr_alert.status 
== AlertStatus.PENDING.value: if not alert_still_exists: # If PENDING alerts are not triggered, make them RESOLVED # TODO: maybe INACTIVE? but we don't have this status yet curr_alert.status = AlertStatus.RESOLVED curr_alert.lastReceived = datetime.now(timezone.utc).isoformat() alerts_to_notify.append(curr_alert) self.logger.info( "Pending alert resolved", extra={ "fingerprint": fingerprint, "last_received": curr_alert.lastReceived, }, ) else: # Check if should transition to FIRING if not hasattr(curr_alert, "activeAt"): # This shouldn't happen but handle it gracefully curr_alert.activeAt = curr_alert.lastReceived self.logger.debug( "Alert missing activeAt, using lastReceived", extra={ "fingerprint": fingerprint, "activeAt": curr_alert.lastReceived, }, ) if isinstance(curr_alert.activeAt, str): activeAt = datetime.fromisoformat(curr_alert.activeAt) else: activeAt = curr_alert.activeAt # Convert duration string to timedelta # Parse duration string like "1m", "5m", etc try: value = int(_for[:-1]) unit = _for[-1] except ValueError: raise ValueError(f"Invalid duration format: {_for}") if unit == "m": duration = timedelta(minutes=value) elif unit == "h": duration = timedelta(hours=value) elif unit == "s": duration = timedelta(seconds=value) else: raise ValueError(f"Invalid duration unit: {unit}") curr_alert.lastReceived = datetime.now(timezone.utc).isoformat() if now - activeAt >= duration: curr_alert.status = AlertStatus.FIRING self.logger.info( "Alert transitioned to firing", extra={ "fingerprint": fingerprint, "duration_elapsed": str(now - activeAt), }, ) # Keep pending, update lastReceived else: curr_alert.status = AlertStatus.PENDING self.logger.debug( "Alert still pending", extra={ "fingerprint": fingerprint, "time_remaining": str(duration - (now - activeAt)), }, ) alerts_to_notify.append(curr_alert) # if alert is RESOLVED, add it to the list elif curr_alert.status == AlertStatus.RESOLVED.value: if not alert_still_exists: # if alert is not in current state, add it 
to the list alerts_to_notify.append(curr_alert) self.logger.debug( "Keeping resolved alert", extra={"fingerprint": fingerprint} ) else: # if its resolved and with _for, then it first need to be pending curr_alert.status = AlertStatus.PENDING curr_alert.lastReceived = datetime.now(timezone.utc).isoformat() alerts_to_notify.append(curr_alert) self.logger.info( "Resolved alert back to pending", extra={ "fingerprint": fingerprint, "last_received": curr_alert.lastReceived, }, ) # Handle new alerts not in current state for fingerprint, new_alert in state_alerts_map.items(): if fingerprint not in curr_alerts_map: # Brand new alert - set to PENDING new_alert.status = AlertStatus.PENDING new_alert.activeAt = datetime.now(timezone.utc).isoformat() alerts_to_notify.append(new_alert) self.logger.info( "New alert created", extra={"fingerprint": fingerprint, "activeAt": new_alert.activeAt}, ) self.logger.info( "Completed state alert handling", extra={"alerts_to_notify": len(alerts_to_notify)}, ) return alerts_to_notify def _handle_stateless_alerts( self, stateless_alerts: list[AlertDto], read_only=False ) -> list[AlertDto]: """ Handle alerts without PENDING state - just FIRING or RESOLVED. 
def _notify_alert(
    self,
    alert: dict | None = None,
    if_condition: str | None = None,
    for_duration: str | None = None,
    fingerprint_fields: list | None = None,
    override_source_with: str | None = None,
    read_only: bool = False,
    fingerprint: str | None = None,
    **kwargs,
) -> list:
    """
    Create or update Keep alerts from the results of the current workflow step.

    The candidate results are taken from the ``foreach`` items of the
    workflow context or, when there are none, from the results of the
    ``this`` step. Anything that is not a list is discarded with a warning;
    if no results remain and ``alert`` is given, ``alert`` itself is the
    single candidate. Candidates are optionally filtered with
    ``if_condition``, turned into alert DTOs, passed through the state
    (``for_duration``) or stateless handling and finally sent to
    ``process_event``.

    Args:
        alert: alert data to create (also used as the template rendered for
            every candidate)
        if_condition: condition to evaluate for alert creation
        for_duration: duration for state alerts
        fingerprint_fields: fields to use for alert fingerprinting
        override_source_with: override alert source
        read_only: if True, don't modify existing alerts
        fingerprint: alert fingerprint

    Returns:
        list of created/updated alerts (empty when nothing was triggered)
    """
    self.logger.debug("Starting _notify_alert")
    context = self.context_manager.get_full_context()
    alert_results = context.get("foreach", {}).get("items", None)

    # if foreach_context is provided, get alert results
    if alert_results:
        self.logger.debug(
            "Got alert results from foreach context",
            extra={"alert_results": alert_results},
        )
    # else, the last step results are the alert results
    else:
        # TODO: this is a temporary solution until we have a better way to get the alert results
        alert_results = context.get("steps", {}).get("this", {}).get("results", {})
        self.logger.info(
            "Got alert results from 'this' step",
            extra={"alert_results": alert_results},
        )

    # alert_results must be a list
    if not isinstance(alert_results, list):
        self.logger.warning(
            "Alert results must be a list, but got a non-list type",
            extra={"alert_results": alert_results},
        )
        alert_results = None

    # create_alert_in_keep.yml for example
    if not alert_results:
        self.logger.info("No alert results found")
        if alert:
            self.logger.info("Creating alert from 'alert' parameter")
            alert_results = [alert]

    self.logger.debug(
        "Got condition parameters",
        extra={
            "if": if_condition,
            "for": for_duration,
            "fingerprint_fields": fingerprint_fields,
        },
    )

    # if we need to check if_condition, handle the condition
    trigger_alerts = []
    if if_condition:
        self.logger.info(
            "Processing alerts with 'if' condition",
            extra={"condition": if_condition},
        )
        # if its multialert, handle each alert separately
        if isinstance(alert_results, list):
            self.logger.debug("Processing multiple alerts")
            for alert_result in alert_results:
                # render
                if_rendered = self.io_handler.render(
                    if_condition, safe=True, additional_context=alert_result
                )
                self.logger.debug(
                    "Rendered if condition",
                    extra={"original": if_condition, "rendered": if_rendered},
                )
                # evaluate
                if not self._evaluate_if(if_condition, if_rendered):
                    self.logger.debug(
                        "Alert did not meet condition",
                        extra={"alert": alert_result},
                    )
                    continue
                trigger_alerts.append(alert_result)
                self.logger.debug(
                    "Alert met condition", extra={"alert": alert_result}
                )
    # if no if_condition, trigger all alerts
    else:
        self.logger.info("No 'if' condition - triggering all alerts")
        # alert_results is None when there is nothing to trigger; fall back
        # to an empty list so the code below can iterate and call len().
        trigger_alerts = alert_results or []

    # build the alert dtos
    alert_dtos = []
    self.logger.info(
        "Building alert DTOs", extra={"trigger_count": len(trigger_alerts)}
    )
    # render alert data
    for alert_result in trigger_alerts:
        alert_data = copy.deepcopy(alert or {})
        # render alert data
        if isinstance(alert_result, dict):
            rendered_alert_data = self.io_handler.render_context(
                alert_data, additional_context=alert_result
            )
        else:
            self.logger.warning(
                "Alert data is not a dict, skipping rendering",
                extra={"alert_data": alert_data},
            )
            rendered_alert_data = alert_data
        self.logger.debug(
            "Rendered alert data",
            extra={"original": alert_data, "rendered": rendered_alert_data},
        )
        # render ternary expressions
        rendered_alert_data = self._handle_ternary_expressions(rendered_alert_data)
        # Hand every alert its own copy of the fingerprint fields so that
        # nothing appended while building one alert leaks into the next.
        alert_dto = self._build_alert(
            alert_result, list(fingerprint_fields or []), **rendered_alert_data
        )
        if override_source_with:
            alert_dto.source = [override_source_with]

        alert_dtos.append(alert_dto)
        self.logger.debug(
            "Built alert DTO", extra={"fingerprint": alert_dto.fingerprint}
        )

    # sanity check - if more than one alert has the same fingerprint it means something is wrong
    # this would happen if the fingerprint fields are not unique
    seen_fingerprints = set()
    for alert_dto in alert_dtos:
        if alert_dto.fingerprint in seen_fingerprints:
            self.logger.warning(
                "Alert with the same fingerprint already exists - it means your fingerprint labels are not unique",
                extra={"alert": alert_dto, "fingerprint": alert_dto.fingerprint},
            )
        seen_fingerprints.add(alert_dto.fingerprint)

    # if for_duration is provided, handle state alerts
    if for_duration:
        self.logger.info(
            "Handling state alerts with 'for' condition",
            extra={"for": for_duration},
        )
        # handle alerts with state
        alerts = self._handle_state_alerts(for_duration, alert_dtos)
    # else, handle all alerts
    else:
        self.logger.info("Handling stateless alerts")
        alerts = self._handle_stateless_alerts(alert_dtos, read_only=read_only)

    # handle all alerts
    self.logger.info(
        "Processing final alerts", extra={"number_of_alerts": len(alerts)}
    )
    process_event(
        ctx={},
        tenant_id=self.context_manager.tenant_id,
        provider_type="keep",
        provider_id=self.context_manager.workflow_id,  # so we can track the alerts that are created by this workflow
        fingerprint=fingerprint,
        api_key_name=None,
        trace_id=None,
        event=alerts,
    )
    self.logger.info(
        "Alerts processed successfully", extra={"alert_count": len(alerts)}
    )
    return alerts
def _notify(
    self,
    delete_all_other_workflows: bool = False,
    workflow_full_sync: bool = False,
    workflow_to_update_yaml: str | None = None,
    alert: dict | None = None,
    fingerprint_fields: list | None = None,
    override_source_with: str | None = None,
    read_only: bool = False,
    fingerprint: str | None = None,
    if_: str | None = None,
    for_: str | None = None,
    **kwargs,
):
    """
    Notify alerts or update workflow

    Args:
        delete_all_other_workflows: if True, delete all other workflows
        workflow_full_sync: if True, sync all workflows
        workflow_to_update_yaml: workflow yaml to update
        alert: alert data to create
        if_: condition to evaluate for alert creation (the legacy ``if``
            keyword is still accepted through ``kwargs``)
        for_: duration for state alerts (the legacy ``for`` keyword is still
            accepted through ``kwargs``)
        fingerprint_fields: fields to use for alert fingerprinting
        override_source_with: override alert source
        read_only: if True, don't modify existing alerts
        fingerprint: alert fingerprint

    Returns:
        The list of notified alerts when no workflow action was requested,
        otherwise ``None``.

    Raises:
        ProviderException: if the workflow YAML cannot be parsed or the
            workflow cannot be created; the original error is chained as
            the cause.
    """
    # TODO: refactor this to be two separate ProviderMethods, when wf engine will support calling provider methods
    is_workflow_action = (
        workflow_full_sync or delete_all_other_workflows or workflow_to_update_yaml
    )

    if workflow_full_sync or delete_all_other_workflows:
        # We need DB id, not user id for the workflow, so getting it from the wf execution.
        workflow_store = WorkflowStore()
        workflow_execution = workflow_store.get_workflow_execution(
            self.context_manager.tenant_id,
            self.context_manager.workflow_execution_id,
        )
        workflow_db_id = workflow_execution.workflow_id
        if workflow_db_id != "test":
            self._delete_workflows(except_workflow_id=workflow_db_id)
        else:
            self.logger.info(
                "Not deleting workflow as it's a test run",
            )

    if workflow_to_update_yaml:
        self.logger.info(
            "Updating workflow YAML",
            extra={"workflow_to_update_yaml": workflow_to_update_yaml},
        )
        workflowstore = WorkflowStore()
        # Create the workflow
        try:
            # In case the workflow has HTML entities:
            workflow_definition = yaml.safe_load(unescape(workflow_to_update_yaml))
            # Fail with a readable message instead of an opaque TypeError
            # when the YAML is not a mapping (e.g. empty or a bare scalar).
            if not isinstance(workflow_definition, dict):
                raise ValueError("workflow YAML must be a mapping")

            if "workflow" in workflow_definition:
                workflow_definition = workflow_definition["workflow"]

            workflow = workflowstore.create_workflow(
                tenant_id=self.context_manager.tenant_id,
                created_by=f"workflow id: {self.context_manager.workflow_id}",
                workflow=workflow_definition,
                force_update=False,
                lookup_by_name=True,
            )
            self.logger.info(
                "Workflow created successfully",
                extra={
                    "tenant_id": self.context_manager.tenant_id,
                    "workflow": workflow,
                },
            )
        except Exception as e:
            self.logger.exception(
                "Failed to create workflow",
                extra={
                    "tenant_id": self.context_manager.tenant_id,
                    "workflow": self.context_manager.workflow_id,
                },
            )
            # Chain the cause so the original traceback is not lost.
            raise ProviderException(f"Failed to create workflow: {e}") from e
    elif not is_workflow_action:
        self.logger.info("Notifying Alerts")
        # for backward compatibility
        if_condition = if_ or kwargs.get("if", None)
        for_duration = for_ or kwargs.get("for", None)
        alerts = self._notify_alert(
            alert=alert,
            if_condition=if_condition,
            for_duration=for_duration,
            fingerprint_fields=fingerprint_fields,
            override_source_with=override_source_with,
            read_only=read_only,
            fingerprint=fingerprint,
        )
        self.logger.info("Alerts notified")
        return alerts
""" pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: return AlertDto( **event, ) def _evaluate_if(self, if_conf, if_conf_rendered): # Evaluate the condition string from asteval import Interpreter aeval = Interpreter() evaluated_if_met = aeval(if_conf_rendered) # tb: when Shahar and I debugged, conclusion was: if isinstance(evaluated_if_met, str): evaluated_if_met = aeval(evaluated_if_met) # if the evaluation failed, raise an exception if aeval.error_msg: self.logger.error( f"Failed to evaluate if condition, you probably used a variable that doesn't exist. Condition: {if_conf}, Rendered: {if_conf_rendered}, Error: {aeval.error_msg}", extra={ "condition": if_conf, "rendered": if_conf_rendered, }, ) return False return evaluated_if_met def _handle_ternary_expressions(self, rendered_providers_parameters): """ Handle ternary expressions in rendered parameters without using js2py. Parses and evaluates expressions like: "x > 0.9 ? 'critical' : x > 0.7 ? 'warning' : 'info'" Args: rendered_providers_parameters (dict): Dictionary of rendered parameters Returns: dict: Updated parameters with evaluated ternary expressions """ from asteval import Interpreter def evaluate_ternary(expression, aeval): """Recursively evaluate a ternary expression using Python.""" # Find the position of the first question mark that's not inside quotes in_quotes = False quote_type = None question_pos = -1 for i, char in enumerate(expression): if char in ['"', "'"]: if not in_quotes: in_quotes = True quote_type = char elif char == quote_type: in_quotes = False if char == "?" 
and not in_quotes: question_pos = i break if question_pos == -1: # No ternary operator found, evaluate as regular expression return aeval(expression) # Find the matching colon colon_pos = -1 nested_level = 0 for i in range(question_pos + 1, len(expression)): char = expression[i] if char in ['"', "'"]: if not in_quotes: in_quotes = True quote_type = char elif char == quote_type: in_quotes = False if not in_quotes: if char == "?": nested_level += 1 elif char == ":": if nested_level == 0: colon_pos = i break else: nested_level -= 1 if colon_pos == -1: # Malformed ternary expression self.logger.warning( f"Malformed ternary expression: {expression}", extra={"expression": expression}, ) return expression # Split into condition, true_expr, and false_expr condition = expression[:question_pos].strip() true_expr = expression[question_pos + 1 : colon_pos].strip() false_expr = expression[colon_pos + 1 :].strip() # Evaluate the condition condition_result = aeval(condition) # Evaluate the appropriate branch (true or false) if condition_result: return evaluate_ternary(true_expr, aeval) else: return evaluate_ternary(false_expr, aeval) # Process each parameter value for key, value in rendered_providers_parameters.items(): if not isinstance(value, str): continue # Check if the value might contain a ternary expression if "?" in value and ":" in value: try: aeval = Interpreter() result = evaluate_ternary(value, aeval) # If there were errors during evaluation, log them but keep the original value if aeval.error_msg: self.logger.warning( f"Error evaluating ternary expression: {value}. Error: {aeval.error_msg}", extra={"value": value, "error": aeval.error_msg}, ) else: rendered_providers_parameters[key] = result except Exception as e: self.logger.warning( f"Failed to evaluate potential ternary expression: {value}. 
Error: {str(e)}", extra={"value": value, "error": str(e)}, ) return rendered_providers_parameters if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables ================================================ FILE: keep/providers/kibana_provider/__init__.py ================================================ ================================================ FILE: keep/providers/kibana_provider/kibana_provider.py ================================================ """ Kibana provider. """ import dataclasses import datetime import json import logging import uuid from typing import Literal, Union from urllib.parse import urlparse import pydantic import requests from fastapi import HTTPException from packaging.version import Version from starlette.datastructures import FormData from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory from keep.validation.fields import UrlPort @pydantic.dataclasses.dataclass class KibanaProviderAuthConfig: """Kibana authentication configuration.""" api_key: str = dataclasses.field( metadata={ "required": True, "description": "Kibana API Key", "sensitive": True, } ) kibana_host: pydantic.AnyHttpUrl = dataclasses.field( metadata={ "required": True, "description": "Kibana Host", "hint": "https://keep.kb.us-central1.gcp.cloud.es.io", "validation": "any_http_url", } ) kibana_port: UrlPort = dataclasses.field( metadata={ "required": False, "description": "Kibana Port (defaults to 9243)", "validation": "port", }, default=9243, ) class KibanaProvider(BaseProvider): """Enrich alerts with data 
from Kibana.""" PROVIDER_CATEGORY = ["Monitoring", "Developer Tools"] DEFAULT_TIMEOUT = 10 WEBHOOK_PAYLOAD = json.dumps( { "webhook_body": { "context_info": "{{#context}}{{.}}{{/context}}", "alert_info": "{{#alert}}{{.}}{{/alert}}", "rule_info": "{{#rule}}{{.}}{{/rule}}", } } ) SIEM_WEBHOOK_PAYLOAD = """{{#context.alerts}}{{{.}}}{{/context.alerts}}""" # Mock payloads for validating scopes MOCK_ALERT_PAYLOAD = { "name": "keep-test-alert", "schedule": {"interval": "1m"}, "rule_type_id": "observability.rules.custom_threshold", "consumer": "logs", "enabled": False, "params": { "criteria": [], "searchConfiguration": { "query": {"query": "*", "language": "kuery"}, "index": "", }, }, } MOCK_CONNECTOR_PAYLOAD = { "name": "keep-test-connector", "config": { "hasAuth": False, "method": "post", "url": "https://api.keephq.dev", "authType": False, "headers": {}, }, "secrets": {}, "connector_type_id": ".webhook", } PROVIDER_SCOPES = [ ProviderScope( name="rulesSettings:read", description="Read alerts", mandatory=True, alias="Read Alerts", ), ProviderScope( name="rulesSettings:write", description="Modify alerts", mandatory=True, alias="Modify Alerts", ), ProviderScope( name="actions:read", description="Read connectors", mandatory=True, alias="Read Connectors", ), ProviderScope( name="actions:write", description="Write connectors", mandatory=True, alias="Write Connectors", ), ] SEVERITIES_MAP = {} STATUS_MAP = { "active": AlertStatus.FIRING, "Alert": AlertStatus.FIRING, "recovered": AlertStatus.RESOLVED, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) @staticmethod def parse_event_raw_body(raw_body: Union[bytes, dict, FormData]) -> dict: """ Parse the raw body from various input types into a dictionary. 
Args: raw_body: Can be bytes, dict, or FormData Returns: dict: Parsed event data Raises: ValueError: If the input type is not supported or parsing fails """ # Handle FormData if hasattr(raw_body, "_list") and hasattr( raw_body, "getlist" ): # Check if it's FormData # Convert FormData to dict form_dict = {} for key, value in raw_body.items(): # Handle multiple values for the same key existing_value = form_dict.get(key) if existing_value is not None: if isinstance(existing_value, list): existing_value.append(value) else: form_dict[key] = [existing_value, value] else: form_dict[key] = value # If there's a 'payload' field that's a string, try to parse it as JSON if "payload" in form_dict and isinstance(form_dict["payload"], str): try: form_dict["payload"] = json.loads(form_dict["payload"]) except json.JSONDecodeError: pass # Keep the original string if it's not valid JSON return form_dict # Handle bytes if isinstance(raw_body, bytes): # Handle the Kibana escape issue if b'"payload": "{' in raw_body: raw_body = raw_body.replace(b'"payload": "{', b'"payload": {') raw_body = raw_body.replace(b'}",', b"},") return json.loads(raw_body) # Handle dict if isinstance(raw_body, dict): return raw_body raise ValueError(f"Unsupported raw_body type: {type(raw_body)}") def validate_scopes(self) -> dict[str, bool | str]: """ Validate the scopes of the provider. 
Returns: dict[str, bool | str]: A dictionary of scopes and whether they are valid or not """ validated_scopes = {} connector = None alert = None for scope in self.PROVIDER_SCOPES: try: if scope.name == "rulesSettings:read": self.request( "GET", "api/alerting/rules/_find", params={"per_page": 1} ) elif scope.name == "rulesSettings:write": alert = self.request( "POST", "api/alerting/rule", json=self.MOCK_ALERT_PAYLOAD ) if not alert: raise Exception("Failed validating rulesSettings:write") self.request("DELETE", f"api/alerting/rule/{alert.get('id')}") elif scope.name == "actions:read": self.request("GET", "api/actions/connectors") elif scope.name == "actions:write": connector = self.request( "POST", "api/actions/connector", json=self.MOCK_CONNECTOR_PAYLOAD, ) if not connector: raise Exception("Failed validating actions:write") self.request( "DELETE", f"api/actions/connector/{connector.get('id')}" ) except HTTPException as e: self.logger.exception( "Failed validating scope", extra={ "scope": scope.name, "error": e.detail, "tenant_id": self.context_manager.tenant_id, "connector": connector, "alert": alert, }, ) if e.status_code == 403 or e.status_code == 401: validated_scopes[scope.name] = e.detail # this means we faild on something else which is not permissions and it's probably ok. pass except Exception as e: self.logger.exception( "Failed validating scope", extra={ "scope": scope.name, "error": e, "tenant_id": self.context_manager.tenant_id, "connector": connector, "alert": alert, }, ) validated_scopes[scope.name] = str(e) continue validated_scopes[scope.name] = True return validated_scopes def request( self, method: Literal["GET", "POST", "PUT", "DELETE"], uri: str, **kwargs ) -> dict: """ Make a request to Kibana. Adds the API key to the headers. Args: method (POST|GET|PUT|DELETE): The HTTP method uri (str): The URI to request. This is relative to the Kibana host. (e.g. 
api/actions/connector) Raises: HTTPException: If the request fails Returns: dict: The response JSON """ headers = kwargs.pop("headers", {}) headers["Authorization"] = f"ApiKey {self.authentication_config.api_key}" headers["kbn-xsrf"] = "reporting" response: requests.Response = getattr(requests, method.lower())( f"{self.authentication_config.kibana_host}:{self.authentication_config.kibana_port}/{uri}", headers=headers, **kwargs, ) if not response.ok: response_json: dict = response.json() raise HTTPException( response_json.get("statusCode", 404), detail=response_json.get("message"), ) try: return response.json() except requests.JSONDecodeError: return {} def __setup_webhook_alerts(self, tenant_id: str, keep_api_url: str, api_key: str): """ Setup the webhook alerts for Kibana Alerting. Args: tenant_id (str): The tenant ID keep_api_url (str): The URL of the Keep API api_key (str): The API key of the Keep API """ # Check kibana version kibana_version = ( self.request("GET", "api/status").get("version", {}).get("number") ) rule_types = self.request("GET", "api/alerting/rule_types") rule_types = {rule_type["id"]: rule_type for rule_type in rule_types} # if not version, assume < 8 for backwards compatibility if not kibana_version: kibana_version = "7.0.0" # First get all existing connectors and check if we're already installed: connectors = self.request("GET", "api/actions/connectors") connector_name = f"keep-{tenant_id}" connector = next( iter( [ connector for connector in connectors if connector["name"] == connector_name ] ), None, ) if connector: self.logger.info( "Connector already exists, updating", extra={"connector_id": connector["id"]}, ) # this means we already have a connector installed, so we just need to update it config: dict = connector["config"] config["url"] = keep_api_url config["headers"] = { "X-API-KEY": api_key, "Content-Type": "application/json", } self.request( "PUT", f"api/actions/connector/{connector['id']}", json={ "config": config, "name": 
connector_name, }, ) else: self.logger.info("Connector does not exist, creating") # we need to create a new connector body = { "name": connector_name, "config": { "hasAuth": False, "method": "post", "url": keep_api_url, "authType": None, "headers": { "X-API-KEY": api_key, "Content-Type": "application/json", }, }, "secrets": {}, "connector_type_id": ".webhook", } connector = self.request("POST", "api/actions/connector", json=body) self.logger.info( "Connector created", extra={"connector_id": connector["id"]} ) connector_id = connector["id"] # Now we need to update all the alerts and add actions that use this connector self.logger.info("Updating alerts") alert_rules = self.request( "GET", "api/alerting/rules/_find", params={"per_page": 1000}, # TODO: pagination ) for alert_rule in alert_rules.get("data", []): self.logger.info(f"Updating alert {alert_rule['id']}") alert_actions = alert_rule.get("actions") or [] # kibana 8: # pop any connector_type_id if Version(kibana_version) > Version("8.0.0"): for action in alert_actions: action.pop("connector_type_id", None) keep_action_exists = any( iter( [ action for action in alert_actions if action.get("id") == connector_id ] ) ) if keep_action_exists: # This alert was already modified by us / manually added self.logger.info(f"Alert {alert_rule['id']} already updated, skipping") continue rule_type_id = alert_rule.get("rule_type_id") action_groups = rule_types.get(alert_rule["rule_type_id"], {}).get( "action_groups", [] ) for action_group in action_groups: alert_actions.append( { "group": action_group.get("id"), "id": connector_id, "params": { # SIEM can use a different payload for more context "body": ( KibanaProvider.WEBHOOK_PAYLOAD if "siem" not in rule_type_id else KibanaProvider.SIEM_WEBHOOK_PAYLOAD ) }, "frequency": { "notify_when": "onActionGroupChange", "throttle": None, "summary": False, }, "uuid": str(uuid.uuid4()), } ) try: self.request( "PUT", f"api/alerting/rule/{alert_rule['id']}", json={ "actions": alert_actions, 
"name": alert_rule["name"], "tags": alert_rule["tags"], "schedule": alert_rule["schedule"], "params": alert_rule["params"], }, ) self.logger.info(f"Updated alert {alert_rule['id']}") except HTTPException as e: self.logger.warning( f"Failed to update alert {alert_rule['id']}", extra={"error": e.detail}, ) self.logger.info("Done updating alerts") def __setup_watcher_alerts(self, tenant_id: str, keep_api_url: str, api_key: str): """ Setup the webhook alerts for Kibana Watcher. Args: tenant_id (str): The tenant ID keep_api_url (str): The URL of the Keep API api_key (str): The API key of the Keep API """ parsed_keep_url = urlparse(keep_api_url) keep_host = parsed_keep_url.netloc keep_port = 80 if "localhost" in keep_host else 443 self.logger.info("Getting and updating all watches") watches = self.request( "POST", "api/console/proxy?path=%2F_watcher%2F_query%2Fwatches&method=GET" ) for watch in watches.get("watches", []): watch_id = watch.get("_id") self.logger.info(f"Handling watch with id {watch_id}") watch = self.request( "POST", f"api/console/proxy?path=%2F_watcher%2Fwatch%2F{watch_id}&method=GET", ).get("watch") actions = watch.get("actions", {}) actions[f"keep-{tenant_id}"] = { "webhook": { "scheme": "https" if keep_port == 443 else "http", "host": keep_host, "port": keep_port, "method": "post", "path": f"{parsed_keep_url.path}", "params": {}, "headers": {}, "auth": {"basic": {"username": "keep", "password": api_key}}, "body": '{"payload": "{{#toJson}}ctx{{/toJson}}", "status": "Alert"}', } } self.request( "POST", f"api/console/proxy?path=%2F_watcher%2Fwatch%2F{watch_id}&method=PUT", json={**watch}, ) self.logger.info(f"Finished handling watch with id {watch_id}") self.logger.info("Done getting and updating all watches") def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): """ Setup the webhook for Kibana. 
def validate_config(self):
    """
    Build ``self.authentication_config`` from the raw provider configuration.

    For installed or provisioned providers a ``kibana_host`` without an
    explicit scheme is prefixed first: ``http://`` when the host mentions
    ``localhost`` or ``127.0.0.1``, ``https://`` otherwise. The normalized
    value is written back into ``self.config.authentication``.
    """
    needs_normalization = self.is_installed or self.is_provisioned
    if needs_normalization:
        host = self.config.authentication["kibana_host"]
        has_scheme = host.startswith("http://") or host.startswith("https://")
        if not has_scheme:
            if "localhost" in host or "127.0.0.1" in host:
                prefix = "http://"
            else:
                prefix = "https://"
            self.config.authentication["kibana_host"] = prefix + host

    # Validation of the (possibly normalized) values happens in the config class
    self.authentication_config = KibanaProviderAuthConfig(
        **self.config.authentication
    )
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> "AlertDto | list[AlertDto]":
    """
    Formats an alert from Kibana to a standard format, supporting Watcher,
    SIEM, and both the old and new Kibana Alerting webhook formats.

    Dispatch is by the keys present in ``event``:
      * ``payload``      -> Kibana Watcher (delegated to ``format_alert_from_watcher``)
      * ``kibana``       -> SIEM alert
      * ``webhook_body`` -> new webhook format, converted to the legacy flat
        layout and then handled like a legacy event
      * anything else    -> legacy webhook format

    Note: ``event`` is modified in place.

    Args:
        event (dict): The event from Kibana
        provider_instance: The provider instance (optional, unused here)

    Returns:
        AlertDto | list[AlertDto]: The alert in a standard format
    """
    logger = logging.getLogger(__name__)

    # If this is coming from Kibana Watcher
    if "payload" in event:
        return KibanaProvider.format_alert_from_watcher(event)

    # SIEM alert
    if "kibana" in event:
        logger.info("Parsing SIEM Kibana alert")
        kibana_alert = event.get("kibana", {}).get("alert", {})
        rule = kibana_alert.get("rule", {})

        description = rule.get("description", "")
        if not description:
            logger.warning("Could not find description in SIEM Kibana alert")

        name = rule.get("name", "")
        if not name:
            logger.warning("Could not find name in SIEM Kibana alert")
            name = "SIEM Kibana Alert"

        fingerprint = kibana_alert.get("id", "")

        status = kibana_alert.get("status", "")
        if not status:
            logger.warning("Could not find status in SIEM Kibana alert")
            # Default the status; this used to overwrite the alert name
            status = "active"
        # use map
        status = KibanaProvider.STATUS_MAP.get(status, AlertStatus.FIRING)

        severity = kibana_alert.get("severity", "could not find severity")
        # use map
        severity = KibanaProvider.SEVERITIES_MAP.get(severity, AlertSeverity.INFO)

        # `service` / `url` are expected to be objects ({"name": ...} /
        # {"full": ...}); tolerate plain values instead of crashing on .get
        service = event.pop("service", None)
        if isinstance(service, dict):
            service = service.get("name")
        url = event.pop("url", None)
        if isinstance(url, dict):
            url = url.get("full")

        if not isinstance(url, str):
            logger.warning(
                "Could not extract url in SIEM Kibana alert", extra={"url": url}
            )
            url = None
        if not isinstance(service, str):
            logger.warning(
                "Could not extract service in SIEM Kibana alert",
                extra={"service": service},
            )
            service = None

        alert_dto = AlertDto(
            name=name,
            description=description,
            status=status,
            severity=severity,
            source=["kibana"],
            service=service,
            url=url,
            **event,
        )
        if fingerprint:
            alert_dto.fingerprint = fingerprint
        logger.info("Finished to parse SIEM Kibana alert")
        return alert_dto

    # New Kibana webhook format
    if "webhook_body" in event:
        # Parse the JSON strings from the new format
        try:
            webhook_body = event["webhook_body"]
            context_info = json.loads(webhook_body["context_info"])
            alert_info = json.loads(webhook_body["alert_info"])
            rule_info = json.loads(webhook_body["rule_info"])

            # Construct event dict in old format for compatibility
            event = {
                "actionGroup": alert_info.get("actionGroup"),
                "status": alert_info.get("actionGroupName"),
                "actionSubgroup": alert_info.get("actionSubgroup"),
                "isFlapping": alert_info.get("flapping"),
                "kibana_alert_id": alert_info.get("id"),
                "fingerprint": alert_info.get("uuid"),
                "url": context_info.get("alertDetailsUrl"),
                "context.message": context_info.get("message"),
                "context.hits": context_info.get("matchingDocuments"),
                "context.link": context_info.get("viewInAppUrl"),
                "context.query": rule_info.get("params", {}).get("criteria"),
                "context.title": rule_info.get("name"),
                "description": context_info.get("reason"),
                "lastReceived": context_info.get("timestamp"),
                "ruleId": rule_info.get("id"),
                "rule.spaceId": rule_info.get("spaceId"),
                "ruleUrl": rule_info.get("url"),
                "ruleTags": rule_info.get("tags", []),
                "name": rule_info.get("name"),
                "rule.type": rule_info.get("type"),
            }
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Missing sections or already-decoded values are handled like
            # invalid JSON: log and fall through to process as old format
            logger.error(f"Error parsing new webhook format: {e}")

    # Process tags and labels (works for both old and new formats)
    labels = {}
    for tags_key in ("ruleTags", "contextTags"):
        # `or []` covers an explicit null tag list
        for tag in event.get(tags_key) or []:
            if isinstance(tag, str) and "=" in tag:
                key, value = tag.split("=", 1)
                labels[key] = value

    environment = labels.get("environment", "undefined")

    # Format status and severity
    event["status"] = KibanaProvider.STATUS_MAP.get(
        event.get("status"), AlertStatus.FIRING
    )
    event["severity"] = KibanaProvider.SEVERITIES_MAP.get(
        event.get("severity"), AlertSeverity.INFO
    )

    # Handle URL fallback
    if not event.get("url"):
        event["url"] = event.get("ruleUrl")
        if not event.get("url"):
            event.pop("url", None)

    # `or` (not dict.get defaults) so that keys present with a None/empty
    # value still fall back to the next candidate
    event["name"] = (
        event.get("name")
        or event.get("rule.name")
        or event.get("ruleId")
        or event.get("message")
    )
    # if its still empty, set a default name
    if not event.get("name"):
        event["name"] = "Kibana Alert [Could not extract name]"

    return AlertDto(
        environment=environment,
        labels=labels,
        source=["kibana"],
        **event,
    )
keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class KubernetesProviderAuthConfig: """Kubernetes authentication configuration.""" api_server: pydantic.AnyHttpUrl = dataclasses.field( default=None, metadata={ "name": "api_server", "description": "The kubernetes api server url", "required": False, "sensitive": False, "validation": "any_http_url", }, ) token: str = dataclasses.field( default=None, metadata={ "name": "token", "description": "Bearer token to access kubernetes (leave empty for in-cluster auth)", "required": False, "sensitive": True, }, ) insecure: bool = dataclasses.field( default=True, metadata={ "name": "insecure", "description": "Skip TLS verification", "required": False, "sensitive": False, "type": "switch", }, ) use_in_cluster_config: bool = dataclasses.field( default=False, metadata={ "name": "use_in_cluster_config", "description": "Use in-cluster configuration (ServiceAccount)", "required": False, "sensitive": False, "type": "switch", }, ) class KubernetesProvider(BaseProvider): """Perform actions like rollout restart objects or list pods on Kubernetes.""" provider_id: str PROVIDER_DISPLAY_NAME = "Kubernetes" PROVIDER_CATEGORY = ["Cloud Infrastructure", "Developer Tools"] PROVIDER_SCOPES = [ ProviderScope( name="connect_to_kubernetes", description="Check if the provided token can connect to the kubernetes server", mandatory=True, alias="Connect to the kubernetes", ) ] def __init__(self, context_manager, provider_id: str, config: ProviderConfig): super().__init__(context_manager, provider_id, config) self.authentication_config = None self.validate_config() def dispose(self): """Dispose the provider.""" pass def validate_config(self): """ Validate the required configuration for the Kubernetes provider. 
""" if self.config.authentication is None: self.config.authentication = {} self.authentication_config = KubernetesProviderAuthConfig( **self.config.authentication ) def __create_k8s_client(self): """ Create a Kubernetes client. """ # Case 1: Manual configuration (API Server + Token) if self.authentication_config.api_server and self.authentication_config.token: client_configuration = client.Configuration() client_configuration.host = str(self.authentication_config.api_server) client_configuration.verify_ssl = not self.authentication_config.insecure client_configuration.api_key = { "authorization": "Bearer " + self.authentication_config.token } return client.ApiClient(client_configuration) # Case 2: In-cluster configuration (ServiceAccount) try: from kubernetes import config as k8s_config k8s_config.load_incluster_config() return client.ApiClient() except Exception as e: self.logger.error(f"Failed to load in-cluster config: {str(e)}") # Fallback to load default kubeconfig if exists try: from kubernetes import config as k8s_config k8s_config.load_kube_config() return client.ApiClient() except Exception as e: self.logger.error(f"Failed to load kube config: {str(e)}") raise Exception( "Kubernetes provider requires either manual configuration (API Server + Token) or in-cluster configuration (ServiceAccount)." ) def validate_scopes(self): """ Validate that the provided token has the required scopes to use the provider. """ self.logger.info("Validating scopes for Kubernetes provider") try: self.__create_k8s_client() self.logger.info("Successfully connected to the Kubernetes server") scopes = { "connect_to_kubernetes": True, } except Exception as e: self.logger.error(f"Failed to connect to the Kubernetes server: {str(e)}") scopes = { "connect_to_kubernetes": str(e), } return scopes def _query(self, command_type: str, **kwargs): """ Query Kubernetes resources. Args: command_type (str): The type of query to perform. 
def _notify(self, action: str, **kwargs):
    """
    Run a mutating action against the cluster.

    Args:
        action (str): Name of the action. One of:
            - rollout_restart: Restart a deployment/statefulset/daemonset
            - restart_pod: Restart a specific pod
            - cordon_node: Mark node as unschedulable
            - uncordon_node: Mark node as schedulable
            - drain_node: Safely evict pods from node
            - scale_deployment: Scale deployment up/down
            - scale_statefulset: Scale statefulset up/down
            - exec_pod_command: Execute command in pod
        **kwargs: Forwarded unchanged to the selected handler.

    Returns:
        Whatever the selected handler returns.

    Raises:
        NotImplementedError: If ``action`` is not one of the names above.
    """
    # Handlers are wrapped in lambdas so that only the selected private
    # method is looked up, exactly like the branch-per-action form.
    dispatch = (
        ("rollout_restart", lambda: self.__rollout_restart(**kwargs)),
        ("restart_pod", lambda: self.__restart_pod(**kwargs)),
        ("cordon_node", lambda: self.__cordon_node(**kwargs)),
        ("uncordon_node", lambda: self.__uncordon_node(**kwargs)),
        ("drain_node", lambda: self.__drain_node(**kwargs)),
        ("scale_deployment", lambda: self.__scale_deployment(**kwargs)),
        ("scale_statefulset", lambda: self.__scale_statefulset(**kwargs)),
        ("exec_pod_command", lambda: self.__exec_pod_command(**kwargs)),
    )
    for action_name, run in dispatch:
        if action == action_name:
            return run()
    raise NotImplementedError(f"Action {action} is not implemented")
""" self.logger.info(f"Getting logs for pod {pod_name} in namespace {namespace}") core_v1 = client.CoreV1Api(api_client) try: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, pretty=True, ) return logs.splitlines() except UnicodeEncodeError: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, ) return logs.splitlines() except ApiException as e: self.logger.error(f"Error getting logs for pod {pod_name}: {e}") raise Exception(f"Error getting logs for pod {pod_name}: {e}") def __get_deployment_logs( self, api_client, namespace, deployment_name, container_name=None, tail_lines=100, **kwargs, ): """ Get logs from all pods in a deployment. """ self.logger.info(f"Getting logs for deployment {deployment_name} in namespace {namespace}") # First get pods for the deployment core_v1 = client.CoreV1Api(api_client) apps_v1 = client.AppsV1Api(api_client) try: # Get deployment to find its selector deployment = apps_v1.read_namespaced_deployment( name=deployment_name, namespace=namespace ) # Build label selector from deployment's selector match_labels = deployment.spec.selector.match_labels label_selector = ",".join([f"{k}={v}" for k, v in match_labels.items()]) # Get pods matching the selector pods = core_v1.list_namespaced_pod( namespace=namespace, label_selector=label_selector ) deployment_logs = {} for pod in pods.items: pod_name = pod.metadata.name try: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, pretty=True, ) deployment_logs[pod_name] = logs.splitlines() except UnicodeEncodeError: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, ) deployment_logs[pod_name] = logs.splitlines() except ApiException as pod_e: self.logger.warning(f"Could not get logs for pod {pod_name}: {pod_e}") 
deployment_logs[pod_name] = [f"Error getting logs: {pod_e}"] return deployment_logs except ApiException as e: self.logger.error(f"Error getting deployment logs for {deployment_name}: {e}") raise Exception(f"Error getting deployment logs for {deployment_name}: {e}") def __get_events( self, api_client, namespace, pod_name=None, sort_by=None, **kwargs ): """ Get events for a namespace or specific pod. """ self.logger.info( f"Getting events in namespace {namespace}" + (f" for pod {pod_name}" if pod_name else ""), extra={ "pod_name": pod_name, "namespace": namespace, "sort_by": sort_by, "tenant_id": self.context_manager.tenant_id, "workflow_id": self.context_manager.workflow_id, }, ) core_v1 = client.CoreV1Api(api_client) try: if pod_name: # Get the pod to find its UID pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace) field_selector = f"involvedObject.kind=Pod,involvedObject.name={pod_name},involvedObject.uid={pod.metadata.uid}" else: field_selector = f"metadata.namespace={namespace}" events = core_v1.list_namespaced_event( namespace=namespace, field_selector=field_selector, ) if sort_by: self.logger.info( f"Sorting events by {sort_by}", extra={"sort_by": sort_by, "events_count": len(events.items)}, ) try: sorted_events = sorted( events.items, key=lambda event: getattr(event, sort_by, None), reverse=True, ) return sorted_events except Exception: self.logger.exception( f"Error sorting events by {sort_by}", extra={ "sort_by": sort_by, "events_count": len(events.items), "tenant_id": self.context_manager.tenant_id, "workflow_id": self.context_manager.workflow_id, }, ) # Convert events to dict return [event.to_dict() for event in events.items] except ApiException as e: self.logger.exception( "Error getting events", extra={ "tenant_id": self.context_manager.tenant_id, "workflow_id": self.context_manager.workflow_id, }, ) raise Exception(f"Error getting events: {e}") from e def __get_nodes(self, api_client, label_selector=None, return_full=False, **kwargs): 
""" List all nodes in the cluster. Args: return_full (bool): If True, return full node objects as dicts. If False (default), return only basic info. """ self.logger.info("Listing all nodes in the cluster") core_v1 = client.CoreV1Api(api_client) try: nodes = core_v1.list_node(label_selector=label_selector) if return_full: return [node.to_dict() for node in nodes.items] else: # Return basic info: name, status, labels basic_info = [] for node in nodes.items: info = { "name": node.metadata.name, "labels": node.metadata.labels, "status": node.status.conditions[-1].type if node.status.conditions else None, "addresses": [addr.address for addr in node.status.addresses] if node.status.addresses else [], } basic_info.append(info) return basic_info except ApiException as e: self.logger.error(f"Error listing nodes: {e}") raise Exception(f"Error listing nodes: {e}") def __get_pods(self, api_client, namespace=None, label_selector=None, **kwargs): """ List pods in a namespace or across all namespaces. """ core_v1 = client.CoreV1Api(api_client) try: if namespace: self.logger.info(f"Listing pods in namespace {namespace}") pods = core_v1.list_namespaced_pod( namespace=namespace, label_selector=label_selector ) else: self.logger.info("Listing pods across all namespaces") pods = core_v1.list_pod_for_all_namespaces( label_selector=label_selector ) return [pod.to_dict() for pod in pods.items] except ApiException as e: self.logger.error(f"Error listing pods: {e}") raise Exception(f"Error listing pods: {e}") def __get_node_pressure(self, api_client, **kwargs): """ Get node pressure conditions (Memory, Disk, PID). 
""" self.logger.info("Getting node pressure conditions") core_v1 = client.CoreV1Api(api_client) try: nodes = core_v1.list_node(watch=False) node_pressures = [] for node in nodes.items: pressures = { "name": node.metadata.name, "conditions": [], } for condition in node.status.conditions: if condition.type in [ "MemoryPressure", "DiskPressure", "PIDPressure", ]: pressures["conditions"].append(condition.to_dict()) node_pressures.append(pressures) return node_pressures except ApiException as e: self.logger.error(f"Error getting node pressures: {e}") raise Exception(f"Error getting node pressures: {e}") def __get_pvc(self, api_client, namespace=None, **kwargs): """ List persistent volume claims in a namespace or across all namespaces. """ core_v1 = client.CoreV1Api(api_client) try: if namespace: self.logger.info(f"Listing PVCs in namespace {namespace}") pvcs = core_v1.list_namespaced_persistent_volume_claim( namespace=namespace ) else: self.logger.info("Listing PVCs across all namespaces") pvcs = core_v1.list_persistent_volume_claim_for_all_namespaces() return [pvc.to_dict() for pvc in pvcs.items] except ApiException as e: self.logger.error(f"Error listing PVCs: {e}") raise Exception(f"Error listing PVCs: {e}") def __get_services(self, api_client, namespace=None, return_full=False, **kwargs): """ List services in a namespace or across all namespaces. Args: return_full (bool): If True, return full service objects as dicts. If False (default), return only the service names. 
""" core_v1 = client.CoreV1Api(api_client) try: if namespace: self.logger.info(f"Listing services in namespace {namespace}") services = core_v1.list_namespaced_service(namespace=namespace) else: self.logger.info("Listing services across all namespaces") services = core_v1.list_service_for_all_namespaces() if return_full: # Sanitize the services data to ensure JSON serialization sanitized_services = [] for service in services.items: service_dict = service.to_dict() # Convert any datetime objects to strings def sanitize_dict(obj): if isinstance(obj, dict): return {k: sanitize_dict(v) for k, v in obj.items()} elif isinstance(obj, list): return [sanitize_dict(item) for item in obj] elif hasattr(obj, 'isoformat'): # datetime objects return obj.isoformat() elif obj is None: return None else: return obj sanitized_service = sanitize_dict(service_dict) sanitized_services.append(sanitized_service) return sanitized_services else: # Return only service names return [service.metadata.name for service in services.items] except ApiException as e: self.logger.error(f"Error listing services: {e}") raise Exception(f"Error listing services: {e}") def __get_deployments(self, api_client, namespace=None, return_full=False, **kwargs): """ List deployments in a namespace or across all namespaces. 
""" apps_v1 = client.AppsV1Api(api_client) try: if namespace: self.logger.info(f"Listing deployments in namespace {namespace}") deployments = apps_v1.list_namespaced_deployment(namespace=namespace) else: self.logger.info("Listing deployments across all namespaces") deployments = apps_v1.list_deployment_for_all_namespaces() if return_full: return [deployment.to_dict() for deployment in deployments.items] else: return [deployment.metadata.name for deployment in deployments.items] except ApiException as e: self.logger.error(f"Error listing deployments: {e}") raise Exception(f"Error listing deployments: {e}") def __get_statefulsets(self, api_client, namespace=None, return_full=False, **kwargs): """ List statefulsets in a namespace or across all namespaces. """ apps_v1 = client.AppsV1Api(api_client) try: if namespace: self.logger.info(f"Listing statefulsets in namespace {namespace}") statefulsets = apps_v1.list_namespaced_stateful_set(namespace=namespace) else: self.logger.info("Listing statefulsets across all namespaces") statefulsets = apps_v1.list_stateful_set_for_all_namespaces() if return_full: return [statefulset.to_dict() for statefulset in statefulsets.items] else: return [statefulset.metadata.name for statefulset in statefulsets.items] except ApiException as e: self.logger.error(f"Error listing statefulsets: {e}") raise Exception(f"Error listing statefulsets: {e}") def __get_daemonsets(self, api_client, namespace=None, return_full=False, **kwargs): """ List daemonsets in a namespace or across all namespaces. 
""" apps_v1 = client.AppsV1Api(api_client) try: if namespace: self.logger.info(f"Listing daemonsets in namespace {namespace}") daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace) else: self.logger.info("Listing daemonsets across all namespaces") daemonsets = apps_v1.list_daemon_set_for_all_namespaces() except ApiException as e: self.logger.error(f"Error listing daemonsets: {e}") raise Exception(f"Error listing daemonsets: {e}") if return_full: return [daemonset.to_dict() for daemonset in daemonsets.items] else: return [daemonset.metadata.name for daemonset in daemonsets.items] def __get_namespaces(self, api_client, return_full=False, **kwargs): """ List all namespaces. Args: return_full (bool): If True, return full namespace objects as dicts. If False (default), return only the names. """ self.logger.info("Listing namespaces") core_v1 = client.CoreV1Api(api_client) try: namespaces = core_v1.list_namespace() if return_full: return [namespace.to_dict() for namespace in namespaces.items] else: return [namespace.metadata.name for namespace in namespaces.items] except ApiException as e: self.logger.error(f"Error listing namespaces: {e}") raise Exception(f"Error listing namespaces: {e}") def __get_ingresses(self, api_client, namespace=None, return_full=False, **kwargs): """ List ingresses in a namespace or across all namespaces. Args: return_full (bool): If True, return full ingress objects as dicts. If False (default), return only the names. 
""" networking_v1 = client.NetworkingV1Api(api_client) try: if namespace: self.logger.info(f"Listing ingresses in namespace {namespace}") ingresses = networking_v1.list_namespaced_ingress(namespace=namespace) else: self.logger.info("Listing ingresses across all namespaces") ingresses = networking_v1.list_ingress_for_all_namespaces() if return_full: return [ingress.to_dict() for ingress in ingresses.items] else: return [ingress.metadata.name for ingress in ingresses.items] except ApiException as e: self.logger.error(f"Error listing ingresses: {e}") raise Exception(f"Error listing ingresses: {e}") def __get_jobs(self, api_client, namespace=None, return_full=False, **kwargs): """ List jobs in a namespace or across all namespaces. Args: return_full (bool): If True, return full job objects as dicts. If False (default), return only the names. """ batch_v1 = client.BatchV1Api(api_client) try: if namespace: self.logger.info(f"Listing jobs in namespace {namespace}") jobs = batch_v1.list_namespaced_job(namespace=namespace) else: self.logger.info("Listing jobs across all namespaces") jobs = batch_v1.list_job_for_all_namespaces() if return_full: return [job.to_dict() for job in jobs.items] else: return [job.metadata.name for job in jobs.items] except ApiException as e: self.logger.error(f"Error listing jobs: {e}") raise Exception(f"Error listing jobs: {e}") def __rollout_restart(self, kind, name, namespace, labels=None, **kwargs): """ Perform a rollout restart on a deployment, statefulset, or daemonset. 
""" api_client = self.__create_k8s_client() self.logger.info( f"Performing rollout restart for {kind} {name} in namespace {namespace}" ) now = datetime.datetime.now(datetime.timezone.utc) now = str(now.isoformat("T") + "Z") body = { "spec": { "template": { "metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": now} } } } } apps_v1 = client.AppsV1Api(api_client) try: if kind.lower() == "deployment": if labels: deployment_list = apps_v1.list_namespaced_deployment( namespace=namespace, label_selector=labels ) if not deployment_list.items: raise ValueError( f"Deployment with labels {labels} not found in namespace {namespace}" ) apps_v1.patch_namespaced_deployment( name=name, namespace=namespace, body=body ) elif kind.lower() == "statefulset": if labels: statefulset_list = apps_v1.list_namespaced_stateful_set( namespace=namespace, label_selector=labels ) if not statefulset_list.items: raise ValueError( f"StatefulSet with labels {labels} not found in namespace {namespace}" ) apps_v1.patch_namespaced_stateful_set( name=name, namespace=namespace, body=body ) elif kind.lower() == "daemonset": if labels: daemonset_list = apps_v1.list_namespaced_daemon_set( namespace=namespace, label_selector=labels ) if not daemonset_list.items: raise ValueError( f"DaemonSet with labels {labels} not found in namespace {namespace}" ) apps_v1.patch_namespaced_daemon_set( name=name, namespace=namespace, body=body ) else: raise ValueError(f"Unsupported kind {kind} to perform rollout restart") except ApiException as e: self.logger.error( f"Error performing rollout restart for {kind} {name}: {e}" ) raise Exception(f"Error performing rollout restart for {kind} {name}: {e}") self.logger.info(f"Successfully performed rollout restart for {kind} {name}") return { "status": "success", "message": f"Successfully performed rollout restart for {kind} {name}", } def __restart_pod( self, namespace, pod_name, container_name=None, message=None, **kwargs ): """ Restart a pod by deleting it (it will 
def __restart_pod(
    self, namespace, pod_name, container_name=None, message=None, **kwargs
):
    """
    Restart a pod by deleting it (it will be recreated by its controller).
    This is useful for pods that are in a CrashLoopBackOff state.

    Args:
        namespace (str): Namespace of the pod.
        pod_name (str): Name of the pod to delete.
        container_name (str | None): Accepted for interface compatibility;
            not used by this method.
        message (str | None): Custom message to log and return instead of the
            default "was restarted" text.

    Returns:
        dict: Status, message and the pod's name, namespace, phase and
        container names as read just before the deletion.

    Raises:
        Exception: Wraps any ``ApiException`` (including a missing pod).
    """
    api_client = self.__create_k8s_client()
    core_v1 = client.CoreV1Api(api_client)

    self.logger.info(f"Restarting pod {pod_name} in namespace {namespace}")

    try:
        # Reading first both verifies the pod exists and captures the details
        # reported back to the caller.
        pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace)

        # A controller-managed pod is recreated after deletion; a standalone
        # pod is simply removed.
        core_v1.delete_namespaced_pod(
            name=pod_name, namespace=namespace, body=client.V1DeleteOptions()
        )

        response_message = message or (
            f"Pod {pod_name} in namespace {namespace} was restarted"
        )
        self.logger.info(response_message)

        return {
            "status": "success",
            "message": response_message,
            "pod_details": {
                "name": pod.metadata.name,
                "namespace": pod.metadata.namespace,
                "status": pod.status.phase,
                "containers": [container.name for container in pod.spec.containers],
            },
        }
    except ApiException as e:
        error_message = f"Error restarting pod {pod_name}: {e}"
        self.logger.error(error_message)
        raise Exception(error_message)


def __cordon_node(self, node_name, **kwargs):
    """
    Mark a node as unschedulable (cordon).

    Only ``spec.unschedulable`` is patched. Sending back the full node object
    (as read from the API) would also carry its ``resourceVersion`` and
    status, which can make the patch fail when the node is updated between
    the read and the patch.

    Args:
        node_name (str): Name of the node to cordon.

    Returns:
        dict: ``{"status": "success", "message": ...}``.

    Raises:
        Exception: Wraps any ``ApiException`` (including a missing node).
    """
    api_client = self.__create_k8s_client()
    core_v1 = client.CoreV1Api(api_client)

    self.logger.info(f"Cordoning node {node_name}")

    try:
        core_v1.patch_node(name=node_name, body={"spec": {"unschedulable": True}})

        self.logger.info(f"Successfully cordoned node {node_name}")
        return {
            "status": "success",
            "message": f"Node {node_name} has been cordoned (marked unschedulable)",
        }
    except ApiException as e:
        error_message = f"Error cordoning node {node_name}: {e}"
        self.logger.error(error_message)
        raise Exception(error_message)
""" api_client = self.__create_k8s_client() core_v1 = client.CoreV1Api(api_client) self.logger.info(f"Uncordoning node {node_name}") try: # Get the node node = core_v1.read_node(name=node_name) # Update the node to be schedulable node.spec.unschedulable = False # Patch the node core_v1.patch_node(name=node_name, body=node) self.logger.info(f"Successfully uncordoned node {node_name}") return { "status": "success", "message": f"Node {node_name} has been uncordoned (marked schedulable)", } except ApiException as e: error_message = f"Error uncordoning node {node_name}: {e}" self.logger.error(error_message) raise Exception(error_message) def __drain_node(self, node_name, force=False, ignore_daemonsets=True, delete_emptydir_data=False, **kwargs): """ Safely evict pods from a node (drain). """ api_client = self.__create_k8s_client() core_v1 = client.CoreV1Api(api_client) self.logger.info(f"Draining node {node_name}") try: # First cordon the node self.__cordon_node(node_name) # Get all pods on the node field_selector = f"spec.nodeName={node_name}" pods = core_v1.list_pod_for_all_namespaces(field_selector=field_selector) evicted_pods = [] failed_pods = [] for pod in pods.items: # Skip pods that are already terminating if pod.metadata.deletion_timestamp: continue # Skip DaemonSet pods if ignore_daemonsets is True if ignore_daemonsets: owner_references = pod.metadata.owner_references or [] is_daemonset_pod = any( ref.kind == "DaemonSet" for ref in owner_references ) if is_daemonset_pod: continue # Skip pods with emptyDir volumes unless explicitly allowed if not delete_emptydir_data: volumes = pod.spec.volumes or [] has_emptydir = any( vol.empty_dir is not None for vol in volumes ) if has_emptydir and not force: failed_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace, "reason": "Has emptyDir volumes (use delete_emptydir_data=True to override)" }) continue try: # Create eviction object eviction = client.V1Eviction( metadata=client.V1ObjectMeta( 
name=pod.metadata.name, namespace=pod.metadata.namespace ) ) # Evict the pod core_v1.create_namespaced_pod_eviction( name=pod.metadata.name, namespace=pod.metadata.namespace, body=eviction ) evicted_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace }) except ApiException as e: if e.status == 429: # Too Many Requests - PodDisruptionBudget if force: # Force delete the pod if force is True try: core_v1.delete_namespaced_pod( name=pod.metadata.name, namespace=pod.metadata.namespace, grace_period_seconds=0 ) evicted_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace, "forced": True }) except ApiException as delete_e: failed_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace, "reason": f"Could not force delete: {delete_e}" }) else: failed_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace, "reason": f"Blocked by PodDisruptionBudget (use force=True to override): {e}" }) else: failed_pods.append({ "name": pod.metadata.name, "namespace": pod.metadata.namespace, "reason": str(e) }) result = { "status": "success" if not failed_pods else "partial_success", "message": f"Node {node_name} drain completed", "evicted_pods": evicted_pods, "failed_pods": failed_pods, "summary": { "total_evicted": len(evicted_pods), "total_failed": len(failed_pods) } } self.logger.info(f"Drain completed for node {node_name}: {len(evicted_pods)} evicted, {len(failed_pods)} failed") return result except ApiException as e: error_message = f"Error draining node {node_name}: {e}" self.logger.error(error_message) raise Exception(error_message) def __scale_deployment(self, namespace, deployment_name, replicas, **kwargs): """ Scale a deployment to the specified number of replicas. 
""" api_client = self.__create_k8s_client() apps_v1 = client.AppsV1Api(api_client) self.logger.info(f"Scaling deployment {deployment_name} in namespace {namespace} to {replicas} replicas") try: # Get current deployment deployment = apps_v1.read_namespaced_deployment( name=deployment_name, namespace=namespace ) current_replicas = deployment.spec.replicas # Update replicas deployment.spec.replicas = replicas # Patch the deployment apps_v1.patch_namespaced_deployment( name=deployment_name, namespace=namespace, body=deployment ) self.logger.info(f"Successfully scaled deployment {deployment_name} from {current_replicas} to {replicas} replicas") return { "status": "success", "message": f"Deployment {deployment_name} scaled from {current_replicas} to {replicas} replicas", "previous_replicas": current_replicas, "new_replicas": replicas, } except ApiException as e: error_message = f"Error scaling deployment {deployment_name}: {e}" self.logger.error(error_message) raise Exception(error_message) def __scale_statefulset(self, namespace, statefulset_name, replicas, **kwargs): """ Scale a statefulset to the specified number of replicas. 
""" api_client = self.__create_k8s_client() apps_v1 = client.AppsV1Api(api_client) self.logger.info(f"Scaling statefulset {statefulset_name} in namespace {namespace} to {replicas} replicas") try: # Get current statefulset statefulset = apps_v1.read_namespaced_stateful_set( name=statefulset_name, namespace=namespace ) current_replicas = statefulset.spec.replicas # Update replicas statefulset.spec.replicas = replicas # Patch the statefulset apps_v1.patch_namespaced_stateful_set( name=statefulset_name, namespace=namespace, body=statefulset ) self.logger.info(f"Successfully scaled statefulset {statefulset_name} from {current_replicas} to {replicas} replicas") return { "status": "success", "message": f"StatefulSet {statefulset_name} scaled from {current_replicas} to {replicas} replicas", "previous_replicas": current_replicas, "new_replicas": replicas, } except ApiException as e: error_message = f"Error scaling statefulset {statefulset_name}: {e}" self.logger.error(error_message) raise Exception(error_message) def __exec_pod_command(self, namespace, pod_name, command, container_name=None, **kwargs): """ Execute a command inside a pod. 
""" api_client = self.__create_k8s_client() core_v1 = client.CoreV1Api(api_client) self.logger.info(f"Executing command in pod {pod_name} in namespace {namespace}: {command}") try: from kubernetes.stream import stream # Prepare the command if isinstance(command, str): # Split command string into list exec_command = ['/bin/sh', '-c', command] else: exec_command = command # Execute the command resp = stream( core_v1.connect_get_namespaced_pod_exec, pod_name, namespace, command=exec_command, container=container_name, stderr=True, stdin=False, stdout=True, tty=False, _preload_content=False ) # Read the output output = "" error = "" while resp.is_open(): resp.update(timeout=1) if resp.peek_stdout(): output += resp.read_stdout() if resp.peek_stderr(): error += resp.read_stderr() resp.close() result = { "status": "success", "command": command, "stdout": output, "stderr": error, "pod_name": pod_name, "namespace": namespace, "container": container_name, } self.logger.info(f"Successfully executed command in pod {pod_name}") return result except ApiException as e: error_message = f"Error executing command in pod {pod_name}: {e}" self.logger.error(error_message) raise Exception(error_message) except Exception as e: error_message = f"Error executing command in pod {pod_name}: {e}" self.logger.error(error_message) raise Exception(error_message) if __name__ == "__main__": # Output debug messages import json import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) # Load environment variables import os url = os.environ.get("KUBERNETES_URL") token = os.environ.get("KUBERNETES_TOKEN") insecure = os.environ.get("KUBERNETES_INSECURE", "false").lower() == "true" namespace = os.environ.get("KUBERNETES_NAMESPACE", "default") pod_name = os.environ.get("KUBERNETES_POD_NAME") deployment_name = os.environ.get("KUBERNETES_DEPLOYMENT_NAME") context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = ProviderConfig( 
authentication={ "api_server": url, "token": token, "insecure": insecure, }, ) kubernetes_provider = KubernetesProvider( context_manager, "kubernetes_keephq", config ) # Example queries if pod_name: print("Getting logs:") try: logs = kubernetes_provider.query( command_type="get_logs", namespace=namespace, pod_name=pod_name ) print(logs[:10]) # Print first 10 lines except Exception as e: print(f"Error: {e}") print("\nGetting events:") try: events = kubernetes_provider.query( command_type="get_events", namespace=namespace, pod_name=pod_name ) print(json.dumps(events[:3], indent=2)) # Print first 3 events except Exception as e: print(f"Error: {e}") print("\nRestarting pod:") restart_result = kubernetes_provider.notify( action="restart_pod", namespace=namespace, pod_name=pod_name, message=f"Manually restarting pod {pod_name}", ) print(json.dumps(restart_result, indent=2)) else: print("Getting pods:") try: pods = kubernetes_provider.query(command_type="get_pods", namespace=namespace) print(f"Found {len(pods)} pods in namespace {namespace}") except Exception as e: print(f"Error: {e}") # Get namespaces print("\nGetting namespaces:") try: namespaces = kubernetes_provider.query(command_type="get_namespaces") print(f"Found {len(namespaces)} namespaces") for ns in namespaces[:3]: # Show first 3 print(f" - {ns['metadata']['name']}") except Exception as e: print(f"Error: {e}") # Get services print("\nGetting services:") try: services = kubernetes_provider.query(command_type="get_services", namespace=namespace) print(f"Found {len(services)} services in namespace {namespace}") for svc in services[:3]: # Show first 3 print(f" - {svc['metadata']['name']} ({svc['spec']['type']})") except Exception as e: print(f"Error: {e}") ================================================ FILE: keep/providers/libre_nms_provider/README.md ================================================ ## Setting up LibreNMS using Docker 1. Go to [LibreNMS Docker GitHub](https://github.com/librenms/docker) 2. 
Clone the repository ```bash git clone https://github.com/librenms/docker.git ``` 3. Go to the cloned repository ```bash cd docker ``` 4. Go to examples/compose ```bash cd examples/compose ``` 5. Start the containers using Docker Compose ```bash docker compose up -d ``` 6. Your LibreNMS instance should be running on [http://localhost:8080](http://localhost:8080) ================================================ FILE: keep/providers/libre_nms_provider/__init__.py ================================================ ================================================ FILE: keep/providers/libre_nms_provider/alerts_mock.py ================================================ ALERTS = { "title": "Device 10.10.1.147 recovered from Devices up/down", "hostname": "10.10.1.147", "device_id": "2", "sysDescr": "Linux node 6.8.0-54-generic #56-Ubuntu SMP PREEMPT_DYNAMIC Sat Feb 8 00:37:57 UTC 2025 x86_64", "sysName": "node", "sysContact": "Me ", "os": "linux", "type": "server", "ip": "10.10.1.147", "display": "10.10.1.147", "version": "6.8.0-54-generic", "hardware": "Generic x86 64-bit", "features": "", "serial": "", "status": "1", "status_reason": "", "location": "Sitting on the Dock of the Bay", "description": "", "notes": "", "uptime": "59", "uptime_short": "59s", "uptime_long": "59 seconds", "elapsed": "3m 7s", "alerted": "1", "alert_id": "26", "alert_notes": "", "proc": "", "rule_id": "13", "id": "38", "faults": "", "uid": "41", "severity": "ok", "rule": "{\"condition\":\"AND\",\"rules\":[{\"id\":\"macros.device_down\",\"field\":\"macros.device_down\",\"type\":\"integer\",\"input\":\"radio\",\"operator\":\"equal\",\"value\":\"1\"}],\"valid\":true}", "name": "Devices up/down", "string": "", "timestamp": "2025-03-04 11:01:41", "contacts": "", "state": "0", "msg": "Device 10.10.1.147 recovered from Devices up/down\nSeverity: ok\nTime elapsed: 3m 7s Timestamp: 2025-03-04 11:01:41\nUnique-ID: 41\nRule: Devices up/down Faults:\n #1: sysObjectID => .1.3.6.1.4.1.8072.3.2.10; sysDescr => Linux
@pydantic.dataclasses.dataclass
class LibreNmsProviderAuthConfig:
    """
    LibreNmsProviderAuthConfig is a class that allows you to authenticate in LibreNMS.
    """

    # Base URL of the LibreNMS instance; "/api/v0/<endpoint>" is appended to it.
    host_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "LibreNMS Host URL",
            "hint": "e.g. https://librenms.example.com",
            "sensitive": False,
            "validation": "any_http_url",
        }
    )

    # Sent to LibreNMS in the "X-Auth-Token" request header.
    api_key: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "LibreNMS API Key",
            "sensitive": True,
        }
    )


class LibreNmsProvider(BaseProvider):
    """
    Get alerts from LibreNMS into Keep.

    Alerts are either pulled from the LibreNMS ``alerts`` API endpoint
    (``_get_alerts``) or pushed by a LibreNMS API transport and converted by
    ``_format_alert``.
    """

    webhook_documentation_here_differs_from_general_documentation = True
    webhook_description = ""
    webhook_template = ""
    webhook_markdown = """
To send alerts from LibreNMS to Keep, Use the following webhook url to configure LibreNMS send alerts to Keep:

1. In LibreNMS Dashboard, go to Alerts > Alert Transports
2. Create transport with type API and POST method
3. Give a Transport Name and select Transport Type as API
4. Select the API Method as POST
5. Enter first part (without the options) of the Keep webhook URL as API URL: {keep_webhook_api_url} (until the "?")
6. Remove the questionmark and put the remaining string starting with "provider_id=" under Options
7. Add header "X-API-KEY" with your Keep API key (webhook role)
8. For JSON body format, refer to [Keep documentation](https://docs.keephq.dev/providers/documentation/libre_nms-provider)
9. Save the transport
"""

    PROVIDER_DISPLAY_NAME = "LibreNMS"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_CATEGORY = ["Monitoring"]

    PROVIDER_SCOPES = [
        ProviderScope(
            name="read_alerts",
            description="Read alerts from LibreNMS",
        ),
    ]

    # Keys are the string form of the LibreNMS alert "state" value.
    STATUS_MAP = {
        "0": AlertStatus.RESOLVED,
        "1": AlertStatus.FIRING,
        "2": AlertStatus.ACKNOWLEDGED,
    }

    SEVERITY_MAP = {
        "ok": AlertSeverity.INFO,
        "warning": AlertSeverity.WARNING,
        "critical": AlertSeverity.CRITICAL,
    }

    # Seconds to wait for LibreNMS before giving up; without a timeout a
    # stalled server would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for LibreNMS provider.
        """
        self.authentication_config = LibreNmsProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Validate scopes for the provider

        Returns:
            dict: ``{"read_alerts": True}`` when the alerts endpoint answers
            successfully, otherwise ``{"read_alerts": "<error text>"}``.
        """
        self.logger.info("Validating LibreNMS provider")
        try:
            response = requests.get(
                url=self._get_url("alerts"),
                headers=self._get_auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            self.logger.info(
                "Successfully validated scopes", extra={"response": response.json()}
            )
            return {"read_alerts": True}
        except Exception as e:
            self.logger.exception("Failed to validate scopes", extra={"error": e})
            return {"read_alerts": str(e)}

    def _get_url(self, endpoint: str):
        """Build the ``/api/v0/<endpoint>`` URL for the configured host."""
        # A trailing slash on the configured host would otherwise produce
        # "...//api/v0/...".
        base_url = str(self.authentication_config.host_url).rstrip("/")
        return f"{base_url}/api/v0/{endpoint}"

    def _get_auth_headers(self):
        """Return the authentication header expected by the LibreNMS API."""
        return {"X-Auth-Token": self.authentication_config.api_key}

    def _get_alerts(self) -> list[AlertDto]:
        """
        Get alerts from LibreNMS.

        Returns:
            list[AlertDto]: One alert per entry of the response's ``alerts`` list.

        Raises:
            Exception: If the request or the conversion fails.
        """
        self.logger.info("Getting alerts from LibreNMS")

        try:
            response = requests.get(
                url=self._get_url("alerts"),
                headers=self._get_auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            alerts = response.json()["alerts"]

            return [
                AlertDto(
                    id=alert.get("id"),
                    name=alert.get("rule_name", "Could not fetch rule name"),
                    hostname=alert.get("hostname", "Could not fetch hostname"),
                    device_id=alert.get("device_id", "Could not fetch device id"),
                    rule_id=alert.get("rule_id", "Could not fetch rule id"),
                    # str() so that an integer state (1) matches the string
                    # keys of STATUS_MAP instead of silently becoming FIRING.
                    status=LibreNmsProvider.STATUS_MAP.get(
                        str(alert.get("state")), AlertStatus.FIRING
                    ),
                    alerted=alert.get("alerted", "Could not fetch alerted"),
                    open=alert.get("open", "Could not fetch open"),
                    note=alert.get("note", "Could not fetch note"),
                    timestamp=alert.get("timestamp", "Could not fetch timestamp"),
                    lastReceived=alert.get(
                        "timestamp", "Could not fetch last received"
                    ),
                    info=alert.get("info", "Could not fetch info"),
                    severity=LibreNmsProvider.SEVERITY_MAP.get(
                        alert.get("severity"), AlertSeverity.INFO
                    ),
                    source=["libre_nms"],
                )
                for alert in alerts
            ]

        except Exception as e:
            self.logger.exception("Failed to get alerts from LibreNMS")
            raise Exception(f"Failed to get alerts from LibreNMS: {str(e)}")

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | list[AlertDto]:
        """
        Convert a LibreNMS webhook payload into an ``AlertDto``.

        Args:
            event (dict): JSON body posted by the LibreNMS API transport.
            provider_instance: Unused.

        Returns:
            AlertDto: The alert built from the payload. Missing keys fall back
            to a "Could not fetch ..." placeholder.
        """
        # An empty description is replaced by the title.
        if event.get("description") == "":
            description = event.get("title", "Could not fetch description")
        else:
            description = event.get("description", "Could not fetch description")

        # The payload key is "uptime_short"; the misspelled "uptime_sort" was
        # read before and therefore never found. The old key is still accepted
        # as a fallback, and the value is exposed under both attribute names
        # so existing consumers of "uptime_sort" keep working.
        uptime_short = event.get(
            "uptime_short",
            event.get("uptime_sort", "Could not fetch uptime_short"),
        )

        alert = AlertDto(
            id=event.get("id"),
            name=event.get("name", "Could not fetch rule name"),
            status=LibreNmsProvider.STATUS_MAP.get(
                str(event.get("state")), AlertStatus.FIRING
            ),
            severity=LibreNmsProvider.SEVERITY_MAP.get(
                event.get("severity"), AlertSeverity.INFO
            ),
            timestamp=event.get("timestamp"),
            lastReceived=event.get("timestamp"),
            title=event.get("title", "Could not fetch title"),
            hostname=event.get("hostname", "Could not fetch hostname"),
            device_id=event.get("device_id", "Could not fetch device id"),
            sysDescr=event.get("sysDescr", "Could not fetch sysDescr"),
            sysName=event.get("sysName", "Could not fetch sysName"),
            sysContact=event.get("sysContact", "Could not fetch sysContact"),
            host_os=event.get("os", "Could not fetch host_os"),
            host_type=event.get("type", "Could not fetch host_type"),
            ip=event.get("ip", "Could not fetch ip"),
            display=event.get("display", "Could not fetch display"),
            version=event.get("version", "Could not fetch version"),
            hardware=event.get("hardware", "Could not fetch hardware"),
            features=event.get("features", "Could not fetch features"),
            serial=event.get("serial", "Could not fetch serial"),
            status_reason=event.get("status_reason", "Could not fetch status_reason"),
            location=event.get("location", "Could not fetch location"),
            description=description,
            notes=event.get("notes", "Could not fetch notes"),
            uptime=event.get("uptime", "Could not fetch uptime"),
            uptime_short=uptime_short,
            uptime_sort=uptime_short,
            uptime_long=event.get("uptime_long", "Could not fetch uptime_long"),
            elapsed=event.get("elapsed", "Could not fetch elapsed"),
            alerted=event.get("alerted", "Could not fetch alerted"),
            alert_id=event.get("alert_id", "Could not fetch alert_id"),
            alert_notes=event.get("alert_notes", "Could not fetch alert_notes"),
            proc=event.get("proc", "Could not fetch proc"),
            rule_id=event.get("rule_id", "Could not fetch rule_id"),
            faults=event.get("faults", "Could not fetch faults"),
            uid=event.get("uid", "Could not fetch uid"),
            rule=event.get("rule", "Could not fetch rule"),
            builder=event.get("builder", "Could not fetch builder"),
            source=["libre_nms"],
        )

        return alert


if __name__ == "__main__":
    # Manual smoke test: pulls alerts using LIBRENMS_API_KEY from the environment.
    import logging
    import os

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )

    librenms_api_key = os.getenv("LIBRENMS_API_KEY")

    config = ProviderConfig(
        description="LibreNMS Provider",
        authentication={
            "host_url": "https://librenms.example.com",
            "api_key": librenms_api_key,
        },
    )

    provider = LibreNmsProvider(context_manager, "libre_nms", config)

    alerts = provider.get_alerts()
    print(alerts)
class LinearProvider(BaseProvider):
    """Create Linear issues and list a team's projects via the Linear GraphQL API."""

    PROVIDER_DISPLAY_NAME = "Linear"
    LINEAR_GRAPHQL_URL = "https://api.linear.app/graphql"
    PROVIDER_CATEGORY = ["Ticketing"]
    # Upper bound (seconds) for one GraphQL call, so a stalled connection
    # cannot block the calling workflow indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a LinearProviderAuthConfig."""
        self.authentication_config = LinearProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    @property
    def __headers(self):
        """
        HTTP headers sent with every Linear GraphQL request.

        The request helpers below read `self.__headers`; without this property
        each call fails with AttributeError before any request is made.
        """
        return {
            "Content-Type": "application/json",
            # NOTE(review): the configured token is sent verbatim. Linear
            # personal API keys are used without a scheme prefix; an OAuth
            # token would need "Bearer " included in the configured value.
            # Confirm against the Linear API docs.
            "Authorization": self.authentication_config.api_token,
        }

    def __execute(self, query: str, variables: dict) -> dict:
        """
        POST a GraphQL document and return the `data` part of the response.

        Values are passed as GraphQL variables instead of being formatted into
        the document, so quotes, backslashes or newlines in user-provided text
        (alert titles, descriptions, team names) cannot break or alter the query.

        Raises:
            requests.HTTPError: on a non-2xx HTTP status.
            ProviderException: when the response carries no `data`
                (the response body then holds the error details).
        """
        response = requests.post(
            url=self.LINEAR_GRAPHQL_URL,
            json={"query": query, "variables": variables},
            headers=self.__headers,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        body = response.json()
        data = body.get("data")
        if data is None:
            # if data is None the response body has the error details
            raise ProviderException(body)
        return data

    def __query_linear_projects(self, team_name=""):
        """
        Fetch the projects of the team named `team_name`.

        Returns:
            dict: {"projects": [{"id": ..., "name": ...}, ...]}; the list is
            empty when no team with that name is returned.

        Raises:
            ProviderException: on any request or response failure.
        """
        try:
            self.logger.info(f"Fetching projects for linear team:{team_name}...")

            query = """
                query TeamProjects($teamName: String!) {
                    teams(filter: {name: {eq: $teamName}}) {
                        nodes {
                            id
                            name
                            projects {
                                nodes {
                                    id
                                    name
                                }
                            }
                        }
                    }
                }
            """
            data = self.__execute(query, {"teamName": team_name})

            team_nodes = data.get("teams", {}).get("nodes", [])
            # note: "team_name" are unique, so it's ok to select the first team node
            team_node = team_nodes[0] if len(team_nodes) > 0 else {}
            projects = team_node.get("projects", {}).get("nodes", [])

            self.logger.info(f"Fetched projects for linear team:{team_name}!")
            return {"projects": projects}
        except Exception as e:
            raise ProviderException(f"Failed to fetch linear projects: {e}")

    def __query_linear_data(self, team_name="", project_name=""):
        """
        Resolve a team name and project name to their Linear ids.

        Returns:
            dict: {"project_id": str, "team_id": str}

        Raises:
            ProviderException: when the team or the project cannot be found,
                or on any request or response failure.
        """
        try:
            self.logger.info(
                f"Fetching linear data for team: {team_name} and project: {project_name}..."
            )

            query = """
                query TeamProject($teamName: String!, $projectName: String!) {
                    teams(filter: {name: {eq: $teamName}}) {
                        nodes {
                            id
                            name
                            projects(filter: {name: {eq: $projectName}}) {
                                nodes {
                                    id
                                    name
                                }
                            }
                        }
                    }
                }
            """
            data = self.__execute(
                query, {"teamName": team_name, "projectName": project_name}
            )

            team_nodes = data.get("teams", {}).get("nodes", [])
            # note: "team_name" are unique, so it's ok to select the first team node
            team_node = team_nodes[0] if len(team_nodes) > 0 else {}
            team_id = team_node.get("id", "")

            project_nodes = team_node.get("projects", {}).get("nodes", [])
            # note: there can be multiple projects with same "project_name", so we select the first
            project_node = project_nodes[0] if len(project_nodes) > 0 else {}
            project_id = project_node.get("id", "")

            if project_id == "" or team_id == "":
                raise ProviderException(
                    f"Linear team:{team_name} or project:{project_name}, doesn't exists"
                )

            self.logger.info(
                f"Fetched linear data for team: {team_name} and project: {project_name}!"
            )
            return {"project_id": project_id, "team_id": team_id}
        except Exception as e:
            self.logger.error(e)
            raise ProviderException(
                f"Failed to fetch linear data for team:{team_name}, project:{project_name} : {e}"
            )

    def __create_issue(
        self,
        team_name="",
        project_name="",
        title="",
        description="",
        priority=0,
        **kwargs: dict,
    ):
        """
        Create an issue inside a linear project for given team.

        Args:
            team_name: Name of the Linear team that owns the issue.
            project_name: Name of the project (within that team) to file under.
            title: Issue title.
            description: Issue description.
            priority: Issue priority; converted with int() because it is sent
                as a GraphQL Int variable.

        Returns:
            dict: {"issue": {"id": ..., "title": ...}}

        Raises:
            ProviderException: on lookup, request or response failure.
        """
        try:
            self.logger.info(f"Creating an issue with title:{title} ...")

            linear_data = self.__query_linear_data(
                team_name=team_name, project_name=project_name
            )

            query = """
                mutation CreateIssue($input: IssueCreateInput!) {
                    issueCreate(input: $input) {
                        success
                        issue {
                            id
                            title
                        }
                    }
                }
            """
            variables = {
                "input": {
                    "title": title,
                    "description": description,
                    # Workflow-rendered values may arrive as strings ("2").
                    "priority": int(priority),
                    "teamId": linear_data["team_id"],
                    "projectId": linear_data["project_id"],
                }
            }
            data = self.__execute(query, variables)

            issue = data.get("issueCreate", {}).get("issue", {})
            self.logger.info(f"Created an issue with title:{title} !")
            return {"issue": issue}
        except Exception as e:
            raise ProviderException(f"Failed to create an issue in linear: {e}")

    def _notify(
        self,
        team_name: str,
        project_name: str,
        title: str,
        description="",
        priority=0,
        **kwargs: dict,
    ):
        """
        Notify linear by creating an issue.

        Returns:
            dict: {"issue": {...}} as returned by the issue creation.

        Raises:
            ProviderException: when the issue could not be created.
        """
        try:
            self.logger.info("Notifying linear...")
            result = self.__create_issue(
                team_name=team_name,
                project_name=project_name,
                title=title,
                description=description,
                priority=priority,
            )
            self.logger.info("Notified linear!")
            return result
        except Exception as e:
            raise ProviderException(f"Failed to notify linear: {e}")

    def _query(self, team_name: str, **kwargs: dict):
        """
        Query linear data for given team.

        Returns:
            dict: {"projects": [...]} for the team named `team_name`.

        Raises:
            ProviderException: when the projects could not be fetched.
        """
        try:
            self.logger.info("Querying from linear...")
            result = self.__query_linear_projects(team_name=team_name)
            self.logger.info("Queried from linear!")
            return result
        except Exception as e:
            raise ProviderException(f"Failed to query linear: {e}")
def _notify(
    self,
    incident_id: str,
    http_url: str = "",
    title: str = "",
    teams="",
    repository_urls="",
    services="",
    started_at="",
    ended_at="",
    git_ref="",
    should_delete="",
    issued_at="",
    **kwargs: dict,
):
    """
    Notify LinearB by creating, updating or deleting an incident.

    Flow:
      1. If `should_delete` is truthy and not the string "false", the incident
         is deleted and the response text returned.
      2. Otherwise the incident is looked up; if found it is PATCHed with the
         merged values, if not it is created with a POST.

    Args:
        incident_id: LinearB incident id (sent as `provider_id` on creation).
        http_url: Incident URL; required when creating.
        title: Incident title; required when creating.
        teams: Team names, as a list or a JSON-encoded list. At least one is
            required when creating; on update they are merged into the
            incident's existing teams.
        repository_urls: Repository URLs, as a list or a JSON-encoded list.
        services: Service names (or dicts with a "name" key), as a list or a
            string encoding of one.
        started_at / ended_at / git_ref: Optional values copied to the
            payload on update when set.
        should_delete: Any non-empty value other than "false" deletes.
        issued_at: Creation timestamp; defaults to the current local time.

    Returns:
        str: The raw response text of the final LinearB API call.

    Raises:
        ProviderException: on any validation, parsing or API failure.
    """

    def _failure_details(response) -> dict:
        # Build the `extra` dict for a failure log line. The error body may
        # not be JSON (e.g. an HTML gateway error); that must not hide the
        # real failure, so fall back to an empty dict.
        try:
            details = response.json()
        except ValueError:
            details = {}
        if not isinstance(details, dict):
            details = {"response_body": details}
        # don't override message: logging refuses an `extra` key named
        # "message" because it clashes with the LogRecord attribute.
        if "message" in details:
            details["message_from_linearb"] = details.pop("message")
        return details

    try:
        self.logger.info("Notifying LinearB...")
        headers = {
            "x-api-key": self.authentication_config.api_token,
        }

        # If should_delete is true (any string that is not false), delete the incident and return.
        if should_delete and should_delete != "false":
            result = requests.delete(
                f"{self.LINEARB_API}/api/v1/incidents/{incident_id}",
                headers=headers,
                timeout=10,
            )
            if result.ok:
                self.logger.info("Deleted incident successfully")
            else:
                self.logger.warning(
                    "Failed to delete incident", extra=_failure_details(result)
                )
                raise Exception(f"Failed to notify linearB {result.text}")
            return result.text

        # Try to get the incident
        incident_response = requests.get(
            f"{self.LINEARB_API}/api/v1/incidents/{incident_id}",
            headers=headers,
            timeout=10,
        )
        if incident_response.ok:
            incident = incident_response.json()
            self.logger.info("Found LinearB Incident", extra={"incident": incident})

            payload = {**incident}

            if "teams" in payload:
                self.logger.info(
                    "Handling teams", extra={"teams": payload["teams"]}
                )
                # The fetched incident carries team objects; the update is
                # sent with plain names.
                team_names = [team["name"] for team in payload["teams"]]
                requested_teams = teams
                if teams and isinstance(teams, str):
                    try:
                        requested_teams = json.loads(teams)
                    except json.JSONDecodeError:
                        self.logger.warning("Failed to parse teams to JSON")
                        requested_teams = []
                # Merge teams whether they arrived as a JSON string or as an
                # already-parsed list.
                if isinstance(requested_teams, (list, tuple)):
                    for team in requested_teams:
                        if team not in team_names:
                            team_names.append(team)
                payload["teams"] = team_names
                self.logger.info("Updated teams", extra={"teams": payload["teams"]})

            if repository_urls:
                self.logger.info(
                    "Handling repository_urls",
                    extra={"repository_urls": repository_urls},
                )
                if isinstance(repository_urls, str):
                    try:
                        repository_urls = json.loads(repository_urls)
                    except json.JSONDecodeError:
                        self.logger.warning(
                            "Failed to parse repository_urls to JSON"
                        )
                payload["repository_urls"] = repository_urls
                self.logger.info(
                    "Updated repository_urls",
                    extra={"repository_urls": payload["repository_urls"]},
                )
            else:
                # Might received repository_urls as a key in the payload
                payload.pop("repository_urls", None)

            if services:
                self.logger.info(
                    "Got services from workflow", extra={"services": services}
                )
                if isinstance(services, str):
                    # NOTE(review): asteval evaluates the string as an
                    # expression while the create path below uses json.loads;
                    # consider aligning the two. The value comes from the
                    # workflow definition.
                    aeval = Interpreter()
                    services = aeval(services)
                    if services is None:
                        # The interpreter gave no value (presumably a parse
                        # or evaluation error); fail with a clear message
                        # instead of a TypeError on len(None).
                        raise ProviderException(
                            "Failed to parse services into a list"
                        )
                if (
                    isinstance(services, (list, tuple))
                    and len(services) > 0
                    and isinstance(services[0], dict)
                ):
                    services = [service["name"] for service in services]
                payload["services"] = services
                self.logger.info(
                    "Updated services", extra={"services": payload["services"]}
                )
            elif "services" in payload:
                # Same normalisation for services already on the incident.
                service_names = [service["name"] for service in payload["services"]]
                payload["services"] = service_names

            if started_at:
                payload["started_at"] = started_at
            if ended_at:
                payload["ended_at"] = ended_at
            if git_ref:
                payload["git_ref"] = git_ref

            result = requests.patch(
                f"{self.LINEARB_API}/api/v1/incidents/{incident_id}",
                json=payload,
                headers=headers,
                timeout=10,
            )
        else:
            if not http_url or not title:
                raise ProviderException(
                    "http_url and title are required for creating an incident"
                )

            if teams and isinstance(teams, str):
                teams = json.loads(teams)

            if not teams:
                raise ProviderException(
                    "At least 1 team is required for creating an incident"
                )

            issued_at = issued_at or datetime.datetime.now().isoformat()

            payload = {
                "provider_id": incident_id,
                "http_url": http_url,
                "title": title,
                "issued_at": issued_at,
                "teams": teams,
            }

            if repository_urls:
                if isinstance(repository_urls, str):
                    repository_urls = json.loads(repository_urls)
                payload["repository_urls"] = repository_urls

            if services:
                if isinstance(services, str):
                    services = json.loads(services)
                payload["services"] = services

            result = requests.post(
                f"{self.LINEARB_API}/api/v1/incidents",
                json=payload,
                headers=headers,
                timeout=10,
            )

        if result.ok:
            self.logger.info(
                "Notified LinearB successfully", extra={"payload": payload}
            )
        else:
            self.logger.warning(
                "Failed to notify linearB",
                extra={**_failure_details(result), "payload": payload},
            )
            raise Exception(f"Failed to notify linearB {result.text}")

        return result.text
    except Exception as e:
        self.logger.exception("Failed to notify LinearB")
        raise ProviderException(f"Failed to notify LinearB: {e}")
class LitellmProvider(BaseProvider):
    """Query a LiteLLM deployment through its `/chat/completions` endpoint."""

    PROVIDER_DISPLAY_NAME = "LiteLLM"
    PROVIDER_CATEGORY = ["AI"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a LitellmProviderAuthConfig."""
        self.authentication_config = LitellmProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """Nothing to release."""
        pass

    def validate_scopes(self) -> dict[str, bool | str]:
        """No scopes are validated; an empty mapping is returned."""
        scopes = {}
        return scopes

    def _prepare_headers(self) -> Dict[str, str]:
        """Build request headers, adding a Bearer token only when an API key is set."""
        headers = {"Content-Type": "application/json"}
        if self.authentication_config.api_key:
            headers["Authorization"] = f"Bearer {self.authentication_config.api_key}"
        return headers

    def _format_messages(self, prompt: str) -> List[Dict[str, str]]:
        """Format the prompt as a chat message."""
        return [{"role": "user", "content": prompt}]

    def _query(
        self,
        prompt: str,
        temperature: float = 0.7,
        model: str = "gpt-3.5-turbo",
        max_tokens: int = 1024,
        structured_output_format: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Send `prompt` to the configured LiteLLM endpoint and return the reply.

        Args:
            prompt: User prompt, sent as a single "user" chat message.
            temperature: Sampling temperature forwarded to the API.
            model: Model name forwarded to the API.
            max_tokens: Maximum number of tokens to generate.
            structured_output_format: Optional JSON schema. When given, a
                system message asking for conforming JSON is prepended and
                the reply is parsed with `json.loads`.

        Returns:
            dict: {"response": <text, or the parsed JSON value when a
            structured format was requested>}

        Raises:
            ProviderException: on a request failure, or when a structured
                reply cannot be parsed as JSON.
        """
        headers = self._prepare_headers()
        formatted_messages = self._format_messages(prompt)

        # Prepare the request payload
        payload = {
            "model": model,
            "messages": formatted_messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }

        # Add structured output format if provided
        if structured_output_format:
            # Append system message with format instructions
            format_instructions = f"You must respond with a JSON object that conforms to the following schema: {json.dumps(structured_output_format)}"
            payload["messages"].insert(
                0, {"role": "system", "content": format_instructions}
            )

        # Tolerate a configured URL with a trailing slash; otherwise the
        # request would go to ".../​/chat/completions".
        base_url = self.authentication_config.api_url.rstrip("/")

        try:
            response = requests.post(
                f"{base_url}/chat/completions",
                headers=headers,
                json=payload,
                timeout=60,
            )
            response.raise_for_status()

            # Parse the response
            result = response.json()

            # Extract the generated text from the response
            try:
                generated_text = result["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError):
                # A missing key, an empty "choices" list, or a null/non-dict
                # level all mean "no text produced".
                generated_text = ""

            # Try to parse as JSON if it's meant to be structured
            if structured_output_format:
                try:
                    generated_text = json.loads(generated_text)
                except (json.JSONDecodeError, TypeError):
                    # TypeError covers a null "content" field.
                    raise ProviderException(
                        f"Failed to parse generated text as JSON: {generated_text}. Model not following the structured output format. Response: {result}"
                    )

            return {
                "response": generated_text,
            }

        except requests.exceptions.RequestException as e:
            raise ProviderException(f"Error querying LiteLLM API: {str(e)}")
def _query(
    self,
    prompt,
    max_tokens=1024,
    timeout=300,
):
    """
    Ask the configured Llama.cpp server to complete `prompt`.

    Args:
        prompt: Text sent as the `prompt` field of the completion request.
        max_tokens: Sent as `n_predict`, the generation limit.
        timeout: Seconds to wait for the server before giving up. The
            default is generous because local generation can be slow; pass
            None to wait indefinitely (the previous behaviour).

    Returns:
        dict: {"response": <the `content` field of the server's JSON reply>}

    Raises:
        ProviderException: on a request failure (including a timeout) or
            when the reply has no `content` field.
    """
    # Build the API URL for completion
    api_url = f"{self.authentication_config.host}/completion"

    # Prepare the request payload
    payload = {
        "prompt": prompt,
        "n_predict": max_tokens,
        "temperature": 0.7,
        "stop": ["\n\n"],  # Common stop sequence
        "stream": False
    }

    try:
        # Make the API request; without a timeout a hung server would block
        # the caller forever.
        response = requests.post(api_url, json=payload, timeout=timeout)
        response.raise_for_status()

        body = response.json()
    except requests.exceptions.RequestException as e:
        raise ProviderException(f"Error calling Llama.cpp API: {str(e)}")

    try:
        content = body["content"]
    except (KeyError, TypeError):
        # The server answered with JSON that is not a completion object.
        raise ProviderException(
            f"Error calling Llama.cpp API: unexpected response: {body}"
        )

    return {
        "response": content,
    }
class MailgunProvider(BaseProvider):
    """Receive alerts by e-mail through a Mailgun route that forwards to Keep."""

    MAILGUN_API_KEY = os.environ.get("MAILGUN_API_KEY")
    MAILGUN_DOMAIN = os.environ.get("MAILGUN_DOMAIN", "mails.keephq.dev")
    WEBHOOK_INSTALLATION_REQUIRED = True
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ) -> None:
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a MailgunProviderAuthConfig."""
        self.authentication_config = MailgunProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """Nothing to release."""
        pass

    @staticmethod
    def parse_event_raw_body(raw_body: bytes | dict) -> dict:
        """
        Parse the raw body of a Mailgun webhook event and create an ingestable dict.

        A non-bytes body is returned unchanged. For bytes, the multipart
        `body-plain` part is located and its "Key: value" lines are mapped to
        the fields `_format_alert` reads.

        Args:
            raw_body (bytes | dict): The raw body from the webhook

        Returns:
            dict: Parsed event data in a format compatible with _format_alert.
            When no `body-plain` part exists, or parsing fails, a generic
            placeholder event is returned instead of raising.
        """
        if not isinstance(raw_body, bytes):
            return raw_body

        logger.info("Parsing Mail Body")
        try:
            # Use latin1 as it can handle any byte sequence
            content = raw_body.decode("latin1", errors="replace")
            parsed_data = {}

            # Try to find body-plain content
            if 'Content-Disposition: form-data; name="body-plain"' in content:
                logger.info("Mail Body Found")
                # Extract body-plain content
                parts = content.split(
                    'Content-Disposition: form-data; name="body-plain"'
                )
                if len(parts) > 1:
                    # Part payload: after the blank line that ends the part
                    # headers, up to the next multipart boundary.
                    body_content = parts[1].split("\r\n\r\n", 1)[1].split("\r\n--")[0]

                    # Convert the alert format to Mailgun expected format
                    parsed_data = {
                        "subject": "",  # Will be populated below
                        "from": "",  # Will be populated from Source
                        "stripped-text": "",  # Will be populated from message content
                        "timestamp": "",  # Will be populated from Opened
                    }

                    # One shared line list for both passes below, so field
                    # parsing and message building agree on line boundaries
                    # (CRLF as well as bare LF).
                    body_lines = body_content.strip().splitlines()

                    # Parse the content line by line
                    for line in body_lines:
                        if ":" in line:
                            key, value = line.split(":", 1)
                            key = key.strip()
                            value = value.strip()

                            # Map the fields to what _format_alert expects
                            if key == "Summary":
                                parsed_data["subject"] = value
                            elif key == "Source":
                                parsed_data["from"] = value
                            elif key == "Opened":
                                # Convert the date format to timestamp
                                try:
                                    dt = datetime.datetime.strptime(
                                        value, "%d %b %Y %H:%M UTC"
                                    )
                                    parsed_data["timestamp"] = str(dt.timestamp())
                                except ValueError:
                                    parsed_data["timestamp"] = str(
                                        datetime.datetime.now().timestamp()
                                    )
                            else:
                                # Every other field (including "Alert Status"
                                # and "Severity") is stored under its
                                # lower-cased name.
                                parsed_data[key.lower()] = value

                    # Combine relevant fields for the message, grouped in the
                    # key order below.
                    message_parts = []
                    for key in [
                        "Summary",
                        "Alert Category",
                        "Service Test",
                        "Severity",
                        "Alert Status",
                    ]:
                        prefix = key + ":"
                        message_parts.extend(
                            line for line in body_lines if line.startswith(prefix)
                        )

                    parsed_data["stripped-text"] = "\n".join(message_parts)

                    # Store the full original content
                    parsed_data["raw_content"] = body_content

                    logger.info("Mail Body Parsed", extra={"parsed_data": parsed_data})
                    return parsed_data

            logger.info("Mail Body Not Found")
            return {
                "subject": "Unknown Alert",
                "from": "system@keep",
                "stripped-text": content,
            }

        except Exception as e:
            logger.exception(f"Error parsing webhook body: {e}")
            return {
                "subject": "Error Processing Alert",
                "from": "system",
                "stripped-text": "Error processing the alert content",
                "timestamp": str(datetime.datetime.now().timestamp()),
            }

    def setup_webhook(
        self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
    ) -> dict[str, str]:
        """
        Create, or update when a `route_id` is already stored, the Mailgun
        route that forwards mail for this provider to Keep.

        Args:
            tenant_id: Used with the provider id to build the recipient address.
            keep_api_url: Keep endpoint the route forwards to.
            api_key: Keep API key appended to the forward URL.
            setup_alerts: Not used by this provider.

        Returns:
            dict: {"route_id": <Mailgun route id>, "email": <recipient address>}

        Raises:
            Exception: when MAILGUN_API_KEY is not set.
            ValueError: when the configured sender contains a route filter function.
            requests.HTTPError: when Mailgun rejects the request.
        """
        if not MailgunProvider.MAILGUN_API_KEY:
            raise Exception("MAILGUN_API_KEY is not set")

        # Use custom domain from config, env var, or default
        email_domain = (
            self.authentication_config.email_domain or MailgunProvider.MAILGUN_DOMAIN
        )
        email = f"{tenant_id}-{self.provider_id}@{email_domain}"
        expression = f'match_recipient("{email}")'

        if (
            "match_header" in self.authentication_config.sender
            or "match_recipient" in self.authentication_config.sender
        ):
            # validate that somebody doesn't try to use match_header or match_recipient
            raise ValueError("Invalid sender value")

        if self.authentication_config.sender:
            sender = self.authentication_config.sender
            # Turn the sender into a pattern of the form ".*<sender>>".
            if not sender.startswith(".*"):
                sender = f".*{sender}"
            if not sender.endswith(">"):
                sender = f"{sender}>"
            expression = f'({expression} and match_header("from", "{sender}"))'

        url = "https://api.mailgun.net/v3/routes"
        payload = {
            "priority": 0,
            "expression": expression,
            "description": f"Keep {self.provider_id} alerting",
            "action": [
                f"forward('{keep_api_url}&api_key={api_key}')",
                "stop()",
            ],
        }

        route_id = self.config.authentication.get("route_id")
        # NOTE(review): the same dict is passed as both `files` and `data`;
        # kept as-is, confirm which one Mailgun actually needs.
        if route_id:
            response = requests.put(
                f"{url}/{route_id}",
                files=payload,
                auth=("api", MailgunProvider.MAILGUN_API_KEY),
                data=payload,
            )
        else:
            response = requests.post(
                url,
                files=payload,
                auth=("api", MailgunProvider.MAILGUN_API_KEY),
                data=payload,
            )
        response.raise_for_status()
        response_json = response.json()
        route_id = route_id or response_json.get("route", {}).get("id")
        return {"route_id": route_id, "email": email}

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "MailgunProvider" = None
    ) -> AlertDto | None:
        """
        Convert a Mailgun event (form data or a parsed dict) into an AlertDto.

        Args:
            event: Event mapping; must contain "from". "subject",
                "stripped-text"/"Body-plain", "timestamp" and "raw_content"
                are read when present.
            provider_instance: Optional provider whose extraction rules are
                applied to the event.

        Returns:
            AlertDto | None: The alert, or None when the raw content
            mentions DMARC (such mails are skipped).

        Raises:
            Exception: when no name or no message can be derived.
        """
        # We receive FormData here, convert it to simple dict.
        event = dict(event)
        logger.info(
            "Received alert from mail",
            extra={
                "from": event["from"],
                "subject": event.get("subject")
            },
        )

        source = event["from"]
        name = event.get("subject", source)
        body_plain = event.get("Body-plain")
        message = event.get("stripped-text", body_plain)
        raw_content = event.get("raw_content")

        # DMARC reports are not alerts; raw content may be bytes or str.
        is_dmarc = (
            isinstance(raw_content, bytes) and b"dmarc" in raw_content.lower()
        ) or (isinstance(raw_content, str) and "dmarc" in raw_content.lower())
        if is_dmarc:
            logger.warning("DMARC alert detected, skipping")
            return None

        if not name or not message:
            raise Exception(
                "Could not create alert from email when name or message is missing."
            )

        try:
            timestamp = datetime.datetime.fromtimestamp(
                float(event["timestamp"])
            ).isoformat()
        except Exception:
            # Missing or unparsable timestamp: fall back to "now".
            timestamp = datetime.datetime.now().isoformat()

        # default values
        severity = "info"
        status = "firing"

        # clean redundant
        event.pop("signature", "")
        event.pop("token", "")

        logger.info("Basic formatting done")

        alert = AlertDto(
            name=name,
            source=[source],
            message=message,
            description=message,
            lastReceived=timestamp,
            severity=severity,
            status=status,
            raw_email={**event},
        )

        # now I want to add all attributes from raw_email to the alert dto, except the ones that are already set
        for key, value in event.items():
            # avoid "-" in keys cuz CEL will failed [stripped-text screw CEL]
            if not hasattr(alert, key) and "-" not in key:
                setattr(alert, key, value)

        logger.info(
            "Alert formatted",
        )

        if provider_instance:
            logger.info(
                "Provider instance found",
            )
            extraction_rules = provider_instance.authentication_config.extraction
            if extraction_rules:
                logger.info(
                    "Extraction rules found",
                )
                for rule in extraction_rules:
                    key = rule.get("key")
                    regex = rule.get("value")
                    if key in event:
                        try:
                            match = re.search(regex, event[key])
                            if match:
                                # Each named group becomes an alert attribute.
                                for (
                                    group_name,
                                    group_value,
                                ) in match.groupdict().items():
                                    setattr(alert, group_name, group_value)
                        except Exception as e:
                            logger.exception(
                                f"Error extracting key {key} with regex {regex}: {e}",
                                extra={
                                    "provider_id": provider_instance.provider_id,
                                    "tenant_id": provider_instance.context_manager.tenant_id,
                                },
                            )
                logger.info(
                    "Alert extracted",
                )
        return alert
class MattermostProvider(BaseProvider):
    """send alert message to Mattermost."""

    PROVIDER_DISPLAY_NAME = "Mattermost"
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a MattermostProviderAuthConfig."""
        self.authentication_config = MattermostProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(self, message="", attachments=None, channel="", **kwargs: dict):
        """
        Notify alert message to Mattermost using the Mattermost Incoming Webhook API
        https://docs.mattermost.com/developer/webhooks-incoming.html

        Args:
            message (str): The content of the message. When empty, the
                "text" of the first attachment is used instead.
            attachments (list | str): The attachments of the message, as a
                list or a JSON/JSON5-encoded list.
            channel (str): The channel to send the message
            **kwargs: Extra fields merged into the webhook payload.

        Raises:
            ProviderException: when neither a message nor an attachment text
                is available, or when Mattermost rejects the request.
        """
        self.logger.info("Notifying alert message to Mattermost")

        # Decode string attachments first, so the message fallback below
        # reads a real attachment and not the first character of a string.
        if isinstance(attachments, str) and attachments:
            try:
                attachments = json5.loads(attachments)
            except Exception:
                # Best effort: forward the value unchanged if it is not JSON.
                pass

        if not message:
            first = attachments[0] if isinstance(attachments, list) and attachments else None
            if isinstance(first, dict):
                message = first.get("text")
            if not message:
                raise ProviderException(
                    f"{self.__class__.__name__} requires a message or an attachment with text"
                )

        webhook_url = self.authentication_config.webhook_url
        payload = {"text": message, **kwargs}
        if channel:
            payload["channel"] = channel
        if attachments:
            payload["attachments"] = attachments

        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept for compatibility (presumably for self-signed Mattermost
        # servers); consider making it configurable.
        response = requests.post(webhook_url, json=payload, verify=False, timeout=30)

        if not response.ok:
            raise ProviderException(
                f"{self.__class__.__name__} failed to notify alert message to Mattermost: {response.text}"
            )

        self.logger.info(
            "Alert message notified to Mattermost", extra={"response": response.text}
        )
@pydantic.dataclasses.dataclass
class PlannerProviderAuthConfig:
    """Planner authentication configuration."""

    PLANNER_DEFAULT_SCOPE = "https://graph.microsoft.com/.default"

    tenant_id: str | None = dataclasses.field(
        metadata={
            "required": True,
            "description": "Planner Tenant ID",
            "sensitive": True,
        },
    )
    client_id: str | None = dataclasses.field(
        metadata={
            "required": True,
            "description": "Planner Client ID",
            "sensitive": True,
        },
    )
    client_secret: str | None = dataclasses.field(
        metadata={
            "required": True,
            "description": "Planner Client Secret",
            "sensitive": True,
        },
    )
    # default_factory must be a callable; a bare list raises
    # "'list' object is not callable" as soon as the default is needed.
    # The class name is resolved when the factory is called, because a lambda
    # cannot see class-body names.
    scopes: list = dataclasses.field(
        default_factory=lambda: [PlannerProviderAuthConfig.PLANNER_DEFAULT_SCOPE]
    )


class PlannerProvider(BaseProvider):
    """Microsoft Planner provider class."""

    MS_GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0"
    # urljoin replaces the last path segment of a base that has no trailing
    # slash, which would drop "v1.0"; add the slash so the version is kept.
    MS_PLANS_URL = urljoin(base=MS_GRAPH_BASE_URL + "/", url="planner/plans")
    MS_TASKS_URL = urljoin(base=MS_GRAPH_BASE_URL + "/", url="planner/tasks")
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        """
        Build the provider and fetch a Microsoft Graph access token.

        Note that the token is requested here, so construction performs a
        network call.
        """
        super().__init__(context_manager, provider_id, config)
        self.authentication_config = PlannerProviderAuthConfig(
            **self.config.authentication
        )
        self.access_token = self.__generate_access_token()
        self.headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
        }

    def __generate_access_token(self):
        """Request an access token for the configured scopes using the client secret."""
        credential = ClientSecretCredential(
            self.authentication_config.tenant_id,
            self.authentication_config.client_id,
            self.authentication_config.client_secret,
        )
        # get_token takes the scopes as positional arguments; passing them
        # as a `scopes=` keyword supplies no scope at all.
        access_token = credential.get_token(*self.authentication_config.scopes).token
        return access_token

    def dispose(self):
        """Nothing to release."""
        pass

    def validate_config(self):
        """Parse the raw authentication dict into a PlannerProviderAuthConfig."""
        self.authentication_config = PlannerProviderAuthConfig(
            **self.config.authentication
        )

    def __get_plan_by_id(self, plan_id=""):
        """
        Fetch a plan by id.

        Returns:
            dict: The plan as returned by Microsoft Graph.

        Raises:
            requests.HTTPError: when the plan cannot be fetched.
        """
        MS_PLAN_URL = f"{self.MS_PLANS_URL}/{plan_id}"

        self.logger.info(f"Fetching plan by id: {plan_id}")
        response = requests.get(url=MS_PLAN_URL, headers=self.headers)

        # In case of an error response
        response.raise_for_status()

        response_data = response.json()
        self.logger.info(f"Fetched plan by id: {plan_id}")
        return response_data

    def __create_task(self, plan_id="", title="", bucket_id=None):
        """
        Create a task in the given plan (and bucket, when provided).

        Returns:
            dict: The created task as returned by Microsoft Graph.

        Raises:
            requests.HTTPError: when the task cannot be created.
        """
        request_body = {"planId": plan_id, "title": title, "bucketId": bucket_id}

        self.logger.info(f"Creating a new task with title: {title}")
        response = requests.post(
            url=self.MS_TASKS_URL, headers=self.headers, json=request_body
        )

        # In case of an error response
        response.raise_for_status()

        response_data = response.json()
        self.logger.info(
            f"Created a new task with id: {response_data.get('id')} and title: {response_data.get('title')}"
        )
        return response_data

    def _notify(
        self,
        plan_id="",
        title="",
        bucket_id=None,
        description="",
        due_date=None,
        assigned_to=None,
        **kwargs: dict,
    ):
        """
        Create a Planner task in the plan identified by `plan_id`.

        Args:
            plan_id: Id of the plan; it is fetched first to verify it exists.
            title: Task title.
            bucket_id: Optional bucket to place the task in.
            description, due_date, assigned_to: Accepted but currently not
                sent to Microsoft Graph.

        Returns:
            dict: The created task as returned by Microsoft Graph.

        Raises:
            requests.HTTPError: when the plan lookup or the task creation fails.
        """
        # To verify if the plan with plan_id exists or not
        self.__get_plan_by_id(plan_id=plan_id)

        # Create a new task in the given plan
        created_task = self.__create_task(
            plan_id=plan_id, title=title, bucket_id=bucket_id
        )
        return created_task
class MockProvider(BaseProvider):
    """Provider that performs no real work and simply echoes its input."""

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """There is no configuration to validate."""
        return None

    def dispose(self):
        """
        Nothing is held open by this provider, so there is nothing to release.
        """
        return None

    def _query(self, **kwargs):
        """Return the ``command_output`` keyword argument.

        Args:
            **kwargs: Arbitrary parameters; only ``command_output`` is read.

        Returns:
            The value passed as ``command_output``, or None when it is absent.
        """
        command_output = kwargs.get("command_output")
        return command_output

    def _notify(self, **kwargs):
        """Return every keyword argument exactly as received.

        Args:
            **kwargs: Arbitrary parameters.

        Returns:
            dict: The keyword arguments that were passed in.
        """
        received = kwargs
        return received
@dataclass
class ProviderConfig:
    """
    Provider configuration model.

    Args:
        authentication (Optional[dict]): The authentication settings of the provider.
        name (Optional[str]): The name of the provider.
        description (Optional[str]): The description of the provider.
    """

    authentication: Optional[dict]
    name: Optional[str] = None
    description: Optional[str] = None

    def __post_init__(self):
        """Render authentication values that are written as ``{{ ... }}`` templates.

        String values that start with ``{{`` and end with ``}}`` are rendered
        with chevron, with ``env`` bound to the process environment. All other
        values are left untouched.
        """
        settings = self.authentication
        if settings:
            # Only existing keys are reassigned, so the dict size never
            # changes while it is being iterated.
            for key in settings:
                candidate = settings[key]
                if not isinstance(candidate, str):
                    continue
                if not (candidate.startswith("{{") and candidate.endswith("}}")):
                    continue
                settings[key] = chevron.render(candidate, {"env": os.environ})
""" name: str func_name: str # the name of the function in the provider class scopes: list[str] = [] # required scope names, should match ProviderScope names description: str | None = None category: str | None = None type: Literal["view", "action"] = "view" class ProviderMethodDTO(ProviderMethod): """ Constructred in providers_factory, this includes the paramters the function receives We use this to generate the UI for the provider method This is populated using reflection from the function signature """ func_params: list[ProviderMethodParam] = [] ================================================ FILE: keep/providers/monday_provider/__init__.py ================================================ ================================================ FILE: keep/providers/monday_provider/monday_provider.py ================================================ """ MondayProvider is a class that provides a way to create new pulse on Monday.com. """ import dataclasses import json import pydantic import requests from keep.api.core.config import config from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class MondayProviderAuthConfig: """ MondayProviderAuthConfig is a class that holds the authentication information for the MondayProvider. 
""" api_token: str = dataclasses.field( metadata={ "required": False, "description": "Personal API Token", "sensitive": True, }, default="", ) access_token: str = dataclasses.field( metadata={ "description": "For access token installation flow, use Keep UI", "required": False, "sensitive": True, "hidden": True, }, default="", ) scopes: str = dataclasses.field( metadata={ "description": "Scopes from OAuth logic, comma separated", "required": False, "sensitive": False, "hidden": True, }, default="", ) class MondayProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Monday" PROVIDER_CATEGORY = ["Collaboration", "Organizational Tools"] OAUTH2_URL = config("MONDAY_OAUTH2_URL", default=None) MONDAY_CLIENT_ID = config("MONDAY_CLIENT_ID", default=None) MONDAY_CLIENT_SECRET = config("MONDAY_CLIENT_SECRET", default=None) PROVIDER_SCOPES = [ ProviderScope( name="create_pulse", description="Create a new pulse", ), ] PROVIDER_TAGS = ["ticketing"] url = "https://api.monday.com/v2" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def validate_config(self): self.authentication_config = MondayProviderAuthConfig( **self.config.authentication ) if ( not self.authentication_config.access_token and not self.authentication_config.api_token ): raise ProviderException("API token or access token is required") def validate_scopes(self) -> dict[str, bool | str]: """ Validate scopes for the provider """ try: response = requests.post( self.url, json={"query": "query { me { id } }"}, headers=self._get_auth_headers(), ) if response.status_code != 200: response.raise_for_status() self.logger.info(f"Successfully validated scopes {response.json()}") return {"create_pulse": True} except Exception as e: self.logger.exception("Failed to validate scopes", extra={"error": e}) return {"create_pulse": str(e)} @staticmethod def oauth2_logic(**payload) -> dict: """ Handle the OAuth2 
flow for the Monday provider """ request = requests.post( "https://auth.monday.com/oauth2/token", json={ "client_id": MondayProvider.MONDAY_CLIENT_ID, "client_secret": MondayProvider.MONDAY_CLIENT_SECRET, "code": payload.get("code"), "redirect_uri": payload.get("redirect_uri"), }, ) request.raise_for_status() response = request.json() new_provider_info = { "access_token": response.get("access_token"), "scopes": response.get("scope"), } return new_provider_info def _get_auth_headers(self): """ Get the authentication headers """ if self.authentication_config.access_token: return { "Authorization": f"Bearer {self.authentication_config.access_token}", } else: return { "Authorization": self.authentication_config.api_token, } def _create_new_pulse( self, board_id: int, group_id: str, item_name: str, column_values: dict = None, ): try: self.logger.info("Creating new item") headers = self._get_auth_headers() query = """ mutation ($board_id: ID!, $group_id: String!, $item_name: String!, $column_values: JSON) { create_item(board_id: $board_id, group_id: $group_id, item_name: $item_name, column_values: $column_values) { id } } """ if column_values is None: column_values = {} column_values = json.dumps( {k: v for d in column_values for k, v in d.items()} ) variables = { "board_id": board_id, "group_id": group_id, "item_name": item_name, "column_values": column_values, } response = requests.post( self.url, json={"query": query, "variables": variables}, headers=headers ) self.logger.info("Response received", extra={"resp": response.json()}) self.logger.info(f"Status Code: {response.status_code}") try: if response.status_code != 200: response.raise_for_status() self.logger.info("Item created successfully") return response.json() except Exception: self.logger.exception( "Failed to create item", extra={"resp": response.json()} ) raise ProviderException(f"Failed to create item: {response.json()}") except Exception as e: raise ProviderException(f"Failed to create item: {e}") def 
_notify( self, board_id: int, group_id: str, item_name: str, column_values: dict = None, ): try: self.logger.info("Creating new item") self._create_new_pulse(board_id, group_id, item_name, column_values) self.logger.info("Item created successfully") except Exception as e: raise ProviderException(f"Failed to create item: {e}") if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os api_token = os.environ.get("API_TOKEN") if api_token is None: raise Exception("API_TOKEN is required") config = ProviderConfig( description="Monday Provider", authentication={ "api_token": api_token, }, ) monday_provider = MondayProvider( context_manager=context_manager, provider_id="monday_provider", config=config, ) board_id = 1956384489 group_id = "topics" item_name = "New Item" column_values = [{"text_mkm77x3p": "helo"}, {"text_1_mkm7x2ep": "10"}] monday_provider._notify(board_id, group_id, item_name, column_values) ================================================ FILE: keep/providers/mongodb_provider/__init__.py ================================================ ================================================ FILE: keep/providers/mongodb_provider/mongodb_provider.py ================================================ """ MongodbProvider is a class that provides a way to read data from MySQL. 
""" import dataclasses import json import os import pydantic from pymongo import MongoClient from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_config_exception import ProviderConfigException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import MultiHostUrl @pydantic.dataclasses.dataclass class MongodbProviderAuthConfig: host: MultiHostUrl = dataclasses.field( metadata={ "required": True, "description": "Mongo host_uri", "hint": "mongodb+srv://host:port, mongodb://host1:port1,host2:port2?authSource", "validation": "multihost_url", } ) username: str = dataclasses.field( metadata={"required": False, "description": "MongoDB username"}, default=None ) password: str = dataclasses.field( metadata={ "required": False, "description": "MongoDB password", "sensitive": True, }, default=None, ) database: str = dataclasses.field( metadata={"required": False, "description": "MongoDB database name"}, default=None, ) auth_source: str | None = dataclasses.field( metadata={"required": False, "description": "Mongo authSource database name"}, default=None, ) additional_options: str | None = dataclasses.field( metadata={ "required": False, "description": "Mongo kwargs, these will be passed to MongoClient", }, default=None, ) class MongodbProvider(BaseProvider): """Enrich alerts with data from MongoDB.""" PROVIDER_DISPLAY_NAME = "MongoDB" PROVIDER_CATEGORY = ["Database"] PROVIDER_SCOPES = [ ProviderScope( name="connect_to_server", description="The user can connect to the server", mandatory=True, alias="Connect to the server", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.client = None def validate_scopes(self): """ Validates that the user has the required scopes to use the provider. 
""" try: client = self.__generate_client() client.admin.command( "ping" ) # will raise an exception if the server is not available client.close() scopes = { "connect_to_server": True, } except Exception: self.logger.exception("Error validating scopes") scopes = { "connect_to_server": "Unable to connect to server. Please check the connection details.", } return scopes def __generate_client(self): """ Generates a MongoDB client. Returns: pymongo.MongoClient: MongoDB Client """ # removing all None fields, as mongo will not accept None fields} if self.authentication_config.additional_options: try: self.logger.debug("Casting the additional_options to dict") additional_options = json.loads( self.authentication_config.additional_options ) self.logger.debug("Successfully casted the additional_options to dict") except Exception: self.logger.debug("Failed to cast the additional_options to dict") raise ValueError("additional_options must be a valid dict") else: additional_options = {} client_conf = { k: v for k, v in self.authentication_config.__dict__.items() if v and not k.startswith("__pydantic") # removing pydantic default key and k != "additional_options" # additional_options will go seperately and k != "database" } # database is not a valid mongo option client = MongoClient( **client_conf, **additional_options, serverSelectionTimeoutMS=10000 ) # 10 seconds timeout return client def dispose(self): try: self.client.close() except Exception: self.logger.exception("Error closing MongoDB connection") def validate_config(self): """ Validates required configuration for MongoDB's provider. 
""" host = self.config.authentication["host"] if host is None: raise ProviderConfigException("Please provide a value for `host`") if not host.strip(): raise ProviderConfigException("Host cannot be empty") if not (host.startswith("mongodb://") or host.startswith("mongodb+srv://")): host = f"mongodb://{host}" self.authentication_config = MongodbProviderAuthConfig( **self.config.authentication ) def _query( self, query: dict, as_dict=False, single_row=False, **kwargs: dict ) -> list | tuple: """ Executes a query against the MongoDB database. Returns: list | tuple: list of results or single result if single_row is True """ if isinstance(query, str): query = json.loads(query) client = self.__generate_client() database = client[self.authentication_config.database] results = list(database.cursor_command(query)) if single_row: return results[0] if results else None return results if __name__ == "__main__": config = ProviderConfig( authentication={ "host": os.environ.get("MONGODB_HOST"), "username": os.environ.get("MONGODB_USER"), "password": os.environ.get("MONGODB_PASSWORD"), "database": os.environ.get("MONGODB_DATABASE"), # "additional_options": '{"retryWrites": false}', } ) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) mongodb_provider = MongodbProvider(context_manager, "mongodb-prod", config) query = {"find": "restaurants", "limit": 5} results = mongodb_provider.query(query=query) print(results) ================================================ FILE: keep/providers/mysql_provider/__init__.py ================================================ ================================================ FILE: keep/providers/mysql_provider/mysql_provider.py ================================================ """ MysqlProvider is a class that provides a way to read data from MySQL. 
""" import dataclasses import os import mysql.connector import pydantic from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import NoSchemeUrl @pydantic.dataclasses.dataclass class MysqlProviderAuthConfig: username: str = dataclasses.field( metadata={"required": True, "description": "MySQL username"} ) password: str = dataclasses.field( metadata={"required": True, "description": "MySQL password", "sensitive": True} ) host: NoSchemeUrl = dataclasses.field( metadata={ "required": True, "description": "MySQL hostname", "validation": "no_scheme_url", } ) database: str | None = dataclasses.field( metadata={"required": False, "description": "MySQL database name"}, default=None ) port: int | None = dataclasses.field( metadata={"required": False, "description": "MySQL port"}, default=3306 ) class MysqlProvider(BaseProvider): """Enrich alerts with data from MySQL.""" PROVIDER_DISPLAY_NAME = "MySQL" PROVIDER_CATEGORY = ["Database"] PROVIDER_SCOPES = [ ProviderScope( name="connect_to_server", description="The user can connect to the server", mandatory=True, alias="Connect to the server", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.client = None def validate_scopes(self): """ Validates that the user has the required scopes to use the provider. """ try: client = self.__generate_client() client.close() scopes = { "connect_to_server": True, } except Exception as e: self.logger.exception("Error validating scopes") scopes = { "connect_to_server": str(e), } return scopes def __generate_client(self): """ Generates a MySQL client. 
Returns: mysql.connector.CMySQLConnection: MySQL Client """ client = mysql.connector.connect( user=self.authentication_config.username, password=self.authentication_config.password, host=self.authentication_config.host, database=self.authentication_config.database, port=self.authentication_config.port or 3306, ) return client def dispose(self): try: self.client.close() except Exception: self.logger.exception("Error closing MySQL connection") def validate_config(self): """ Validates required configuration for MySQL's provider. """ self.authentication_config = MysqlProviderAuthConfig( **self.config.authentication ) def _notify(self, query="", as_dict=False, single_row=False, **kwargs: dict): """ For MySQL there is no difference if we're querying data or we want to make an impact. This will allow using the provider in actions as well as steps. Args: query (str): Query to execute as_dict (bool): If True, returns the results as a list of dictionaries single_row (bool): If True, returns only the first row of the results **kwargs: Arguments will me passed to the query.format(**kwargs) """ return self._query(query, as_dict, single_row, **kwargs) def _query( self, query="", as_dict=False, single_row=False, **kwargs: dict ) -> list | tuple: """ Executes a query against the MySQL database. 
Args: query (str): Query to execute as_dict (bool): If True, returns the results as a list of dictionaries single_row (bool): If True, returns only the first row of the results **kwargs: Arguments will me passed to the query.format(**kwargs) Returns: list | tuple: list of results or single result if single_row is True """ client = self.__generate_client() cursor = client.cursor(dictionary=as_dict) if kwargs: query = query.format(**kwargs) cursor.execute(query) # Commit if this is a write operation (INSERT, UPDATE, DELETE) if query.strip().upper().startswith(("INSERT", "UPDATE", "DELETE")): client.commit() results = cursor.fetchall() cursor.close() if single_row: if results: return results[0] else: self.logger.warning("No results found for query: %s", query) raise ValueError(f"Query {query} returned no rows") return results if __name__ == "__main__": config = ProviderConfig( authentication={ "username": os.environ.get("MYSQL_USER"), "password": os.environ.get("MYSQL_PASSWORD"), "host": os.environ.get("MYSQL_HOST"), "database": os.environ.get("MYSQL_DATABASE"), } ) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) mysql_provider = MysqlProvider(context_manager, "mysql-prod", config) results = mysql_provider.query(query="SELECT MAX(datetime) FROM demo_table LIMIT 1") print(results) ================================================ FILE: keep/providers/netbox_provider/README.md ================================================ ## Setting up the NetBox Community instance using Docker This guide will help you set up a NetBox Community instance using Docker. The guide assumes you have Docker installed on your system. 1. Clone the NetBox community docker repository ```bash git clone -b release https://github.com/netbox-community/netbox-docker.git ``` 2. Change directory to the cloned repository ```bash cd netbox-docker ``` 3. 
Create a `docker-compose.override.yml` file with the following content ```yaml version: '3.4' services: netbox: ports: - 8000:8080 ``` 4. Start the NetBox Community instance ```bash docker compose up ``` 5. To create the first admin user account, run the following command and follow the prompts ```bash docker compose exec netbox /opt/netbox/netbox/manage.py createsuperuser ``` 6. You can now access the NetBox Community instance by visiting [http://localhost:8000](http://localhost:8000) in your browser. ================================================ FILE: keep/providers/netbox_provider/__init__.py ================================================ ================================================ FILE: keep/providers/netbox_provider/alerts_mock.py ================================================ ALERTS = { "event": "created", "timestamp": "2025-02-02T11:10:24.231786+00:00", "model": "site", "username": "admin", "request_id": "7886b12c-593d-46bb-a781-5da0e5be255b", "data": { "id": 4, "url": "/api/dcim/sites/4/", "display_url": "/dcim/sites/4/", "display": "Test", "name": "Test", "slug": "test", "status": { "value": "active", "label": "Active" }, "region": None, "group": None, "tenant": None, "facility": "", "time_zone": None, "description": "", "physical_address": "", "shipping_address": "", "latitude": None, "longitude": None, "comments": "", "asns": [], "tags": [], "custom_fields": {}, "created": "2025-02-02T11:10:24.208770Z", "last_updated": "2025-02-02T11:10:24.208787Z" }, "snapshots": { "prechange": None, "postchange": { "created": "2025-02-02T11:10:24.208Z", "last_updated": "2025-02-02T11:10:24.208Z", "description": "", "comments": "", "name": "Test", "slug": "test", "status": "active", "region": None, "group": None, "tenant": None, "facility": "", "time_zone": None, "physical_address": "", "shipping_address": "", "latitude": None, "longitude": None, "asns": [], "custom_fields": {}, "tags": [] } } } ================================================ FILE:
keep/providers/netbox_provider/netbox_provider.py ================================================ """ NetBox combines IP address management (IPAM) and datacenter infrastructure management (DCIM) with powerful APIs and extensions, serving as the ideal "source of truth" for network automation. Thousands of organizations worldwide rely on NetBox for their infrastructure. """ from keep.api.models.alert import AlertDto from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class NetboxProvider(BaseProvider): """ Get alerts from NetBox into Keep. """ webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from NetBox to Keep, Use the following webhook url to configure NetBox send alerts to Keep: 1. In NetBox, go to Webhooks under Operations. 2. Create a new webhook with URL as {keep_webhook_api_url} and request method as POST. 3. Disable SSL verification. 4. Add 'X-API-KEY' as the request header with the value as {api_key}. 5. Save the webhook. 6. Go to Event Rules and create a new rule and select the webhook created in step 2 to receive alerts. """ PROVIDER_DISPLAY_NAME = "NetBox" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Cloud Infrastructure", "Monitoring"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): """ Validates required configuration for NetBox's provider. 
""" pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: data = event.get("data", {}) snapshots = event.get("snapshots", {}) alert = AlertDto( name=data.get("name", "Could not fetch name"), lastReceived=event.get("timestamp"), startedAt=data.get("created"), model=event.get("model", "Could not fetch model"), username=event.get("username", "Could not fetch username"), id=event.get("request_id"), data=data, description=event.get("event", "Could not fetch event"), snapshots=snapshots, source=["netbox"], ) return alert if __name__ == "__main__": pass ================================================ FILE: keep/providers/netdata_provider/__init__.py ================================================ ================================================ FILE: keep/providers/netdata_provider/netdata_provider.py ================================================ """ Netdata is a cloud-based monitoring tool that provides real-time monitoring of servers, applications, and devices. """ from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class NetdataProvider(BaseProvider): """Get alerts from Netdata into Keep.""" webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from Netdata to Keep, Use the following webhook url to configure Netdata send alerts to Keep: 1. In Netdata, go to Space settings. 2. Go to "Alerts & Notifications". 3. Click on "Add configuration". 4. Add "Webhook" as the notification method. 5. Add a name to the configuration. 6. Select Room(s) to apply the configuration. 7. Select Notification(s) to apply the configuration. 8. In the "Webhook URL" field, add {keep_webhook_api_url}. 9. Add a request header with the key "x-api-key" and the value as {api_key}. 10. 
Leave the Authentication as "No Authentication". 11. Add the "Challenge secret" as "keep-netdata-webhook-integration". 12. Save the configuration. """ SEVERITIES_MAP = { "warning": AlertSeverity.WARNING, "info": AlertSeverity.INFO, "critical": AlertSeverity.CRITICAL, } STATUS_MAP = { "reachable": AlertStatus.RESOLVED, "unreachable": AlertStatus.FIRING, } PROVIDER_DISPLAY_NAME = "Netdata" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): """ Validates required configuration for Prometheus's provider. """ # no config pass @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: alert = AlertDto( id=event["id"] if "id" in event else None, name=event["name"] if "name" in event else None, host=event["host"], message=event["message"], severity=NetdataProvider.SEVERITIES_MAP.get( event["severity"], AlertSeverity.INFO ), status=( NetdataProvider.STATUS_MAP.get( event["status"]["text"], AlertStatus.FIRING ) if "status" in event else AlertStatus.FIRING ), alert=event["alert"] if "alert" in event else None, url=( event["alert_url"] or event["url"] if "alert_url" in event or "url" in event else None ), chart=event["chart"] if "chart" in event else None, alert_class=event["class"] if "class" in event else None, context=event["context"] if "context" in event else None, lastReceived=event["date"] if "date" in event else None, duration=event["duration"] if "duration" in event else None, info=event["info"] if "info" in event else None, space=event["space"] if "space" in event else None, total_critical=( event["total_critical"] if "total_critical" in event else None ), total_warnings=( event["total_warnings"] if "total_warnings" in event else None ), value=event["value"] if "value" in event else None, ) return alert if __name__ == "__main__": pass 
class NetxmsProvider(BaseProvider):
    """NetXMS provider; flagged as coming soon."""

    PROVIDER_DISPLAY_NAME = "NetXMS"
    PROVIDER_CATEGORY = ["Monitoring"]
    PROVIDER_COMING_SOON = True

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Nothing is held open by this provider, so there is nothing to release.
        """
        return None

    def validate_config(self):
        """Build the authentication config from the raw provider config."""
        auth_settings = self.config.authentication
        self.authentication_config = NetxmsProviderAuthConfig(**auth_settings)
""" import dataclasses import json import logging from datetime import datetime import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_config_exception import ProviderConfigException from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import HttpsUrl @pydantic.dataclasses.dataclass class NewrelicProviderAuthConfig: """ Destinations can be only be created through ADMIN User key. reference: https://api.newrelic.com/docs/#/Deprecation%20Notice%20-%20Alerts%20Channels/post_alerts_channels_json not mentioned in GraphQL docs though, got to know after trying this out. """ api_key: str = dataclasses.field( metadata={ "required": True, "description": "New Relic User key. To receive webhooks, use `User key` of an admin account", "sensitive": True, } ) account_id: str = dataclasses.field( metadata={"required": True, "description": "New Relic account ID"} ) new_relic_api_url: HttpsUrl = dataclasses.field( metadata={ "required": False, "description": "New Relic API URL", "validation": "https_url" }, default="https://api.newrelic.com", ) class NewrelicProvider(BaseProvider): """Get alerts from New Relic into Keep.""" PROVIDER_CATEGORY = ["Monitoring"] NEWRELIC_WEBHOOK_NAME = "keep-webhook" PROVIDER_DISPLAY_NAME = "New Relic" PROVIDER_SCOPES = [ ProviderScope( name="ai.issues:read", description="Required to read issues and related information", mandatory=True, mandatory_for_webhook=False, documentation_url="https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/", alias="Rules Reader", ), ProviderScope( name="ai.destinations:read", description="Required to read whether keep webhooks are registered", 
mandatory=False, mandatory_for_webhook=True, documentation_url="https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/", alias="Rules Reader", ), ProviderScope( name="ai.destinations:write", description="Required to register keep webhooks", mandatory=False, mandatory_for_webhook=True, documentation_url="https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/", alias="Rules Writer", ), ProviderScope( name="ai.channels:read", description="Required to know informations about notification channels.", mandatory=False, mandatory_for_webhook=True, documentation_url="https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/", alias="Rules Reader", ), ProviderScope( name="ai.channels:write", description="Required to create notification channel", mandatory=False, mandatory_for_webhook=True, documentation_url="https://docs.newrelic.com/docs/accounts/accounts-billing/new-relic-one-user-management/user-management-concepts/", alias="Rules Writer", ), ] SEVERITIES_MAP = { "critical": AlertSeverity.CRITICAL, "warning": AlertSeverity.WARNING, "info": AlertSeverity.INFO, } STATUS_MAP = { "open": AlertStatus.FIRING, "closed": AlertStatus.RESOLVED, "acknowledged": AlertStatus.ACKNOWLEDGED, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Nothing to dispose here """ pass def validate_config(self): """ Validates required configuration for New-Relic provider. 
def __make_add_webhook_destination_query(self, url: str, name: str) -> dict:
    """
    Build the GraphQL mutation that creates a WEBHOOK notification destination.

    Args:
        url (str): URL stored in the destination's `url` property.
        name (str): name of the destination.

    Returns:
        dict: request body of the form {"query": <mutation text>}.
    """
    # json.dumps produces a quoted, escaped literal that is also a valid GraphQL
    # string, so quotes or backslashes in the inputs cannot break the document.
    # Whitespace inside the document is not significant to GraphQL.
    query = f"""mutation {{
        aiNotificationsCreateDestination(
            accountId: {self.newrelic_config.account_id}
            destination: {{
                type: WEBHOOK,
                name: {json.dumps(name)},
                properties: [{{key: "url", value: {json.dumps(url)}}}]
            }}
        ) {{
            destination {{
                id
                name
            }}
        }}
    }}"""
    return {
        "query": query,
    }

def __make_delete_webhook_destination_query(self, destination_id: str) -> dict:
    """
    Build the GraphQL mutation that deletes a notification destination.

    Args:
        destination_id (str): id of the destination to delete.

    Returns:
        dict: request body of the form {"query": <mutation text>}.
    """
    # Same escaping approach as the create mutation.
    query = f"""mutation {{
        aiNotificationsDeleteDestination(
            accountId: {self.newrelic_config.account_id}
            destinationId: {json.dumps(destination_id)}
        ) {{
            ids
        }}
    }}"""
    return {
        "query": query,
    }
def _query(self, nrql="", **kwargs: dict):
    """
    Run an NRQL query against the configured New Relic account through the GraphQL endpoint.

    Args:
        nrql (str): NRQL query to execute.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        dict: decoded JSON body of the response. The rows are located at
            ["data"]["actor"]["account"]["nrql"]["results"].

    Raises:
        ProviderConfigException: if `nrql` is empty.
        ProviderException: if the HTTP response status is not OK.
    """
    if not nrql:
        raise ProviderConfigException(
            "Missing NRQL query", provider_id=self.provider_id
        )

    # The document must close every selection set it opens (actor, account,
    # nrql, results). json.dumps quotes and escapes the NRQL so that embedded
    # double quotes or backslashes do not terminate the GraphQL string early.
    query = (
        f"{{actor {{account(id: {self.newrelic_config.account_id}) "
        f"{{nrql(query: {json.dumps(nrql)}) {{results}}}}}}}}"
    )
    payload = {"query": query}

    response = requests.post(
        self.new_relic_graphql_url,
        headers={"Api-Key": self.newrelic_config.api_key},
        json=payload,
    )
    if not response.ok:
        self.logger.debug(
            "Failed to query New Relic",
            extra={"response": response.text, "query": query},
        )
        raise ProviderException(f"Failed to query New Relic: {response.text}")
    # The whole body is returned; callers pick the results out of it.
    return response.json()
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> AlertDto:
    """
    Convert a New Relic webhook event into an AlertDto.

    The event is expected to follow the template registered by this provider,
    which mirrors the generic AlertDto fields. Keys consumed here are popped
    from `event`; everything left over is passed to AlertDto as-is.

    Args:
        event (dict): the incoming event; it is mutated (keys are popped).
        provider_instance (BaseProvider | None): not used.

    Returns:
        AlertDto: the alert with status and severity mapped to Keep values.
    """
    logger = logging.getLogger(__name__)
    logger.info("Got event from New Relic")

    lastReceived = event.pop("lastReceived", None)  # from Keep policy
    if lastReceived:
        if isinstance(lastReceived, int):
            # epoch milliseconds -> ISO timestamp
            lastReceived = datetime.utcfromtimestamp(
                lastReceived / 1000
            ).isoformat()
        else:
            # Unexpected type: the raw value is passed on unchanged.
            logger.error("lastReceived is not int")
    else:
        # `or 0` also covers an event that carries an explicit null updatedAt.
        lastReceived = datetime.utcfromtimestamp(
            (event.get("updatedAt") or 0) / 1000
        ).isoformat()

    # format status and severity to Keep format
    # (str(... or "") keeps a null or non-string value from crashing .lower())
    status = event.pop("status", "") or event.pop("state", "")
    status = NewrelicProvider.STATUS_MAP.get(
        str(status or "").lower(), AlertStatus.FIRING
    )

    severity = event.pop("severity", "") or event.pop("priority", "")
    severity = NewrelicProvider.SEVERITIES_MAP.get(
        str(severity or "").lower(), AlertSeverity.INFO
    )

    name = event.pop("name", "")
    if not name:
        name = event.get("title", "")

    logger.info("Formatted event from New Relic")

    # `source` must leave the event, otherwise AlertDto() receives the keyword
    # twice. It is forwarded as `newrelic_source` only when the event had one;
    # previously a missing key left the variable unbound and raised.
    extra_fields = {}
    if "source" in event:
        extra_fields["newrelic_source"] = event.pop("source")

    return AlertDto(
        source=["newrelic"],
        name=name,
        lastReceived=lastReceived,
        status=status,
        severity=severity,
        **extra_fields,
        **event,
    )
def __add_webhook_destination(self, name: str, url: str) -> str | None:
    """
    Create a webhook destination and return its id.

    Args:
        name (str): name given to the destination.
        url (str): URL the destination points to.

    Returns:
        str | None: id from the create-destination response, or None when the
            request or the response lookup fails (the failure is logged).
    """
    try:
        mutation = self.__make_add_webhook_destination_query(name=name, url=url)
        reply = requests.post(
            self.new_relic_graphql_url,
            headers=self.__headers,
            json=mutation,
        )
        created = reply.json()["data"]["aiNotificationsCreateDestination"]
        return created["destination"]["id"]
    except Exception:
        self.logger.exception("Error creating destination for webhook")
        return None
Here set the template you want once set query channels with sort in descending order by CREATED_AT, maek sure to choose pay key and value in enteties. copy the string value of format change: { to {{, } to }}, \n to \\n, \t to \\t, " to \" """ mutation_query = """ mutation {{ aiNotificationsCreateChannel( accountId: {account_id}, channel: {{ name: "{name}", product: IINT, type: WEBHOOK, destinationId: "{destination_id}", properties: [ {{ key: "headers", value: "{{ \\\"X-API-KEY\\\":\\\"{api_key}\\\"}}" }}, {{ key: "payload", value: "{{\\n\\t\\\"id\\\": {{{{ json issueId }}}},\\n\\t\\\"issueUrl\\\": {{{{ json issuePageUrl }}}},\\n\\t\\\"name\\\": {{{{ json annotations.title.[0] }}}},\\n\\t\\\"severity\\\": {{{{ json priority }}}},\\n\\t\\\"impactedEntities\\\": {{{{ json entitiesData.names }}}},\\n\\t\\\"totalIncidents\\\": {{{{ json totalIncidents }}}},\\n\\t\\\"status\\\": {{{{ json state }}}},\\n\\t\\\"trigger\\\": {{{{ json triggerEvent }}}},\\n\\t\\\"isCorrelated\\\": {{{{ json isCorrelated }}}},\\n\\t\\\"createdAt\\\": {{{{ createdAt }}}},\\n\\t\\\"updatedAt\\\": {{{{ updatedAt }}}},\\n\\t\\\"lastReceived\\\": {{{{ updatedAt }}}},\\n\\t\\\"source\\\": {{{{ json accumulations.source }}}},\\n\\t\\\"alertPolicyNames\\\": {{{{ json accumulations.policyName }}}},\\n\\t\\\"alertConditionNames\\\": {{{{ json accumulations.conditionName }}}},\\n\\t\\\"workflowName\\\": {{{{ json workflowName }}}}\\n}}" }} ] }} ) {{ channel {{ id }} }} }} """.format( account_id=self.newrelic_config.account_id, destination_id=destination_id, name=name, api_key=api_key, ) query = {"query": mutation_query} # print(query) response = requests.post( self.new_relic_graphql_url, headers=self.__headers, json=query ) # print(response.json()) new_id = response.json()["data"]["aiNotificationsCreateChannel"]["channel"][ "id" ] return new_id except Exception: self.logger.exception("Error creating channel for webhook") def __get_workflow_by_name_and_channel( self, name: str, channel_id: str ) -> str | 
def __add_new_worflow(
    self, channel_id: str, policy_ids: list, name: str
) -> str | None:
    """
    Create a New Relic workflow that sends issues of the given policies to a channel.

    Args:
        channel_id (str): id of the notification channel to deliver to.
        policy_ids (list): policy ids matched against `labels.policyIds`.
        name (str): name of the workflow.

    Returns:
        str | None: id of the created workflow, or None when the request or the
            response lookup fails (the failure is logged).
    """
    try:
        # Whitespace inside the document is not significant to GraphQL.
        query = {
            "query": f"""
            mutation {{
              aiWorkflowsCreateWorkflow(
                accountId: {self.newrelic_config.account_id}
                createWorkflowData: {{
                  destinationConfigurations: {{
                    channelId: "{channel_id}",
                    notificationTriggers: [ACTIVATED, ACKNOWLEDGED, CLOSED, PRIORITY_CHANGED, OTHER_UPDATES]
                  }},
                  issuesFilter: {{
                    predicates: [
                      {{
                        attribute: "labels.policyIds",
                        operator: EXACTLY_MATCHES,
                        values: {json.dumps(policy_ids)}
                      }}
                    ],
                    type: FILTER
                  }},
                  workflowEnabled: true,
                  destinationsEnabled: true,
                  mutingRulesHandling: DONT_NOTIFY_FULLY_MUTED_ISSUES
                  name: "{name}",
                }}
              ) {{
                workflow {{
                  id
                }}
              }}
            }}
            """
        }
        response = requests.post(
            self.new_relic_graphql_url, headers=self.__headers, json=query
        )
        return response.json()["data"]["aiWorkflowsCreateWorkflow"]["workflow"][
            "id"
        ]
    except Exception:
        # This is the workflow step, not the channel step; say so in the log.
        self.logger.exception("Error creating workflow for webhook")
        return None
if __name__ == "__main__":
    # Manual smoke test: validates scopes, lists alerts and installs the webhook
    # against a real New Relic account taken from the environment.
    # Output debug messages
    import logging

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
    context_manager = ContextManager(
        tenant_id="singletenant",
        workflow_id="test",
    )
    # Load environment variables
    import os

    api_key = os.environ.get("NEWRELIC_API_KEY")
    account_id = os.environ.get("NEWRELIC_ACCOUNT_ID")
    # Key that New Relic will send back to Keep in the X-API-KEY webhook header.
    keep_api_key = os.environ.get("KEEP_API_KEY")

    provider_config = {
        "authentication": {"api_key": api_key, "account_id": account_id},
    }
    from keep.providers.providers_factory import ProvidersFactory

    provider = ProvidersFactory.get_provider(
        context_manager=context_manager,
        provider_id="newrelic-keephq",
        provider_type="newrelic",
        provider_config=provider_config,
    )

    scopes = provider.validate_scopes()
    # print(scopes)

    alerts = provider.get_alerts()
    # print(alerts)

    # setup_webhook requires api_key; omitting it raised TypeError.
    created = provider.setup_webhook(
        tenant_id="test-v2",
        keep_api_url="https://6fd6-2401-4900-1cb0-3b5f-6d04-474-81c5-30c7.ngrok-free.app/alerts/event",
        api_key=keep_api_key,
        setup_alerts=True,
    )
    # print(created)
""" import base64 import dataclasses from urllib.parse import urljoin import pydantic import requests from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class NtfyProviderAuthConfig: """ NtfyProviderAuthConfig is a class that holds the authentication information for the NtfyProvider. """ access_token: str = dataclasses.field( metadata={ "required": False, "description": "Ntfy Access Token", "sensitive": True, }, default=None, ) host: pydantic.AnyHttpUrl | None = dataclasses.field( metadata={ "required": False, "description": "Ntfy Host URL (For self-hosted Ntfy only)", "sensitive": False, "hint": "http://localhost:80", "validation": "any_http_url", }, default=None, ) username: str = dataclasses.field( metadata={ "required": False, "description": "Ntfy Username (For self-hosted Ntfy only)", "sensitive": False, }, default=None, ) password: str = dataclasses.field( metadata={ "required": False, "description": "Ntfy Password (For self-hosted Ntfy only)", "sensitive": True, }, default=None, ) class NtfyProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Ntfy.sh" PROVIDER_CATEGORY = ["Collaboration"] PROVIDER_SCOPES = [ ProviderScope( name="send_alert", mandatory=True, alias="Send Alert", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def validate_scopes(self) -> dict[str, bool | str]: validated_scopes = {} validated_scopes["send_alert"] = True return validated_scopes def validate_config(self): self.authentication_config = NtfyProviderAuthConfig( **self.config.authentication ) if ( self.authentication_config.access_token is None and self.authentication_config.host is None ): raise 
def __send_alert(self, message="", topic=None, **kwargs):
    """
    POST a message to an ntfy topic.

    The target is the configured host when one is set, otherwise https://ntfy.sh/.

    Args:
        message (str): body of the notification.
        topic (str): topic name, joined onto the base URL.
        **kwargs: extra keyword arguments forwarded by `_notify`; accepted and
            ignored so that they no longer cause a TypeError.

    Returns:
        The decoded JSON body of the ntfy response.

    Raises:
        ProviderException: on a 401 response, any other HTTP error, a network
            failure, or a response body that is not JSON.
    """
    self.logger.debug(f"Sending notification to {topic}")

    if self.authentication_config.host is not None:
        base_url = self.authentication_config.host
        # urljoin drops the last path segment of a base without a trailing slash
        if not base_url.endswith("/"):
            base_url += "/"
        NTFY_URL = urljoin(base=base_url, url=topic)
    else:
        NTFY_URL = urljoin(base="https://ntfy.sh/", url=topic)

    try:
        response = requests.post(
            NTFY_URL, headers=self.__get_auth_headers(), data=message
        )

        if response.status_code == 401:
            raise ProviderException(
                f"Failed to send notification to {NTFY_URL}. Error: Unauthorized"
            )

        response.raise_for_status()
        return response.json()
    except ProviderException:
        # Already carries the final message; re-wrapping would duplicate it.
        raise
    except Exception as e:
        raise ProviderException(
            f"Failed to send notification to {NTFY_URL}. Error: {e}"
        ) from e
# Provider that sends prompts to the /api/generate endpoint of an Ollama server.
class OllamaProvider(BaseProvider):
    PROVIDER_DISPLAY_NAME = "Ollama"
    PROVIDER_CATEGORY = ["AI"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build OllamaProviderAuthConfig from the `authentication` section of the config."""
        self.authentication_config = OllamaProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """Nothing to release."""
        pass

    def validate_scopes(self) -> dict[str, bool | str]:
        """Return an empty mapping; this provider declares no scopes."""
        scopes = {}
        return scopes

    def _query(
        self,
        prompt,
        model="llama3.1:8b-instruct-q6_K",
        max_tokens=1024,
        structured_output_format=None,
    ):
        """
        Send a prompt to Ollama and return the generated text.

        Args:
            prompt: text sent as the `prompt` field.
            model: model name sent as the `model` field.
            max_tokens: sent as `options.num_predict`.
            structured_output_format: when not None, sent as the `format` field;
                the answer is then parsed as JSON if possible.

        Returns:
            dict: {"response": <text, or the parsed JSON value when a structured
                format was requested and the text parsed>}.

        Raises:
            Exception: when the HTTP request fails; the original requests
                exception is attached as the cause.
        """
        # Build the API URL; a trailing slash on the configured host would
        # otherwise produce "//api/generate".
        host = str(self.authentication_config.host).rstrip("/")
        api_url = f"{host}/api/generate"

        # Prepare the request payload
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "raw": True,  # Raw mode for more consistent output
            "options": {
                "num_predict": max_tokens,
            },
        }

        if structured_output_format is not None:
            payload["format"] = structured_output_format

        try:
            # Make the API request
            response = requests.post(api_url, json=payload)
            response.raise_for_status()

            content = response.json()["response"]

            # Try to parse as JSON if structured output was requested
            if structured_output_format:
                try:
                    content = json.loads(content)
                except Exception:
                    # Best effort: fall back to the raw text.
                    pass

            return {
                "response": content,
            }

        except requests.exceptions.RequestException as e:
            raise Exception(f"Error calling Ollama API: {str(e)}") from e
# Provider that forwards prompts to the OpenAI chat completions API.
class OpenaiProvider(BaseProvider):
    PROVIDER_DISPLAY_NAME = "OpenAI"
    PROVIDER_CATEGORY = ["AI"]

    def __init__(
        self,
        context_manager: ContextManager,
        provider_id: str,
        config: ProviderConfig,
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build OpenaiProviderAuthConfig from the `authentication` section of the config."""
        auth_settings = self.config.authentication
        self.authentication_config = OpenaiProviderAuthConfig(**auth_settings)

    def dispose(self):
        """Nothing to release."""
        return None

    def validate_scopes(self) -> dict[str, bool | str]:
        """Return an empty mapping; this provider declares no scopes."""
        return {}

    def _query(
        self,
        prompt,
        model="gpt-3.5-turbo",
        max_tokens=1024,
        structured_output_format=None,
    ):
        """
        Send a single user message to the chat completions API.

        Returns:
            dict: {"response": <content of the first choice, parsed as JSON
                when it parses, otherwise the raw content>}.
        """
        auth = self.authentication_config
        client = OpenAI(api_key=auth.api_key, organization=auth.organization_id)

        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            response_format=structured_output_format,
        )
        answer = completion.choices[0].message.content

        # Hand back structured data when the answer is valid JSON.
        try:
            answer = json.loads(answer)
        except Exception:
            pass

        return {"response": answer}
class ResourceAlreadyExists(Exception):
    # Going by the name, signals that a resource to be created is already
    # present; where it is raised is outside this excerpt.
    def __init__(self, *args):
        # Pass every positional argument straight through to Exception.
        Exception.__init__(self, *args)
class OpenobserveProvider(BaseProvider):
    """Install Webhooks and receive alerts from OpenObserve."""

    PROVIDER_DISPLAY_NAME = "OpenObserve"
    PROVIDER_CATEGORY = ["Monitoring"]

    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="User is Authorized",
            mandatory=True,
            mandatory_for_webhook=True,
            alias="Rules Reader",
        ),
    ]

    SEVERITIES_MAP = {
        "ERROR": AlertSeverity.CRITICAL,
        "WARN": AlertSeverity.WARNING,
        "INFO": AlertSeverity.INFO,
    }

    # Upper bound, in seconds, for every HTTP call made to OpenObserve so an
    # unresponsive server cannot block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for OpenObserve provider.

        For installed or provisioned providers a host stored without a scheme
        gets one prepended ("http://" for localhost/127.0.0.1, "https://"
        otherwise) before the authentication config is built.
        """
        if self.is_installed or self.is_provisioned:
            host = self.config.authentication["openObserveHost"]
            if not host.startswith(("http://", "https://")):
                is_local = "localhost" in host or "127.0.0.1" in host
                scheme = "http://" if is_local else "https://"
                self.config.authentication["openObserveHost"] = scheme + host

        self.authentication_config = OpenobserveProviderAuthConfig(
            **self.config.authentication
        )

    def __get_url(self, paths: List[str] = None, query_params: dict = None, **kwargs):
        """
        Helper method to build the url for OpenObserve api requests.

        Args:
            paths: Path segments, joined with "/" and appended to host:port.
            query_params: Optional mapping url-encoded into the query string.

        Example:
            paths = ["api", "default", "alerts"]
            query_params = {"page": 1}
            # with host http://localhost and port 5080:
            # url = http://localhost:5080/api/default/alerts?page=1
        """
        # A trailing slash on the host would otherwise end up before the port.
        host = str(self.authentication_config.openObserveHost).rstrip("/")
        url = urljoin(
            f"{host}:{self.authentication_config.openObservePort}",
            "/".join(str(path) for path in (paths or [])),
        )

        # add query params
        if query_params:
            url = f"{url}?{urlencode(query_params)}"

        return url

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Check the configured credentials by posting them to ``auth/login``.

        Returns:
            dict: ``{"authenticated": value}`` where value is the ``status``
            field of the login response on an OK reply, ``False`` on a non-OK
            reply, or the exception text when the request itself failed.
        """
        authenticated = False
        self.logger.info("Validating OpenObserve Scopes")
        try:
            response = requests.post(
                url=self.__get_url(paths=["auth/login"]),
                json={
                    "name": self.authentication_config.openObserveUsername,
                    "password": self.authentication_config.openObservePassword,
                },
                timeout=self.REQUEST_TIMEOUT,
            )
        except Exception as e:
            # `extra` must be a mapping; passing the exception object itself
            # makes the logging call fail inside this handler.
            self.logger.error(
                "Error while validating scopes for OpenObserve",
                extra={"error": str(e)},
            )
            return {"authenticated": str(e)}

        if response.ok:
            authenticated = response.json()["status"]
        else:
            self.logger.error(
                "Error while validating scopes for OpenObserve",
                extra={"status_code": response.status_code, "error": response.text},
            )

        return {"authenticated": authenticated}

    def __get_auth(self) -> tuple[str, str]:
        """Return the (username, password) pair passed as ``auth`` to requests."""
        return (
            self.authentication_config.openObserveUsername,
            self.authentication_config.openObservePassword,
        )

    def __update_alert_template(self, data):
        """PUT ``data`` over the existing ``KeepAlertTemplate`` and log the outcome."""
        res = requests.put(
            url=self.__get_url(
                paths=[
                    f"api/{self.authentication_config.organisationID}/alerts/templates/KeepAlertTemplate"
                ]
            ),
            json=data,
            auth=self.__get_auth(),
            timeout=self.REQUEST_TIMEOUT,
        )
        if res.ok:
            res = res.json()
            if res["code"] == 200:
                self.logger.info("Alert template Updated Successfully")
            else:
                self.logger.error(
                    "Failed to update Alert Template",
                    extra={"code": res["code"], "error": res["message"]},
                )
        else:
            self.logger.error(
                "Error while updating Alert Template",
                extra={"status_code": res.status_code, "error": res.text},
            )

    def __create_alert_template(self):
        """
        Create ``KeepAlertTemplate`` from the bundled ``alerttemplate.json``.

        When OpenObserve answers that the template already exists, the
        template is updated instead. Request/parsing errors are logged, not
        raised.
        """
        # This is the template used for creating the alert template in openobserve
        with open(Path(__file__).parent / "alerttemplate.json", "rt") as template:
            data = template.read()

        payload = {"body": data, "isDefault": False, "name": "KeepAlertTemplate"}
        try:
            res = requests.post(
                self.__get_url(
                    paths=[
                        f"api/{self.authentication_config.organisationID}/alerts/templates"
                    ]
                ),
                json=payload,
                auth=self.__get_auth(),
                timeout=self.REQUEST_TIMEOUT,
            )
            res = res.json()
            if res["code"] == 200:
                self.logger.info("Alert template Successfully Created")
            elif "already exists" in res["message"]:
                self.logger.info(
                    "Alert template creation failed as it already exists",
                    extra={"code": res["code"], "error": res["message"]},
                )
                self.logger.info(
                    "Attempting to Update Alert Template with latest data..."
                )
                self.__update_alert_template(data=payload)
            else:
                self.logger.error(
                    "Alert template creation failed",
                    extra={"code": res["code"], "error": res["message"]},
                )
        except Exception as e:
            self.logger.error(
                "Error While making alert Template",
                extra={"error": str(e)},
            )

    def __update_destination(self, keep_api_url: str, api_key: str, data):
        """
        PUT ``data`` over the existing ``KeepDestination`` and log the outcome.

        ``keep_api_url`` and ``api_key`` are accepted for call-site symmetry
        with ``__create_destination``; only ``data`` is sent.
        """
        res = requests.put(
            json=data,
            url=self.__get_url(
                paths=[
                    f"api/{self.authentication_config.organisationID}/alerts/destinations/KeepDestination"
                ]
            ),
            auth=self.__get_auth(),
            timeout=self.REQUEST_TIMEOUT,
        )
        if res.ok:
            self.logger.info("Destination Successfully Updated")
        else:
            self.logger.error(
                "Error while updating destination",
                extra={"code": res.status_code, "error": res.text},
            )

    def __create_destination(self, keep_api_url: str, api_key: str):
        """
        Create the ``KeepDestination`` webhook destination pointing at Keep.

        Falls back to updating the destination when OpenObserve reports that
        it already exists.
        """
        data = {
            "headers": {
                "X-API-KEY": api_key,
            },
            "method": "post",
            "name": "KeepDestination",
            "template": "KeepAlertTemplate",
            "url": keep_api_url,
        }
        response = requests.post(
            url=self.__get_url(
                paths=[
                    f"api/{self.authentication_config.organisationID}/alerts/destinations"
                ]
            ),
            auth=self.__get_auth(),
            json=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        res = response.json()
        if res["code"] == 200:
            self.logger.info("Destination Successfully Created")
        elif "already exists" in res["message"]:
            self.logger.info("Destination creation failed as it already exists")
            self.logger.info("Attempting to Update Destination...")
            self.__update_destination(
                keep_api_url=keep_api_url, api_key=api_key, data=data
            )
        else:
            self.logger.error(
                "Destination creation failed",
                extra={"code": res["code"], "error": res["message"]},
            )

    def __get_all_stream_names(self) -> list[str]:
        """Return the ``name`` of every stream listed for the organisation."""
        response = requests.get(
            url=self.__get_url(
                paths=[f"api/{self.authentication_config.organisationID}/streams"]
            ),
            auth=self.__get_auth(),
            timeout=self.REQUEST_TIMEOUT,
        )
        return [stream["name"] for stream in response.json()["list"]]

    def __get_and_update_actions(self):
        """
        Add ``KeepDestination`` to every alert that does not reference it yet.

        Alerts are listed for the configured organisation and each modified
        alert is written back with a PUT.
        """
        org_id = self.authentication_config.organisationID
        response = requests.get(
            url=self.__get_url(paths=[f"api/{org_id}/alerts"]),
            auth=self.__get_auth(),
            timeout=self.REQUEST_TIMEOUT,
        )
        res = response.json()
        for alert in res["list"]:
            alert_stream = alert["stream_name"]
            alert_name = alert["name"]
            if "KeepDestination" not in alert["destinations"]:
                alert["destinations"].append("KeepDestination")
                self.logger.info(
                    f"Updating Alert: {alert_name} in Stream: {alert_stream}"
                )
                # Use the configured organisation, as the listing call above
                # does, instead of a hard-coded "default" organisation.
                update_response = requests.put(
                    url=self.__get_url(
                        paths=[f"api/{org_id}/{alert_stream}/alerts/{alert_name}"]
                    ),
                    auth=self.__get_auth(),
                    json=alert,
                    timeout=self.REQUEST_TIMEOUT,
                )
                update_res = update_response.json()
                if update_res["code"] == 200:
                    self.logger.info(
                        f"Updated Alert: {alert_name} in Stream: {alert_stream}",
                        extra={
                            "code": update_res["code"],
                            "error": update_res["message"],
                        },
                    )
                else:
                    self.logger.error(
                        f"Error while updating Alert: {alert_name} in Stream: {alert_stream}",
                        extra={
                            "code": update_res["code"],
                            "error": update_res["message"],
                        },
                    )

    def setup_webhook(
        self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
    ):
        """
        Install the Keep webhook in OpenObserve.

        Creates (or updates) the alert template and the destination, then
        attaches the destination to existing alerts. Template and alert-update
        failures are logged and do not abort the setup; destination failures
        propagate.
        """
        try:
            self.__create_alert_template()
        except Exception as e:
            self.logger.error(
                "Error while creating Alert Template", extra={"error": str(e)}
            )
        self.__create_destination(keep_api_url=keep_api_url, api_key=api_key)
        try:
            self.__get_and_update_actions()
        except Exception as e:
            self.logger.error("Error while updating Alerts", extra={"error": str(e)})
        self.logger.info("Webhook created")

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | List[AlertDto]:
        """
        Convert an OpenObserve webhook payload into one or more alerts.

        ``event`` is consumed (keys are popped) while formatting. A payload
        whose ``alert_agg_value`` holds either one value or exactly
        ``alert_count`` comma-separated values is treated as aggregated and
        yields one alert per entry of ``rows``; any other payload yields a
        single alert.

        Raises:
            ValueError: an aggregated payload carries no ``rows``.
        """
        logger = logging.getLogger(__name__)
        alert_name = event.pop("alert_name", "")
        # OpenObserve does not provide severity
        severity = AlertSeverity.WARNING
        # Mapping 'stream_name' to 'environment'
        environment = event.pop("stream_name", "")
        # Mapping 'alert_start_time' to 'startedAt'
        startedAt = event.pop("alert_start_time", "")
        # NOTE(review): 'alert_start_time' was already consumed by the pop
        # above, so this always evaluates to "". Left unchanged because the
        # effect of passing a real timestamp to AlertDto is not visible here;
        # confirm the intended source key.
        lastReceived = event.pop("alert_start_time", "")
        # Mapping 'alert_type' to 'description'
        description = event.pop("alert_type", "")

        alert_url = event.pop("alert_url", "")

        org_name = event.pop("org_name", "")
        # Our only way to distinguish between non aggregated alert and aggregated alerts is the alert_agg_value
        if "alert_agg_value" in event and (
            len(event["alert_agg_value"].split(","))
            == int(event.get("alert_count", -1))
            or len(event["alert_agg_value"].split(",")) == 1
        ):
            logger.info("Formatting openobserve aggregated alert")
            rows = event.pop("rows", "")
            if not rows:
                # No exception is being handled here, so log a plain error.
                logger.error(
                    "Rows not found in the aggregated alert event",
                    extra={"event": event},
                )
                raise ValueError("Rows not found in the aggregated alert event")
            alerts = []
            number_of_rows = int(event.pop("alert_count", ""))
            rows = rows.split("\n")
            agg_values = event.pop("alert_agg_value", "").split(",")
            # if there is only one value, repeat it for all rows
            if len(agg_values) == 1:
                logger.info("Only one value found, repeating it for all rows")
                agg_values = [agg_values[0]] * number_of_rows
            # trim
            agg_values = [agg_value.strip() for agg_value in agg_values]

            # Never index past the rows that were actually delivered.
            if len(rows) < number_of_rows:
                logger.warning(
                    "Aggregated alert has fewer rows than alert_count",
                    extra={"row_count": len(rows), "alert_count": number_of_rows},
                )
            for i in range(min(number_of_rows, len(rows))):
                try:
                    logger.info(
                        "Formatting aggregated alert",
                        extra={"row": rows[i]},
                    )
                    row = rows[i]
                    value = agg_values[i]
                    # try to parse value as a number since its metric
                    try:
                        value = float(value)
                    except ValueError:
                        pass

                    # Rows may arrive with single-quoted keys/values; retry
                    # once with the quotes swapped before giving up on a row.
                    try:
                        row_data = json.loads(row)
                    except json.JSONDecodeError:
                        try:
                            row_data = json.loads(row.replace("'", '"'))
                        except json.JSONDecodeError:
                            logger.exception(f"Failed to parse row: {row}")
                            continue
                    row_name = row_data.pop("name", "")
                    if row_name:
                        row_data["row_name"] = row_name
                    group_by_keys = list(row_data.keys())
                    logger.info(
                        "Formatting aggregated alert with group by keys",
                        extra={
                            "group_by_keys": group_by_keys,
                        },
                    )
                    alert_id = str(uuid.uuid4())

                    # we already take the value from the agg_value
                    event.pop("value", "")
                    # if the group_by_key is already in the event, remove it
                    # since we are adding it to the alert_dto
                    for group_by_key in group_by_keys:
                        event.pop(group_by_key, "")

                    alert_dto = AlertDto(
                        id=f"{alert_id}",
                        name=f"{alert_name}: {row_name}" if row_name else f"{alert_name}",
                        severity=severity,
                        environment=environment,
                        startedAt=startedAt,
                        lastReceived=lastReceived,
                        description=description,
                        row_data=row_data,
                        source=["openobserve"],
                        org_name=org_name,
                        value=value,
                        alert_url=alert_url,  # I'm not putting on URL since sometimes it doesn't return full URL so pydantic will throw an error
                        **event,
                        **row_data,
                    )
                    # calculate the fingerprint based on name + group_by_value
                    alert_dto.fingerprint = OpenobserveProvider.get_alert_fingerprint(
                        alert_dto, fingerprint_fields=["name", *group_by_keys]
                    )
                    logger.info(
                        "Formatted openobserve aggregated alert",
                        extra={"fingerprint": alert_dto.fingerprint},
                    )
                    alerts.append(alert_dto)
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse row: {row}")
            return alerts
        # else, one alert, one row, old calculation
        else:
            alert_id = str(uuid.uuid4())
            labels = {
                # 'alert_url' was popped from the event above; reuse that
                # value instead of popping again, which always gave "".
                "url": alert_url,
                "alert_period": event.pop("alert_period", ""),
                "alert_operator": event.pop("alert_operator", ""),
                "alert_threshold": event.pop("alert_threshold", ""),
                "alert_count": event.pop("alert_count", ""),
                "alert_agg_value": event.pop("alert_agg_value", ""),
                "alert_end_time": event.pop("alert_end_time", ""),
            }
            alert_dto = AlertDto(
                id=alert_id,
                name=alert_name,
                severity=severity,
                environment=environment,
                startedAt=startedAt,
                lastReceived=lastReceived,
                description=description,
                labels=labels,
                source=["openobserve"],
                org_name=org_name,
                alert_url=alert_url,  # I'm not putting on URL since sometimes it doesn't return full URL so pydantic will throw an error
                **event,  # any other fields
            )
            # calculate fingerprint based on name + environment + event keys (e.g. host)
            fingerprint_fields = ["name", "environment", *event.keys()]
            # remove 'value' as its too dynamic
            try:
                fingerprint_fields.remove("value")
            except ValueError:
                pass
            logger.info(
                "Calculating fingerprint fields",
                extra={"fingerprint_fields": fingerprint_fields},
            )

            # sort the fields to ensure the fingerprint is consistent
            # for e.g. host1, host2 is the same as host2, host1
            for field in fingerprint_fields:
                try:
                    field_attr = getattr(alert_dto, field)
                    if "," not in field_attr:
                        continue
                    # sort it lexicographically
                    logger.info(
                        "Sorting field attributes",
                        extra={"field": field, "field_attr": field_attr},
                    )
                    sorted_field_attr = sorted(field_attr.replace(" ", "").split(","))
                    sorted_field_attr = ", ".join(sorted_field_attr)
                    logger.info(
                        "Sorted field attributes",
                        extra={"field": field, "sorted_field_attr": sorted_field_attr},
                    )
                    # set the attr
                    setattr(alert_dto, field, sorted_field_attr)
                except AttributeError:
                    pass
                except Exception as e:
                    logger.error(
                        "Error while sorting field attributes",
                        extra={"field": field, "error": e},
                    )

            alert_dto.fingerprint = OpenobserveProvider.get_alert_fingerprint(
                alert_dto, fingerprint_fields=fingerprint_fields
            )
            logger.info(
                "Formatted openobserve alert",
                extra={"fingerprint": alert_dto.fingerprint},
            )
            return alert_dto
Select Collection Type `Search` 3. Security: `Standard Create` 4. Encryption: `Use AWS owned key` 5. Access collections from: `Public` 6. Resource type: Select both Checkboxes. 5. Next 6. `Add principals` > `IAM User and Roles` > Select a User of your choice. 7. Grant access to Index: `Create Index`, `Read documents` & `Write or update documents`. 8. Enter a random policy name. 9. Submit 10. Wait for the deployment to be complete. 11. Meanwhile go to IAM. 12. Go to Access Management > Users > Click the user you selected in step 6. 13. Create an access key and download/save it. 14. Go to Add permission > Create inline policy > JSON and paste this: ```json { "Version": "2012-10-17", "Statement": [ { "Sid": "VisualEditor0", "Effect": "Allow", "Action": [ "iam:SimulatePrincipalPolicy", "aoss:GetAccessPolicy", "aoss:APIAccessAll", "aoss:ListAccessPolicies" ], "Resource": "*" } ] } ``` 15. Click Next > Give a Policy name > Save. 16. Go back to your collection and copy the `OpenSearch endpoint`. This is your Domain. ================================================ FILE: keep/providers/opensearchserverless_provider/__init__.py ================================================ ================================================ FILE: keep/providers/opensearchserverless_provider/opensearchserverless_provider.py ================================================ """ OpensearchProvider is a class that provides a way to read/add data from AWS Opensearch. 
@pydantic.dataclasses.dataclass
class OpensearchserverlessProviderAuthConfig:
    """OpenSearch Serverless authentication configuration."""

    domain_endpoint: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Domain endpoint",
            "sensitive": False,
        },
    )
    region: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "AWS region",
            "sensitive": False,
        },
    )
    access_key: str = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "AWS access key",
            "sensitive": True,
        },
    )
    access_key_secret: str = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "AWS access key secret",
            "sensitive": True,
        },
    )


class OpensearchserverlessProvider(BaseProvider, ProviderHealthMixin):
    """Push alarms from AWS Opensearch to Keep."""

    PROVIDER_DISPLAY_NAME = "Opensearch Serverless"
    PROVIDER_CATEGORY = ["Database", "Observability"]

    PROVIDER_SCOPES = [
        ProviderScope(
            name="iam:SimulatePrincipalPolicy",
            description="Required to check if we have access to AOSS API.",
            mandatory=True,
            alias="Needed to test the access for next 3 scopes.",
        ),
        ProviderScope(
            name="aoss:APIAccessAll",
            description="Required to make API calls to OpenSearch Serverless. (Add from IAM console)",
            mandatory=True,
            alias="Access to make API calls to serverless",
        ),
        ProviderScope(
            name="aoss:ListAccessPolicies",
            description="Required to access all Data Access Policies. (Add from IAM console)",
            mandatory=True,
            alias="Needed to list all Data Access Policies.",
        ),
        ProviderScope(
            name="aoss:GetAccessPolicy",
            description="Required to check each policy for read and write scope. (Add from IAM console)",
            mandatory=True,
            alias="Policy read access",
        ),
        ProviderScope(
            name="aoss:CreateIndex",
            description="Required to create indexes while saving a doc.",
            documentation_url="https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations",
            mandatory=True,
            alias="Index Creation Access",
        ),
        ProviderScope(
            name="aoss:ReadDocument",
            description="Required to query.",
            documentation_url="https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations",
            mandatory=True,
            alias="Read Access",
        ),
        ProviderScope(
            name="aoss:WriteDocument",
            description="Required to save documents.",
            documentation_url="https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-genref.html#serverless-operations",
            mandatory=True,
            alias="Write Access",
        ),
    ]

    # Upper bound, in seconds, for HTTP calls to the collection endpoint so an
    # unresponsive endpoint cannot block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        self.auth = None
        self.client = None
        super().__init__(context_manager, provider_id, config)

    def __get_url(self, paths: List[str] = None, query_params: dict = None, **kwargs):
        """
        Helper method to build the url for Opensearch api requests.

        Args:
            paths: Path segments, joined with "/" and appended to the endpoint.
            query_params: Optional mapping url-encoded into the query string.
        """
        # Strip surrounding whitespace first, then the trailing slash, so an
        # endpoint such as "https://host/ " is normalised as well.
        host = self.authentication_config.domain_endpoint.strip().rstrip("/")
        self.logger.info(f"Building URL with host: {host}")
        url = urljoin(
            host,
            "/".join(str(path) for path in (paths or [])),
        )

        # add query params
        if query_params:
            url = f"{url}?{urlencode(query_params)}"

        return url

    def __generate_client(self, aws_client_type: str):
        """Build a boto3 client of the given type from the configured credentials."""
        return boto3.client(
            aws_client_type,
            aws_access_key_id=self.authentication_config.access_key,
            aws_secret_access_key=self.authentication_config.access_key_secret,
            region_name=self.authentication_config.region,
        )

    def validate_scopes(self):
        """
        Validate the provider scopes in two stages.

        1. IAM: simulate the caller's policy for the AOSS API actions. If this
           stage raises, every scope is set to the exception text; if any
           action is denied, the remaining scopes keep their placeholder.
        2. Data access policies: a data scope is granted when at least one
           "index" rule of a policy statement naming the caller's ARN lists
           it; otherwise it is reported as "No Access".

        Returns:
            dict: scope name -> ``True`` or an explanatory string.
        """
        scopes = {
            scope.name: "Access needed to all previous scopes to continue"
            for scope in self.PROVIDER_SCOPES
        }
        actions = scopes.keys()
        try:
            sts_client = self.__generate_client("sts")
            identity = sts_client.get_caller_identity()["Arn"]
            iam_client = self.__generate_client("iam")

            results = iam_client.simulate_principal_policy(
                PolicySourceArn=identity,
                ActionNames=[
                    "aoss:APIAccessAll",
                    "aoss:ListAccessPolicies",
                    "aoss:GetAccessPolicy",
                ],
            )
            scopes["iam:SimulatePrincipalPolicy"] = True
        except Exception as e:
            self.logger.error(e)
            return {name: str(e) for name in scopes}

        all_allowed = True
        for res in results["EvaluationResults"]:
            action = res["EvalActionName"]
            if action in actions:
                allowed = res["EvalDecision"] == "allowed"
                all_allowed = all_allowed and allowed
                scopes[action] = True if allowed else f"{action} is not allowed"

        if not all_allowed:
            self.logger.error(
                "We don't have access to scopes needed to validate the rest"
            )
            return scopes

        left_to_validate = [
            "aoss:CreateIndex",
            "aoss:ReadDocument",
            "aoss:WriteDocument",
        ]
        # Collect grants across ALL matching rules first, so a later rule that
        # lacks a permission cannot overwrite one granted by an earlier rule.
        granted = set()
        try:
            aoss_client = self.__generate_client("opensearchserverless")
            all_policies = aoss_client.list_access_policies(type="data")
            for policy in all_policies["accessPolicySummaries"]:
                curr_policy = aoss_client.get_access_policy(
                    type="data", name=policy["name"]
                )["accessPolicyDetail"]
                for pol in curr_policy["policy"]:
                    if identity not in pol["Principal"]:
                        continue
                    for rule in pol["Rules"]:
                        if rule["ResourceType"] != "index":
                            continue
                        granted.update(
                            left
                            for left in left_to_validate
                            if left in rule["Permission"]
                        )
        except Exception as e:
            for left in left_to_validate:
                scopes[left] = str(e)
            return scopes

        for left in left_to_validate:
            scopes[left] = True if left in granted else "No Access"
        return scopes

    def dispose(self):
        pass

    def validate_config(self):
        """Build the authentication config from the raw provider config."""
        self.authentication_config = OpensearchserverlessProviderAuthConfig(
            **self.config.authentication
        )

    @property
    def __get_headers(self):
        """JSON request/response headers used for every call."""
        return {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    @property
    def __get_auth(self):
        """SigV4 signer for the "aoss" service, created once and cached on ``self.auth``."""
        if self.auth is None:
            self.auth = AWS4Auth(
                self.authentication_config.access_key,
                self.authentication_config.access_key_secret,
                self.authentication_config.region,
                "aoss",
            )
        return self.auth

    def __create_doc(self, index, doc_id, doc):
        """PUT ``doc`` to ``<index>/_doc/<doc_id>`` and return the raw response."""
        url = self.__get_url([index, "_doc", doc_id])
        try:
            return requests.put(
                url,
                headers=self.__get_headers,
                auth=self.__get_auth,
                json=doc,
                timeout=self.REQUEST_TIMEOUT,
            )
        except Exception as e:
            self.logger.error(
                "Error while creating document", extra={"exception": str(e)}
            )
            raise

    def _query(self, query: dict, index: str):
        """
        Run ``query`` against ``<index>/_search``.

        Returns:
            The decoded JSON response body.

        Raises:
            Exception: the endpoint answered with a non-200 status (the
            response text is the message), or the request itself failed.
        """
        try:
            response = requests.get(
                self.__get_url([index, "_search"]),
                json=query,
                headers=self.__get_headers,
                auth=self.__get_auth,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                raise Exception(response.text)
            return response.json()
        except Exception as e:
            self.logger.error("Error while querying index", extra={"exception": str(e)})
            raise

    def _notify(self, index: str, document: dict, doc_id: str):
        """
        Store ``document`` under ``doc_id`` in ``index``.

        Returns:
            The decoded JSON response body.

        Raises:
            Exception: the endpoint answered with a status other than 200/201,
            or the request itself failed.
        """
        try:
            res = self.__create_doc(index, doc_id, document)
            if res.status_code not in [200, 201]:
                raise Exception(
                    f"Failed to notify. Status: {res.status_code}, Response: {res.text}"
                )
            self.logger.info("Notification document sent to OpenSearch successfully.")
            return res.json()
        except Exception as e:
            self.logger.error(
                "Error while sending notification to OpenSearch",
                extra={"exception": str(e)},
            )
            raise
alias="Connect to the openshift", ) ] def __init__(self, context_manager, provider_id: str, config: ProviderConfig): super().__init__(context_manager, provider_id, config) self.authentication_config = None self._k8s_client = None self.validate_config() def dispose(self): """Dispose the provider.""" if self._k8s_client: self._k8s_client.api_client.rest_client.pool_manager.clear() def validate_config(self): """ Validates required configuration for Openshift provider. """ if self.config.authentication is None: self.config.authentication = {} self.authentication_config = OpenshiftProviderAuthConfig( **self.config.authentication ) def __get_ocp_client(self): """Get the Openshift client.""" oc_context = Context() oc_context.api_server = self.authentication_config.api_server oc_context.token = self.authentication_config.token oc_context.insecure = self.authentication_config.insecure return oc_context def __get_k8s_client(self): """Get the Kubernetes client for OpenShift API access.""" if self._k8s_client is None: client_configuration = client.Configuration() client_configuration.host = self.authentication_config.api_server client_configuration.verify_ssl = not self.authentication_config.insecure client_configuration.api_key = { "authorization": "Bearer " + self.authentication_config.token } self._k8s_client = client.ApiClient(client_configuration) return self._k8s_client def __test_connection_via_rest_api(self): """ Test connection to OpenShift using REST API instead of CLI. This is more reliable as it doesn't depend on oc CLI being installed. 
""" try: # Suppress SSL warnings if insecure is True if self.authentication_config.insecure: # Suppress SSL verification warnings warnings.filterwarnings('ignore', message='Unverified HTTPS request') # Test API connectivity by hitting the /version endpoint headers = { 'Authorization': f'Bearer {self.authentication_config.token}', 'Accept': 'application/json' } verify_ssl = not self.authentication_config.insecure # Try to get cluster version info response = requests.get( f"{self.authentication_config.api_server}/version", headers=headers, verify=verify_ssl, timeout=30 ) if response.status_code == 200: self.logger.info("Successfully connected to OpenShift cluster via REST API") return True, None else: error_msg = f"API returned status code {response.status_code}: {response.text}" self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") return False, error_msg except requests.exceptions.RequestException as e: error_msg = f"Connection error: {str(e)}" self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") return False, error_msg except Exception as e: error_msg = f"Unexpected error: {str(e)}" self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") return False, error_msg def validate_scopes(self): """ Validates that the provided token has the required scopes to use the provider. Uses REST API validation instead of CLI commands for better reliability. 
""" self.logger.info("Validating scopes for OpenShift provider") try: # Try REST API approach first success, error_msg = self.__test_connection_via_rest_api() if success: self.logger.info("Successfully validated OpenShift connection") scopes = { "connect_to_openshift": True, } else: self.logger.error(f"OpenShift validation failed: {error_msg}") scopes = { "connect_to_openshift": error_msg, } except Exception as e: self.logger.exception("Error validating scopes for OpenShift provider") scopes = { "connect_to_openshift": str(e), } return scopes def _query(self, command_type: str, **kwargs): """ Query OpenShift resources. Args: command_type (str): The type of query to perform. Supported queries are: - get_logs: Get logs from a pod - get_events: Get events for a namespace or pod - get_pods: List pods in a namespace or across all namespaces - get_node_pressure: Get node pressure conditions - get_pvc: List persistent volume claims - get_routes: List OpenShift routes - get_deploymentconfigs: List OpenShift deployment configs - get_projects: List OpenShift projects **kwargs: Additional arguments for the query. """ k8s_client = self.__get_k8s_client() if command_type == "get_logs": return self.__get_logs(k8s_client, **kwargs) elif command_type == "get_events": return self.__get_events(k8s_client, **kwargs) elif command_type == "get_pods": return self.__get_pods(k8s_client, **kwargs) elif command_type == "get_node_pressure": return self.__get_node_pressure(k8s_client, **kwargs) elif command_type == "get_pvc": return self.__get_pvc(k8s_client, **kwargs) elif command_type == "get_routes": return self.__get_routes(**kwargs) elif command_type == "get_deploymentconfigs": return self.__get_deploymentconfigs(**kwargs) elif command_type == "get_projects": return self.__get_projects(**kwargs) else: raise NotImplementedError(f"Command type {command_type} is not implemented") def _notify(self, action: str, **kwargs): """ Perform actions on OpenShift resources. 
Args: action (str): The action to perform. Supported actions are: - rollout_restart: Restart a deployment, statefulset, or daemonset - restart_pod: Restart a pod by deleting it - scale_deployment: Scale a deployment to specified replicas - scale_deploymentconfig: Scale a deployment config to specified replicas **kwargs: Additional arguments for the action. """ if action == "rollout_restart": return self.__rollout_restart(**kwargs) elif action == "restart_pod": return self.__restart_pod(**kwargs) elif action == "scale_deployment": return self.__scale_deployment(**kwargs) elif action == "scale_deploymentconfig": return self.__scale_deploymentconfig(**kwargs) else: raise NotImplementedError(f"Action {action} is not implemented") def __get_logs(self, k8s_client, namespace, pod_name, container_name=None, tail_lines=100, **kwargs): """Get logs from a pod.""" self.logger.info(f"Getting logs for pod {pod_name} in namespace {namespace}") core_v1 = client.CoreV1Api(k8s_client) try: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, pretty=True, ) return logs.splitlines() except UnicodeEncodeError: logs = core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, container=container_name, tail_lines=tail_lines, ) return logs.splitlines() except ApiException as e: self.logger.error(f"Error getting logs for pod {pod_name}: {e}") raise Exception(f"Error getting logs for pod {pod_name}: {e}") def __get_events(self, k8s_client, namespace, pod_name=None, sort_by=None, **kwargs): """Get events for a namespace or specific pod.""" self.logger.info( f"Getting events in namespace {namespace}" + (f" for pod {pod_name}" if pod_name else ""), ) core_v1 = client.CoreV1Api(k8s_client) try: if pod_name: # Get the pod to find its UID pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace) field_selector = 
f"involvedObject.kind=Pod,involvedObject.name={pod_name},involvedObject.uid={pod.metadata.uid}" else: field_selector = f"metadata.namespace={namespace}" events = core_v1.list_namespaced_event( namespace=namespace, field_selector=field_selector, ) if sort_by: self.logger.info(f"Sorting events by {sort_by}") try: sorted_events = sorted( events.items, key=lambda event: getattr(event, sort_by, None), reverse=True, ) return sorted_events except Exception: self.logger.exception(f"Error sorting events by {sort_by}") # Convert events to dict return [event.to_dict() for event in events.items] except ApiException as e: self.logger.exception("Error getting events") raise Exception(f"Error getting events: {e}") from e def __get_pods(self, k8s_client, namespace=None, label_selector=None, **kwargs): """List pods in a namespace or across all namespaces.""" core_v1 = client.CoreV1Api(k8s_client) try: if namespace: self.logger.info(f"Listing pods in namespace {namespace}") pods = core_v1.list_namespaced_pod( namespace=namespace, label_selector=label_selector ) else: self.logger.info("Listing pods across all namespaces") pods = core_v1.list_pod_for_all_namespaces( label_selector=label_selector ) return [pod.to_dict() for pod in pods.items] except ApiException as e: self.logger.error(f"Error listing pods: {e}") raise Exception(f"Error listing pods: {e}") def __get_node_pressure(self, k8s_client, **kwargs): """Get node pressure conditions (Memory, Disk, PID).""" self.logger.info("Getting node pressure conditions") core_v1 = client.CoreV1Api(k8s_client) try: nodes = core_v1.list_node(watch=False) node_pressures = [] for node in nodes.items: pressures = { "name": node.metadata.name, "conditions": [], } for condition in node.status.conditions: if condition.type in [ "MemoryPressure", "DiskPressure", "PIDPressure", ]: pressures["conditions"].append(condition.to_dict()) node_pressures.append(pressures) return node_pressures except ApiException as e: self.logger.error(f"Error getting node 
def __get_routes(self, namespace=None, **kwargs):
    """
    List OpenShift routes through the REST API.

    Args:
        namespace (str, optional): When given, only routes of this namespace
            are requested; otherwise the cluster-wide endpoint is used.
        **kwargs: Accepted and ignored.

    Returns:
        list: The ``items`` entry of the JSON response (``[]`` when absent).

    Raises:
        Exception: Wraps any error raised while building or sending the request.
    """
    self.logger.info("Getting OpenShift routes")
    try:
        # Use REST API to get routes
        auth = self.authentication_config
        request_headers = {
            'Authorization': f'Bearer {auth.token}',
            'Accept': 'application/json'
        }
        verify_ssl = not auth.insecure
        url = (
            f"{auth.api_server}/apis/route.openshift.io/v1/namespaces/{namespace}/routes"
            if namespace
            else f"{auth.api_server}/apis/route.openshift.io/v1/routes"
        )
        response = requests.get(url, headers=request_headers, verify=verify_ssl, timeout=30)
        response.raise_for_status()
        return response.json().get('items', [])
    except Exception as e:
        self.logger.error(f"Error getting routes: {e}")
        raise Exception(f"Error getting routes: {e}")
def __rollout_restart(self, kind, name, namespace, labels=None, **kwargs):
    """
    Trigger a rollout restart of a workload.

    Deployments, StatefulSets and DaemonSets are restarted by patching the
    ``kubectl.kubernetes.io/restartedAt`` pod-template annotation;
    DeploymentConfigs are delegated to the OpenShift REST API helper.

    Args:
        kind (str): Workload kind (case-insensitive): ``deployment``,
            ``statefulset``, ``daemonset`` or ``deploymentconfig``.
        name (str): Name of the workload to patch.
        namespace (str): Namespace of the workload.
        labels (str, optional): Label selector. When given, at least one
            workload of that kind must match it, otherwise the restart is
            refused. The patch itself is still addressed by ``name``.
        **kwargs: Accepted and ignored.

    Returns:
        dict: ``{"status": "success", "message": ...}``.

    Raises:
        ValueError: Unsupported ``kind`` or no workload matches ``labels``.
        Exception: The Kubernetes API call raised ``ApiException``.
    """
    self.logger.info(f"Performing rollout restart for {kind} {name} in namespace {namespace}")
    k8s_client = self.__get_k8s_client()

    # RFC 3339 UTC timestamp. isoformat() of an aware datetime already carries
    # "+00:00", so appending "Z" to it produced a malformed "...+00:00Z" value.
    restarted_at = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
    body = {
        "spec": {
            "template": {
                "metadata": {
                    "annotations": {"kubectl.kubernetes.io/restartedAt": restarted_at}
                }
            }
        }
    }

    apps_v1 = client.AppsV1Api(k8s_client)
    # kind -> (name used in error messages, list call, patch call)
    workloads = {
        "deployment": (
            "Deployment",
            apps_v1.list_namespaced_deployment,
            apps_v1.patch_namespaced_deployment,
        ),
        "statefulset": (
            "StatefulSet",
            apps_v1.list_namespaced_stateful_set,
            apps_v1.patch_namespaced_stateful_set,
        ),
        "daemonset": (
            "DaemonSet",
            apps_v1.list_namespaced_daemon_set,
            apps_v1.patch_namespaced_daemon_set,
        ),
    }
    normalized_kind = kind.lower()

    try:
        if normalized_kind == "deploymentconfig":
            # Handle OpenShift DeploymentConfig using REST API
            return self.__rollout_restart_deploymentconfig(name, namespace)
        if normalized_kind not in workloads:
            raise ValueError(f"Unsupported kind {kind} to perform rollout restart")

        display_name, list_workloads, patch_workload = workloads[normalized_kind]
        if labels:
            matches = list_workloads(namespace=namespace, label_selector=labels)
            if not matches.items:
                raise ValueError(
                    f"{display_name} with labels {labels} not found in namespace {namespace}"
                )
        patch_workload(name=name, namespace=namespace, body=body)
    except ApiException as e:
        self.logger.error(f"Error performing rollout restart for {kind} {name}: {e}")
        raise Exception(
            f"Error performing rollout restart for {kind} {name}: {e}"
        ) from e

    self.logger.info(f"Successfully performed rollout restart for {kind} {name}")
    return {
        "status": "success",
        "message": f"Successfully performed rollout restart for {kind} {name}",
    }
def __restart_pod(self, namespace, pod_name, container_name=None, message=None, **kwargs):
    """
    Restart a pod by deleting it.

    The pod is read first (so a missing pod fails before any deletion) and
    then deleted with default delete options.

    Args:
        namespace (str): Namespace of the pod.
        pod_name (str): Name of the pod to delete.
        container_name (str, optional): Accepted but not used here.
        message (str, optional): Custom success message; a default one is
            generated when it is falsy.
        **kwargs: Accepted and ignored.

    Returns:
        dict: Status, message and a ``pod_details`` snapshot taken from the
        pod object read before deletion.

    Raises:
        Exception: If reading or deleting the pod raises ``ApiException``.
    """
    pod_api = client.CoreV1Api(self.__get_k8s_client())
    self.logger.info(f"Restarting pod {pod_name} in namespace {namespace}")

    try:
        # Check if the pod exists
        existing_pod = pod_api.read_namespaced_pod(name=pod_name, namespace=namespace)

        # Pods owned by a controller get recreated; standalone pods are simply removed.
        pod_api.delete_namespaced_pod(
            name=pod_name, namespace=namespace, body=client.V1DeleteOptions()
        )

        response_message = message or f"Pod {pod_name} in namespace {namespace} was restarted"
        self.logger.info(response_message)

        pod_details = {
            "name": existing_pod.metadata.name,
            "namespace": existing_pod.metadata.namespace,
            "status": existing_pod.status.phase,
            "containers": [c.name for c in existing_pod.spec.containers],
        }
        return {
            "status": "success",
            "message": response_message,
            "pod_details": pod_details,
        }
    except ApiException as e:
        error_message = f"Error restarting pod {pod_name}: {e}"
        self.logger.error(error_message)
        raise Exception(error_message)
def __scale_deploymentconfig(self, namespace, deploymentconfig_name, replicas, **kwargs):
    """
    Scale an OpenShift DeploymentConfig through its ``/scale`` REST endpoint.

    Args:
        namespace (str): Namespace of the DeploymentConfig.
        deploymentconfig_name (str): Name of the DeploymentConfig.
        replicas: Desired replica count, sent as ``spec.replicas``.
        **kwargs: Accepted and ignored.

    Returns:
        dict: ``{"status": "success", "message": ...}``.

    Raises:
        Exception: Wraps any error raised while building or sending the request.
    """
    try:
        auth = self.authentication_config
        request_headers = {
            'Authorization': f'Bearer {auth.token}',
            'Content-Type': 'application/strategic-merge-patch+json'
        }
        verify_ssl = not auth.insecure
        scale_url = f"{auth.api_server}/apis/apps.openshift.io/v1/namespaces/{namespace}/deploymentconfigs/{deploymentconfig_name}/scale"
        patch_body = {"spec": {"replicas": replicas}}

        response = requests.patch(
            scale_url,
            headers=request_headers,
            json=patch_body,
            verify=verify_ssl,
            timeout=30,
        )
        response.raise_for_status()

        success_message = f"Successfully scaled DeploymentConfig {deploymentconfig_name} to {replicas} replicas"
        self.logger.info(success_message)
        return {"status": "success", "message": success_message}
    except Exception as e:
        self.logger.error(f"Error scaling DeploymentConfig {deploymentconfig_name}: {e}")
        raise Exception(f"Error scaling DeploymentConfig {deploymentconfig_name}: {e}")
@pydantic.dataclasses.dataclass
class OpsgenieProviderAuthConfig:
    """
    Authentication settings of the OpsGenie provider.

    Both fields are marked ``required`` in their metadata; ``api_key`` is
    additionally marked ``sensitive``.
    """

    # API key of the OpsGenie integration; sent as the credential on every request.
    api_key: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "OpsGenie api key",
            "hint": "https://support.atlassian.com/opsgenie/docs/create-a-default-api-integration/",
            "sensitive": True,
        },
    )

    # Integration Name is only used for validating scopes
    integration_name: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "OpsGenie integration name",
            "hint": "https://support.atlassian.com/opsgenie/docs/create-a-default-api-integration/",
        },
    )
def validate_scopes(self):
    """
    Check whether the configured OpsGenie integration may create alerts.

    The integration list is fetched, the entry whose name equals the
    configured ``integration_name`` is looked up, and its details are read
    to inspect ``allowWriteAccess``.

    Returns:
        dict: ``{"opsgenie:create": True}`` when write access is allowed,
        otherwise ``{"opsgenie:create": "<reason>"}``. Errors are reported
        through the returned reason, not raised.
    """
    scopes = {}
    self.logger.info("Validating scopes")
    try:
        api_key = "GenieKey " + self.authentication_config.api_key
        url = "https://api.opsgenie.com/v2/"
        auth_headers = {"Authorization": api_key}

        # Get the list of integrations. The timeout keeps an unresponsive API
        # from blocking validation forever; it surfaces as a scope error below.
        response = requests.get(
            url + "integrations/",
            headers=auth_headers,
            timeout=30,
        )
        response.raise_for_status()

        # Find the OpsGenie integration
        for integration in response.json()["data"]:
            if integration["name"] == self.authentication_config.integration_name:
                api_key_id = integration["id"]
                break
        else:
            # Loop finished without a match.
            self.logger.error("Failed to find OpsGenie integration")
            return {
                "opsgenie:create": f"Failed to find Integration name {self.authentication_config.integration_name}"
            }

        # Get the integration details and check if it has write access
        response = requests.get(
            url + "integrations/" + api_key_id,
            headers=auth_headers,
            timeout=30,
        )
        response.raise_for_status()

        if response.json()["data"]["allowWriteAccess"]:
            scopes["opsgenie:create"] = True
        else:
            scopes["opsgenie:create"] = (
                "OpsGenie integration does not have write access"
            )
    except Exception as e:
        # The previous message ("Failed to create OpsGenie alert") described a
        # different operation and made scope failures hard to find in logs.
        self.logger.exception("Failed to validate OpsGenie scopes")
        scopes["opsgenie:create"] = str(e)
    return scopes
# https://github.com/opsgenie/opsgenie-python-sdk/blob/master/docs/CloseAlertPayload.md
def close_alert(
    self,
    alert_id: str,
):
    """
    Close an OpsGenie alert.

    Args:
        alert_id (str): Identifier of the alert to close.

    Raises:
        ApiException: Re-raised after logging when the SDK call fails.
    """
    log_context = {"alert_id": alert_id}
    self.logger.info("Closing Opsgenie alert", extra=log_context)

    alert_api = opsgenie_sdk.AlertApi(opsgenie_sdk.ApiClient(self.configuration))
    empty_payload = opsgenie_sdk.CloseAlertPayload()
    try:
        alert_api.close_alert(alert_id, close_alert_payload=empty_payload)
    except ApiException:
        self.logger.exception("Failed to close OpsGenie alert")
        raise
    else:
        self.logger.info("Opsgenie Alert Closed", extra=log_context)
def _notify(
    self,
    user: str | None = None,
    note: str | None = None,
    source: str | None = None,
    message: str | None = None,
    alias: str | None = None,
    description: str | None = None,
    responders: typing.List[OpsGenieRecipient] | None = None,
    visible_to: typing.List[OpsGenieRecipient] | None = None,
    actions: typing.List[str] | None = None,
    tags: typing.List[str] | None = None,
    details: typing.Dict[str, str] | None = None,
    entity: str | None = None,
    priority: str | None = None,
    **kwargs: dict,
):
    """
    Create an OpsGenie alert, or close one when ``type="close_alert"``.

    Args:
        user (str, optional): Display name of the request owner
        note (str, optional): Additional note that will be added while creating the alert
        source (str, optional): Source field of the alert. Default value is IP address of the incoming request
        message (str): Message of the alert
        alias (str, optional): Client-defined identifier of the alert, that is also the key element of alert deduplication
        description (str, optional): Description field of the alert that is generally used to provide a detailed information
        responders (List[Recipient], optional): Responders that the alert will be routed to send notifications
        visible_to (List[Recipient], optional): Teams and users that the alert will become visible to without sending any notification
        actions (List[str], optional): Custom actions that will be available for the alert
        tags (List[str], optional): Tags of the alert
        details (Dict[str, str], optional): Map of key-value pairs to use as custom properties of the alert
        entity (str, optional): Entity field of the alert that is generally used to specify which domain alert is related to
        priority (str, optional): Priority level of the alert
        **kwargs: Additional arguments:
            type (str): Type of the request, e.g. create_alert, close_alert.
                Anything other than ``close_alert`` creates an alert.
            alert_id (str): Alert to close; required for ``close_alert``.

    Returns:
        The result of ``close_alert`` or ``_create_alert``; ``None`` when a
        close request is missing ``alert_id`` (an error is logged).
    """
    request_type = kwargs.pop("type", None)

    if request_type == "close_alert":
        # Close an existing alert
        alert_id = kwargs.get("alert_id")
        if not alert_id:
            self.logger.error("alert_id is required to close an alert")
            return
        self.logger.info("Closing Opsgenie alert", extra={"alert_id": alert_id})
        return self.close_alert(
            alert_id=alert_id,
        )

    # default, backward compatibility behavior
    # _create_alert takes no **kwargs: forwarding the extras (e.g. the
    # documented type="create_alert") raised TypeError before any alert was sent.
    if kwargs:
        self.logger.debug(
            "Ignoring unsupported arguments for OpsGenie alert creation",
            extra={"ignored_arguments": sorted(kwargs)},
        )
    return self._create_alert(
        user=user,
        note=note,
        source=source,
        message=message,
        alias=alias,
        description=description,
        responders=responders,
        visible_to=visible_to,
        actions=actions,
        tags=tags,
        details=details,
        entity=entity,
        priority=priority,
    )
handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os opsgenie_api_key = os.environ.get("OPSGENIE_API_KEY") assert opsgenie_api_key # Initalize the provider and provider config config = ProviderConfig( description="OpsGenie Provider", authentication={"api_key": opsgenie_api_key}, ) provider = OpsgenieProvider( context_manager, provider_id="opsgenie-test", config=config ) # provider.notify( # message="Simple alert showing context with name: John Doe", # note="Simple alert", # user="John Doe", # ) provider.query(type="alerts", query="status: open") ================================================ FILE: keep/providers/pagerduty_provider/__init__.py ================================================ ================================================ FILE: keep/providers/pagerduty_provider/pagerduty_provider.py ================================================ import dataclasses import datetime import hashlib import json import logging import os import time import typing import uuid import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.api.models.db.incident import IncidentSeverity, IncidentStatus from keep.api.models.db.topology import TopologyServiceInDto from keep.api.models.incident import IncidentDto from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_config_exception import ProviderConfigException from keep.providers.base.base_provider import ( BaseIncidentProvider, BaseProvider, BaseTopologyProvider, ProviderHealthMixin, ) from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory # Todo: think about splitting in to PagerdutyIncidentsProvider and PagerdutyAlertsProvider # Read this: https://community.pagerduty.com/forum/t/create-incident-using-python/3596/3 logger = 
@pydantic.dataclasses.dataclass
class PagerdutyProviderAuthConfig:
    """
    Authentication settings of the PagerDuty provider.

    Every field is optional on its own; the provider's ``validate_config``
    requires at least one of ``routing_key``, ``api_key`` or ``oauth_data``.
    """

    # Events API key (integration or ruleset key).
    routing_key: str | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "Routing Key (an integration or ruleset key)",
        },
        default=None,
    )
    # REST API key of a user or team; marked sensitive.
    api_key: str | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "Api Key (a user or team API key)",
            "sensitive": True,
        },
        default=None,
    )
    # Token bundle produced by the OAuth flow (hidden and sensitive).
    # NOTE(review): annotated as dict but the default is an empty string; the
    # provider only relies on it being falsy when unset — confirm before
    # changing the default.
    oauth_data: dict = dataclasses.field(
        metadata={
            "description": "For oauth flow",
            "required": False,
            "sensitive": True,
            "hidden": True,
        },
        default="",
    )
    # Optional service the provider is restricted to.
    service_id: str | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "Service Id (if provided, keep will only operate on this service)",
            "sensitive": False,
        },
        default=None,
    )
IncidentSeverity.CRITICAL, "P2": IncidentSeverity.HIGH, "P3": IncidentSeverity.WARNING, "P4": IncidentSeverity.INFO, } PRIORITY_TO_ALERT_SEVERITY = { "P1": AlertSeverity.CRITICAL, "P2": AlertSeverity.HIGH, "P3": AlertSeverity.WARNING, "P4": AlertSeverity.INFO, } ALERT_STATUS_MAP = { "triggered": AlertStatus.FIRING, "resolved": AlertStatus.RESOLVED, } ALERT_STATUS_TO_EVENT_TYPE_MAP = { AlertStatus.FIRING.value: "trigger", AlertStatus.RESOLVED.value: "resolve", AlertStatus.ACKNOWLEDGED.value: "acknowledge", } INCIDENT_STATUS_MAP = { "triggered": IncidentStatus.FIRING, "acknowledged": IncidentStatus.ACKNOWLEDGED, "resolved": IncidentStatus.RESOLVED, } BASE_OAUTH_URL = "https://identity.pagerduty.com" PAGERDUTY_CLIENT_ID = os.environ.get("PAGERDUTY_CLIENT_ID") PAGERDUTY_CLIENT_SECRET = os.environ.get("PAGERDUTY_CLIENT_SECRET") OAUTH2_URL = ( f"{BASE_OAUTH_URL}/oauth/authorize?client_id={PAGERDUTY_CLIENT_ID}&response_type=code" if PAGERDUTY_CLIENT_ID is not None and PAGERDUTY_CLIENT_SECRET is not None else None ) PROVIDER_CATEGORY = ["Incident Management"] FINGERPRINT_FIELDS = ["alert_key"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) if self.authentication_config.oauth_data: last_fetched_at = self.authentication_config.oauth_data["last_fetched_at"] expires_in: float | None = self.authentication_config.oauth_data.get( "expires_in", None ) if expires_in: # Calculate expiration time by adding expires_in to last_fetched_at expiration_time = last_fetched_at + expires_in - 600 # Check if the current epoch time (in seconds) has passed the expiration time if time.time() <= expiration_time: self.logger.debug("access_token is still valid") return self.logger.info("Refreshing access token") self.__refresh_token() elif ( self.authentication_config.api_key or self.authentication_config.routing_key ): # No need to do anything return else: raise Exception("WTF Exception: No 
def __refresh_token(self):
    """
    Refresh the OAuth access token using the stored refresh token.

    The new token bundle is written to ``self.config.authentication`` and to
    ``self.authentication_config`` so that requests made by this instance
    use the refreshed access token.

    Raises:
        Exception: Any error from the token request or a malformed response
            is logged and re-raised.
    """
    # Using the refresh token to get the access token
    try:
        access_token_response = requests.post(
            url=f"{PagerdutyProvider.BASE_OAUTH_URL}/oauth/token",
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            data={
                "grant_type": "refresh_token",
                "client_id": PagerdutyProvider.PAGERDUTY_CLIENT_ID,
                "client_secret": PagerdutyProvider.PAGERDUTY_CLIENT_SECRET,
                "refresh_token": f'{self.authentication_config.oauth_data["refresh_token"]}',
            },
            # Refresh runs during provider construction; do not hang forever.
            timeout=30,
        )
        access_token_response.raise_for_status()
        token_data = access_token_response.json()
        refreshed_oauth_data = {
            "access_token": token_data["access_token"],
            "refresh_token": token_data["refresh_token"],
            "expires_in": token_data["expires_in"],
            "last_fetched_at": time.time(),
        }
        self.config.authentication["oauth_data"] = refreshed_oauth_data
        # authentication_config was built from the config before the refresh;
        # without this, request headers kept carrying the expired access token.
        self.authentication_config.oauth_data = refreshed_oauth_data
    except Exception:
        self.logger.exception(
            "Error while refreshing token",
        )
        raise
Raises: Exception: No code verifier Exception: No code Exception: No redirect URI Exception: Failed to get access token Exception: No access token Returns: dict: access token and refresh token """ code_verifier = payload.get("verifier") if not code_verifier: raise Exception("No code verifier") code = payload.get("code") if not code: raise Exception("No code") redirect_uri = payload.get("redirect_uri") if not redirect_uri: raise Exception("Missing redirect URI") access_token_params = { "client_id": PagerdutyProvider.PAGERDUTY_CLIENT_ID, "client_secret": PagerdutyProvider.PAGERDUTY_CLIENT_SECRET, "code_verifier": code_verifier, "code": code, "redirect_uri": redirect_uri, "grant_type": "authorization_code", } access_token_response = requests.post( url=f"{PagerdutyProvider.BASE_OAUTH_URL}/oauth/token", data=access_token_params, headers={"Content-Type": "application/x-www-form-urlencoded"}, ) try: access_token_response.raise_for_status() access_token_response = access_token_response.json() except Exception: response_text = access_token_response.text response_status = access_token_response.status_code logger.exception( "Failed to get access token", extra={ "response_text": response_text, "response_status": response_status, }, ) raise access_token = access_token_response.get("access_token") if not access_token: raise Exception("No access token provided") return { "oauth_data": { "access_token": access_token_response["access_token"], "refresh_token": access_token_response["refresh_token"], "last_fetched_at": time.time(), "expires_in": access_token_response.get("expires_in", None), } } def __get_headers(self, **kwargs): if self.authentication_config.api_key or self.authentication_config.routing_key: return { "Accept": "application/vnd.pagerduty+json;version=2", "Content-Type": "application/json", "Authorization": f"Token token={self.authentication_config.api_key}", **kwargs, } elif self.authentication_config.oauth_data: return { "Accept": 
"application/vnd.pagerduty+json;version=2", "Authorization": f"Bearer {self.authentication_config.oauth_data['access_token']}", "Content-Type": "application/json", } def validate_scopes(self): """ Validate that the provider has the required scopes. """ headers = self.__get_headers() scopes = {} for scope in self.PROVIDER_SCOPES: # If the provider is installed using a routing key, we skip scopes validation for now. if self.authentication_config.routing_key: if scope.name == "incidents_read": # This is because incidents_read is mandatory and will not let the provider install otherwise scopes[scope.name] = True else: scopes[scope.name] = "Skipped due to routing key" continue try: # Todo: how to check validity for write scopes? if scope.name.startswith("incidents"): response = requests.get( f"{self.BASE_API_URL}/incidents", headers=headers, ) elif scope.name.startswith("webhook_subscriptions"): response = requests.get( self.SUBSCRIPTION_API_URL, headers=headers, ) if response.ok: scopes[scope.name] = True else: try: response_json = response.json() scopes[scope.name] = str( response_json.get("error", response.reason) ) except Exception: scopes[scope.name] = response.reason except Exception as e: self.logger.exception("Error validating scopes") scopes[scope.name] = str(e) return scopes def _build_alert( self, title: str, routing_key: str, dedup: str | None = None, severity: typing.Literal["critical", "error", "warning", "info"] | None = None, event_type: typing.Literal["trigger", "acknowledge", "resolve"] | None = None, source: str | None = None, **kwargs, ) -> typing.Dict[str, typing.Any]: """ Builds the payload for an event alert. Args: title: Title of alert alert_body: UTF-8 string of custom message for alert. 
Shown in incident body dedup: Any string, max 255, characters used to deduplicate alerts event_type: The type of event to send to PagerDuty Returns: Dictionary of alert body for JSON serialization """ if not severity: # this is the default severity severity = "critical" # try to get it automatically from the context (if there's an alert, for example) if self.context_manager.event_context: severity = self.context_manager.event_context.severity if not event_type: event_type = "trigger" # try to get it automatically from the context (if there's an alert, for example) if self.context_manager.event_context: status = self.context_manager.event_context.status event_type = PagerdutyProvider.ALERT_STATUS_TO_EVENT_TYPE_MAP.get( status, "trigger" ) if not dedup: # If no dedup is given, use epoch timestamp dedup = str(datetime.datetime.now().timestamp()) # Try to get it from the context (if there's an alert, for example) if self.context_manager.event_context: dedup = self.context_manager.event_context.fingerprint if not source: source = "custom_event" if self.context_manager.event_context: source = self.context_manager.event_context.service or "custom_event" payload = { "routing_key": routing_key, "event_action": event_type, "dedup_key": dedup, "payload": { "summary": title, "source": source, "severity": severity, }, } custom_details = kwargs.get("custom_details", {}) if isinstance(custom_details, str): custom_details = json.loads(custom_details) if not custom_details and kwargs.get("alert_body"): custom_details = {"alert_body": kwargs.get("alert_body")} if custom_details: payload["payload"]["custom_details"] = custom_details if kwargs.get("timestamp"): payload["payload"]["timestamp"] = kwargs.get("timestamp") if kwargs.get("component"): payload["payload"]["component"] = kwargs.get("component") if kwargs.get("group"): payload["payload"]["group"] = kwargs.get("group") if kwargs.get("class"): payload["payload"]["class"] = kwargs.get("class") if kwargs.get("images"): images = 
def _trigger_incident(
    self,
    service_id: str,
    title: str,
    body: dict | str,
    requester: str,
    incident_key: str | None = None,
    priority: str = "",
    status: typing.Literal["resolved", "acknowledged"] = "",
    resolution: str = "",
):
    """
    Create a PagerDuty incident, or update an existing one, via the REST API v2.

    Args:
        service_id: ID of the PagerDuty service the incident belongs to.
        title: Incident title.
        body: Incident body as a dict or a JSON string. ``None`` or a blank
            string means "no body" and the field is omitted from the request.
        requester: Email sent in the ``From`` header.
        incident_key: When given, the incident at ``/incidents/{incident_key}``
            is updated (PUT); otherwise a new key is generated and the
            incident is created (POST).
        priority: Priority reference ID (optional).
        status: New status for the incident (optional).
        resolution: Resolution note, only sent together with ``status="resolved"``.

    Returns:
        The decoded JSON response from PagerDuty.

    Raises:
        Exception: with the response text as message when the request fails.
    """
    update = bool(incident_key)
    if not update:
        incident_key = str(uuid.uuid4()).replace("-", "")

    url = f"{self.BASE_API_URL}/incidents"
    if update:
        url = f"{url}/{incident_key}"
    headers = self.__get_headers(From=requester)

    if isinstance(body, str):
        # A blank string carries no body; anything else must be valid JSON
        body = json.loads(body) if body.strip() else None
    elif isinstance(body, dict):
        # Work on a copy so the caller's dict is not modified below
        body = dict(body)
    if isinstance(body, dict) and "details" in body and "type" not in body:
        body["type"] = "incident_body"

    incident = {
        "type": "incident",
        "title": title,
        "service": {"id": service_id, "type": "service_reference"},
        "incident_key": incident_key,
    }
    if body is not None:
        incident["body"] = body
    if status:
        incident["status"] = status
        if status == "resolved" and resolution:
            incident["resolution"] = resolution
    if priority:
        incident["priority"] = {
            "id": priority,
            "type": "priority_reference",
        }
    payload = {"incident": incident}

    send = requests.put if update else requests.post
    r = send(url, headers=headers, data=json.dumps(payload))
    try:
        r.raise_for_status()
        response = r.json()
        self.logger.info(
            "Incident triggered",
            extra={
                "update": update,
                "incident_key": incident_key,
                "tenant_id": self.context_manager.tenant_id,
            },
        )
        return response
    except Exception as e:
        self.logger.error(
            "Failed to trigger incident",
            extra={
                "response_text": r.text,
                "update": update,
                "incident_key": incident_key,
                "tenant_id": self.context_manager.tenant_id,
            },
        )
        # This will give us a better error message in Keep workflows
        raise Exception(r.text) from e
""" self.logger.info( "Cleaning up %s provider with id %s", self.PROVIDER_DISPLAY_NAME, self.provider_id, ) keep_webhook_incidents_api_url = f"{self.context_manager.api_url}/incidents/event/{self.provider_type}?provider_id={self.provider_id}" headers = self.__get_headers() request = requests.get(self.SUBSCRIPTION_API_URL, headers=headers) if not request.ok: raise Exception("Could not get existing webhooks") existing_webhooks = request.json().get("webhook_subscriptions", []) webhook_exists = next( iter( [ webhook for webhook in existing_webhooks if keep_webhook_incidents_api_url == webhook.get("delivery_method", {}).get("url", "") ] ), False, ) if webhook_exists: self.logger.info("Webhook exists, removing it") webhook_id = webhook_exists.get("id") request = requests.delete( f"{self.SUBSCRIPTION_API_URL}/{webhook_id}", headers=headers ) if not request.ok: raise Exception("Could not remove existing webhook") self.logger.info("Webhook removed", extra={"webhook_id": webhook_id}) def dispose(self): """ No need to dispose of anything, so just do nothing. 
def setup_incident_webhook(
    self,
    tenant_id: str,
    keep_api_url: str,
    api_key: str,
    setup_alerts: bool = True,
):
    """
    Register Keep as a PagerDuty webhook subscription for incident events.

    An existing subscription that already delivers to ``keep_api_url`` is
    removed and re-created. Nothing is done when the provider was installed
    with a routing key.

    Args:
        tenant_id: Not used by this implementation.
        keep_api_url: URL PagerDuty should deliver incident events to.
        api_key: Sent to Keep in the ``X-API-KEY`` custom header.
        setup_alerts: Not used by this implementation.

    Raises:
        Exception: when listing, removing or creating the subscription fails.
    """
    self.logger.info("Setting up Pagerduty webhook")

    if self.authentication_config.routing_key:
        self.logger.info("Skipping webhook setup due to routing key")
        return

    headers = self.__get_headers()
    request = requests.get(self.SUBSCRIPTION_API_URL, headers=headers)
    if not request.ok:
        raise Exception("Could not get existing webhooks")
    existing_webhooks = request.json().get("webhook_subscriptions", [])
    webhook_exists = next(
        (
            webhook
            for webhook in existing_webhooks
            if keep_api_url == webhook.get("delivery_method", {}).get("url", "")
        ),
        None,
    )

    # Subscribe to a single service when one is configured, otherwise account-wide
    service_id = self.authentication_config.service_id
    if service_id:
        subscription_filter = {"type": "service_reference", "id": service_id}
    else:
        subscription_filter = {"type": "account_reference"}

    webhook_payload = {
        "webhook_subscription": {
            "type": "webhook_subscription",
            "delivery_method": {
                "type": "http_delivery_method",
                "url": keep_api_url,
                "custom_headers": [{"name": "X-API-KEY", "value": api_key}],
            },
            "description": f"Keep Pagerduty webhook ({self.provider_id}) - do not change",
            "events": [
                "incident.acknowledged",
                "incident.annotated",
                "incident.delegated",
                "incident.escalated",
                "incident.priority_updated",
                "incident.reassigned",
                "incident.reopened",
                "incident.resolved",
                "incident.responder.added",
                "incident.responder.replied",
                "incident.triggered",
                "incident.unacknowledged",
            ],
            "filter": subscription_filter,
        },
    }

    if webhook_exists:
        self.logger.info("Webhook already exists, removing and re-creating")
        webhook_id = webhook_exists.get("id")
        request = requests.delete(
            f"{self.SUBSCRIPTION_API_URL}/{webhook_id}", headers=headers
        )
        if not request.ok:
            raise Exception("Could not remove existing webhook")
        self.logger.info("Webhook removed", extra={"webhook_id": webhook_id})

    self.logger.info("Creating Pagerduty webhook")
    request = requests.post(
        self.SUBSCRIPTION_API_URL,
        headers=headers,
        json=webhook_payload,
    )
    if not request.ok:
        # Log the raw text: the error body is not guaranteed to be JSON, and
        # decoding it here must not hide the failure reported below.
        self.logger.error(
            "Failed to add webhook",
            extra={
                "status_code": request.status_code,
                "response_text": request.text,
            },
        )
        raise Exception("Could not create webhook")
    self.logger.info("Webhook created")
@staticmethod
def _format_alert(
    event: dict,
    provider_instance: "BaseProvider" = None,
    force_new_format: bool = False,
) -> AlertDto:
    """
    Convert a PagerDuty alert payload into an ``AlertDto``.

    Args:
        event: Raw event as received from PagerDuty.
        provider_instance: Unused here.
        force_new_format: Skip the legacy-format branch.

    Returns:
        AlertDto built from the event, or whatever ``_format_alert_old``
        returns when the legacy branch is taken.
    """
    # If somebody connected the provider before we refactored it.
    # NOTE(review): a missing "event" key defaults to {}, which is a dict, so
    # the legacy branch is taken for every event unless force_new_format is
    # set - confirm this is intended.
    old_format_event = event.get("event", {})
    if isinstance(old_format_event, dict) and not force_new_format:
        return PagerdutyProvider._format_alert_old(event)

    status = PagerdutyProvider.ALERT_STATUS_MAP.get(event.get("status", "firing"))
    severity = PagerdutyProvider.ALERT_SEVERITIES_MAP.get(
        event.get("severity", "info")
    )
    source = ["pagerduty"]
    fingerprint = event.get("alert_key", event.get("id"))

    # Explicit None-safe lookups instead of a blanket try/except: any of
    # these levels may be missing or null in the payload.
    body = event.get("body")
    cef_details = body.get("cef_details") if isinstance(body, dict) else None
    origin = (
        cef_details.get("source_origin") if isinstance(cef_details, dict) else None
    )
    if origin:
        source.append(origin)

    service = event.get("service")
    service_name = service.get("name") if isinstance(service, dict) else None

    return AlertDto(
        id=event.get("id"),
        name=event.get("summary"),
        url=event.get("html_url"),
        service=service_name,
        lastReceived=event.get("created_at"),
        status=status,
        severity=severity,
        source=source,
        original_alert=event,
        fingerprint=fingerprint,
    )
def __get_all_incidents_or_alerts(self, incident_id: str = None, limit: int = 100):
    """
    Fetch incidents, or the alerts of one incident, page by page.

    Args:
        incident_id: When given, the alerts of that incident are fetched
            instead of the incident list.
        limit: Page size sent to PagerDuty.

    Returns:
        list: the raw incident/alert dicts of all fetched pages.

    Raises:
        Exception: re-raised when the very first page cannot be fetched. A
            failure on a later page stops pagination and returns what was
            collected so far.
    """
    self.logger.info(
        "Getting incidents or alerts",
        extra={
            "incident_id": incident_id,
            "tenant_id": self.context_manager.tenant_id,
        },
    )
    paginated_response = []
    offset = 0
    # Environment values are strings; coerce so the arithmetic and the
    # comparison below operate on integers.
    try:
        max_iterations = int(os.environ.get("KEEP_PAGERDUTY_MAX_ITERATIONS", 2))
    except (TypeError, ValueError):
        self.logger.warning(
            "Invalid KEEP_PAGERDUTY_MAX_ITERATIONS value, falling back to 2"
        )
        max_iterations = 2
    current_iteration = 0
    total = True

    while True:
        try:
            url = f"{self.BASE_API_URL}/incidents"
            include = []
            resource = "incidents"
            if incident_id is not None:
                url += f"/{incident_id}/alerts"
                include = ["teams", "services"]
                resource = "alerts"
            params = {
                "include[]": include,
                "offset": offset,
                "limit": limit,
                "total": total,
                "sort_by": ["created_at:desc"],
            }
            if not incident_id and self.authentication_config.service_id:
                params["service_ids[]"] = [self.authentication_config.service_id]

            response = requests.get(
                url=url,
                headers=self.__get_headers(),
                params=params,
            )
            response.raise_for_status()
            response = response.json()
        except Exception:
            self.logger.exception(
                "Failed to get incidents or alerts",
                extra={
                    "incident_id": incident_id,
                    "tenant_id": self.context_manager.tenant_id,
                },
            )
            if paginated_response:
                # Keep what we already have rather than losing everything
                self.logger.warning(
                    "Failed to get incidents from offset",
                    extra={
                        "offset": offset,
                        "tenant_id": self.context_manager.tenant_id,
                    },
                )
                break
            else:
                self.logger.exception(
                    "Failed to get any incidents or alerts",
                    extra={"tenant_id": self.context_manager.tenant_id},
                )
                raise

        offset += limit
        paginated_response.extend(response.get(resource, []))
        extra = {"offset": offset, "tenant_id": self.context_manager.tenant_id}
        if total:
            extra["total"] = response.get("total", 0)
            extra["to_fetch"] = min([limit * max_iterations, extra["total"]])
        self.logger.info(
            "Fetched incidents or alerts",
            extra=extra,
        )
        # No more results.
        # NOTE(review): with this check one page more than max_iterations is
        # fetched, while "to_fetch" above assumes exactly max_iterations pages.
        if not response.get("more", False) or current_iteration >= max_iterations:
            self.logger.info(
                "No more incidents or alerts",
                extra={
                    "tenant_id": self.context_manager.tenant_id,
                    "current_iteration": current_iteration,
                    "max_iterations": max_iterations,
                },
            )
            break
        current_iteration += 1
        # We want total only on the first iteration
        total = False

    self.logger.info(
        "Fetched all incidents or alerts",
        extra={
            "count": len(paginated_response),
            "incident_id": incident_id,
            "tenant_id": self.context_manager.tenant_id,
        },
    )
    return paginated_response
def pull_topology(self) -> tuple[list[TopologyServiceInDto], dict]:
    """
    Build the service topology from PagerDuty service dependencies.

    Returns:
        tuple: a list with one ``TopologyServiceInDto`` per service that
        appears in a dependency relationship, and an empty dict. Both are
        empty when the provider was installed with a routing key.

    Raises:
        Exception: re-raised when the service dependencies request fails.
    """
    # Skipping topology pulling when we're installed with routing_key
    if self.authentication_config.routing_key:
        return [], {}

    all_services = self.__get_all_services()
    all_business_services = self.__get_all_services(business_services=True)
    service_metadata = {}
    for service in all_services:
        service_metadata[service["id"]] = service
    for business_service in all_business_services:
        service_metadata[business_service["id"]] = business_service

    try:
        service_map_response = requests.get(
            url=f"{self.BASE_API_URL}/service_dependencies",
            headers=self.__get_headers(),
        )
        service_map_response.raise_for_status()
        service_map_response = service_map_response.json()
    except Exception:
        self.logger.exception("Error while getting service dependencies")
        raise

    service_topology = {}

    def ensure_node(service_ref: dict) -> None:
        # Create the topology node for a referenced service once. A
        # relationship may reference a service that was not returned by the
        # listings, so every metadata lookup is tolerant.
        service_id = service_ref["id"]
        if service_id in service_topology:
            return
        meta = service_metadata.get(service_id) or {}
        team_names = [team.get("name") for team in meta.get("teams") or []]
        service_topology[service_id] = TopologyServiceInDto(
            source_provider_id=self.provider_id,
            service=service_id,
            display_name=meta.get("name") or service_ref.get("summary") or service_id,
            description=meta.get("description"),
            team=", ".join(name for name in team_names if name),
        )

    for relationship in service_map_response.get("relationships", []):
        # Extract dependent and supporting service details
        dependent = relationship["dependent_service"]
        supporting = relationship["supporting_service"]
        ensure_node(dependent)
        ensure_node(supporting)
        service_topology[dependent["id"]].dependencies[supporting["id"]] = "unknown"

    return list(service_topology.values()), {}
@staticmethod
def _format_incident(
    event: dict, provider_instance: "BaseProvider" = None
) -> IncidentDto | list[IncidentDto]:
    """
    Convert a PagerDuty incident event into an ``IncidentDto``.

    Args:
        event: Payload shaped as ``{"event": {"data": <incident>}}``.
        provider_instance: Unused here.

    Returns:
        The ``IncidentDto``, or an empty list when the incident has no id or
        no title.
    """
    event = event["event"]["data"]

    # This will be the same for the same incident
    original_incident_id = event.get("id")
    # https://github.com/keephq/keep/issues/4681
    if not original_incident_id:
        logger.warning(
            "No incident id found in the event",
            extra={
                "event": event,
            },
        )
        return []

    title = event.get("title")
    if not title:
        logger.warning(
            "No title found in the event",
            extra={
                "event": event,
            },
        )
        return []

    incident_id = PagerdutyProvider._get_incident_id(original_incident_id)

    status = PagerdutyProvider.INCIDENT_STATUS_MAP.get(
        event.get("status", "firing"), IncidentStatus.FIRING
    )
    urgency = event.get("urgency")
    priority_summary = (event.get("priority", {}) or {}).get("summary")
    if urgency is not None:
        severity = PagerdutyProvider.URGENCY_TO_INCIDENT_SEVERITY.get(
            urgency, IncidentSeverity.INFO
        )
    elif priority_summary:
        severity = PagerdutyProvider.INCIDENT_SEVERITIES_MAP.get(
            priority_summary, IncidentSeverity.INFO
        )
    else:
        severity = IncidentSeverity.INFO

    # Read, don't pop: the caller's payload is left untouched. "service" and
    # "alert_counts" may be present but null.
    service = (event.get("service") or {}).get("summary", "unknown")
    alerts_count = (event.get("alert_counts") or {}).get("all", 0)

    created_at = event.get("created_at")
    if created_at:
        # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on
        if created_at.endswith("Z"):
            created_at = created_at[:-1] + "+00:00"
        created_at = datetime.datetime.fromisoformat(created_at)
    else:
        created_at = datetime.datetime.now(tz=datetime.timezone.utc)

    return IncidentDto(
        id=incident_id,
        creation_time=created_at,
        user_generated_name=f"PD-{title}-{original_incident_id}",
        status=status,
        severity=severity,
        alert_sources=["pagerduty"],
        alerts_count=alerts_count,
        services=[service],
        is_predicted=False,
        is_candidate=False,
        # This is the reference to the incident in PagerDuty
        fingerprint=original_incident_id,
    )
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os api_key = os.environ.get("PAGERDUTY_API_KEY") provider_config = { "authentication": {"api_key": api_key}, } provider = ProvidersFactory.get_provider( context_manager=context_manager, provider_id="keep-pd", provider_type="pagerduty", provider_config=provider_config, ) incidents = provider.get_incidents() print(len(incidents)) ================================================ FILE: keep/providers/pagertree_provider/__init__.py ================================================ ================================================ FILE: keep/providers/pagertree_provider/pagertree_provider.py ================================================ """ PagetreeProvider is a class that provides a way to read get alerts from Pagetree. """ import dataclasses from typing import Literal import pydantic import requests from keep.api.models.alert import AlertDto from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class PagertreeProviderAuthConfig: api_token: str = dataclasses.field( metadata={ "required": True, "description": "Your pagertree APIToken", "sensitive": True, }, default=None, ) class PagertreeProvider(BaseProvider): """Get all alerts from pagertree""" PROVIDER_DISPLAY_NAME = "PagerTree" PROVIDER_CATEGORY = ["Incident Management"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="The user can connect to the server and is authenticated using their API_Key", mandatory=True, alias="Authenticated with pagertree", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def __get_headers(self): 
def __send_alert(
    self,
    title: str,
    description: str,
    urgency: Literal["low", "medium", "high", "critical"],
    destination_team_ids: list[str],
    destination_router_ids: list[str],
    destination_account_user_ids: list[str],
    status: Literal["queued", "open", "acknowledged", "resolved", "dropped"],
    **kwargs: dict,
):
    """
    Sends a PagerTree alert.

    Args:
        title: Title of the alert.
        description: UTF-8 string of custom message for alert. Shown in incident description
        urgency: low|medium|high|critical
        destination_team_ids: destination team_ids to send alert to
        destination_router_ids: destination router_ids to send alert to
        destination_account_user_ids: destination account_users_ids to send alert to
        status: alert status to send
        **kwargs: extra fields merged into the request body

    Raises:
        Exception: when PagerTree does not accept the alert.
    """
    response = requests.post(
        "https://api.pagertree.com/api/v4/alerts",
        headers=self.__get_headers(),
        # Sent as JSON so the destination lists arrive as arrays; form
        # encoding flattens list values into repeated scalar fields.
        json={
            "title": title,
            "description": description,
            "urgency": urgency,
            "destination_team_ids": destination_team_ids,
            "destination_router_ids": destination_router_ids,
            "destination_account_user_ids": destination_account_user_ids,
            "status": status,
            **kwargs,
        },
    )
    self.logger.info("Alert status: %s", response.status_code)
    if not response.ok:
        # The error body is not guaranteed to be JSON, so log the raw text
        self.logger.error(
            "Failed to send alert", extra={"error": response.text}
        )
        raise Exception("Could not send alert")
    self.logger.info("Alert created successfully")
def _notify(
    self,
    title: str,
    urgency: Literal["low", "medium", "high", "critical"],
    incident: bool = False,
    severities: Literal[
        "SEV-1", "SEV-2", "SEV-3", "SEV-4", "SEV-5", "SEV_UNKNOWN"
    ] = "SEV-5",
    incident_message: str = "",
    description: str = "",
    status: Literal[
        "queued", "open", "acknowledged", "resolved", "dropped"
    ] = "queued",
    destination_team_ids: list[str] = [],
    destination_router_ids: list[str] = [],
    destination_account_user_ids: list[str] = [],
    **kwargs: dict,
):
    """
    Sends an alert or incident to PagerTree

    Args:
        title: Title of the alert.
        urgency: low|medium|high|critical
        incident: True if the alert is an incident
        severities: SEV-1|SEV-2|SEV-3|SEV-4|SEV-5|SEV_UNKNOWN
        incident_message: Message to be displayed in the incident
        description: UTF-8 string of custom message for alert. Shown in incident description
        status: alert status to send
        destination_team_ids: destination team_ids to send alert to
        destination_router_ids: destination router_ids to send alert to
        destination_account_user_ids: destination account_users_ids to send alert to
        **kwargs: Additional parameters to be passed

    Raises:
        Exception: when no destination is given.
    """
    # The list defaults are shared between calls; they are only read here,
    # never mutated.
    if (
        len(destination_team_ids)
        + len(destination_router_ids)
        + len(destination_account_user_ids)
        == 0
    ):
        raise Exception(
            "at least 1 destination (Team, Router, or Account User) is required"
        )

    # Arguments are passed by keyword so that each value reaches the
    # parameter it is meant for regardless of parameter order.
    if not incident:
        self.__send_alert(
            title=title,
            description=description,
            urgency=urgency,
            destination_team_ids=destination_team_ids,
            destination_router_ids=destination_router_ids,
            destination_account_user_ids=destination_account_user_ids,
            status=status,
            **kwargs,
        )
    else:
        self.__send_incident(
            title=title,
            incident_severity=severities,
            incident_message=incident_message,
            urgency=urgency,
            destination_team_ids=destination_team_ids,
            destination_router_ids=destination_router_ids,
            destination_account_user_ids=destination_account_user_ids,
            **kwargs,
        )
""" parseable_server: str = dataclasses.field( metadata={ "required": True, "description": "Parseable Frontend URL", "hint": "https://demo.parseable.io", "sensitive": False, } ) username: str = dataclasses.field( metadata={ "required": True, "description": "Parseable username", "sensitive": False, } ) password: str = dataclasses.field( metadata={ "required": True, "description": "Parseable password", "sensitive": True, } ) class ParseableProvider(BaseProvider): """Parseable provider to ingest data from Parseable.""" PROVIDER_CATEGORY = ["Monitoring"] webhook_description = "This is an example of how to configure an alert to be sent to Keep using Parseable's webhook feature. Post this to https://YOUR_PARSEABLE_SERVER/api/v1/logstream/YOUR_STREAM_NAME/alert" webhook_template = """{{ "version": "v1", "alerts": [ {{ "name": "Alert: Server side error", "message": "server reporting status as 500", "rule": {{ "type": "column", "config": {{ "column": "status", "operator": "=", "value": 500, "repeats": 2 }} }}, "targets": [ {{ "type": "webhook", "endpoint": "{keep_webhook_api_url}", "skip_tls_check": true, "repeat": {{ "interval": "10s", "times": 5 }}, "headers": {{"X-API-KEY": "{api_key}"}} }} ] }} ] }}""" SEVERITIES_MAP = { "disaster": AlertSeverity.CRITICAL, "high": AlertSeverity.HIGH, "average": AlertSeverity.WARNING, "low": AlertSeverity.LOW, } STATUS_MAP = { "firing": AlertStatus.FIRING, "resolved": AlertStatus.RESOLVED, "acknowledged": AlertStatus.ACKNOWLEDGED, "pending": AlertStatus.PENDING, "suppressed": AlertStatus.SUPPRESSED, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Parseable provider. 
@staticmethod
def parse_event_raw_body(raw_body: bytes | dict) -> dict:
    """
    Parse the raw body of the event.

    Parseable sends plain text, one ``Key: value`` pair per line, e.g.
    > b'Alert: Server side error triggered on teststream1\\nMessage: server reporting status as 500\\nFailing Condition: status column equal to abcd, 2 times'
    which is turned into
    > {'alert': 'Server side error triggered on teststream1', 'message': 'server reporting status as 500', 'failing_condition': 'status column equal to abcd, 2 times'}

    Args:
        raw_body (bytes | dict): the message sent by the parseable server. A
            dict is returned unchanged.

    Returns:
        dict: the parsed event; keys are lower-cased with spaces replaced by
        underscores. Lines that are not ``Key: value`` pairs are logged and
        skipped.
    """
    logger = logging.getLogger(__name__)
    if isinstance(raw_body, dict):
        # Already structured, nothing to parse
        return raw_body
    if isinstance(raw_body, (bytes, bytearray)):
        raw_body_string = raw_body.decode()
    else:
        raw_body_string = str(raw_body)

    event = {}
    for line in raw_body_string.splitlines():
        if not line:
            continue
        # Split on the first separator only: the value itself may contain ": "
        key, separator, value = line.partition(": ")
        if not separator:
            logger.error(f"Failed to parse line {line}: missing ': ' separator")
            continue
        event[key.lower().replace(" ", "_")] = value
    return event
class PingdomProvider(BaseProvider):
    "Get alerts from Pingdom."

    # NOTE(review): the line breaks/indentation of this help text were
    # reconstructed from a whitespace-collapsed copy of the file - confirm
    # against the original before merging.
    webhook_description = """Install Keep as Pingdom webhook
    1. Go to Settings > Integrations.
    2. Click Add Integration.
    3. Enter:
        Type = Webhook
        Name = Keep
        URL = {keep_webhook_api_url_with_auth}
    4. Click Save Integration.
    """
    webhook_template = """"""

    PROVIDER_CATEGORY = ["Monitoring"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="read",
            description="Read alerts from Pingdom.",
            mandatory=True,
        ),
    ]

    # N/A - no severities are mapped; lookups fall back to AlertSeverity.INFO.
    SEVERITIES_MAP = {}
    STATUS_MAP = {
        "down": AlertStatus.FIRING,
        "up": AlertStatus.RESOLVED,
        "paused": AlertStatus.SUPPRESSED,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def validate_config(self):
        """
        Validate provider configuration specific to Pingdom.
        """
        self.authentication_config = PingdomProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        Dispose provider resources (nothing to release).
        """
        pass

    def _get_headers(self):
        """
        Helper method to get headers for Pingdom API requests.
        """
        return {
            "Authorization": f"Bearer {self.authentication_config.api_key}",
        }

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Validate Pingdom scopes by trying to fetch alerts.

        Returns:
            dict: ``{"read": True}`` on success, otherwise ``{"read": <error text>}``.
        """
        try:
            self.get_alerts()
            return {
                "read": True,
            }
        except Exception as e:
            return {"read": str(e)}

    @staticmethod
    def _last_received(raw: dict) -> str:
        """
        Build the ISO-8601 ``lastReceived`` value for a Pingdom payload.

        Uses the epoch in ``raw["time"]`` when present, otherwise the current
        time. Both branches are timezone-aware UTC so the values are comparable.
        """
        epoch = raw.get("time")
        if epoch is not None:
            return datetime.datetime.fromtimestamp(
                epoch, tz=datetime.timezone.utc
            ).isoformat()
        return datetime.datetime.now(tz=datetime.timezone.utc).isoformat()

    def _get_alerts(self) -> list[AlertDto]:
        """
        Retrieve alerts from Pingdom.

        Fetches the alert actions and the checks, then labels every alert with
        the name of the check it belongs to.

        Returns:
            list[AlertDto]: one DTO per alert action returned by Pingdom.

        Raises:
            requests.HTTPError: if either Pingdom request returns an error status.
        """
        alerts_response = requests.get(
            "https://api.pingdom.com/api/3.1/actions", headers=self._get_headers()
        )
        alerts_response.raise_for_status()
        # "actions"/"alerts" may be missing or null; treat that as "no alerts".
        alerts = (alerts_response.json().get("actions") or {}).get("alerts") or []

        checks_response = requests.get(
            "https://api.pingdom.com/api/3.1/checks", headers=self._get_headers()
        )
        checks_response.raise_for_status()
        checks = checks_response.json().get("checks") or []

        # id -> name lookup; setdefault keeps the first check seen for an id.
        check_names = {}
        for check in checks:
            check_names.setdefault(check.get("id"), check.get("name"))

        alerts_dtos = []
        for alert in alerts:
            check_name = check_names.get(alert.get("checkid"))

            # map severity and status to keep's format
            description = alert.get("messagefull")
            status_key = alert.get("messageshort")
            if status_key not in PingdomProvider.STATUS_MAP:
                # Fall back to looking for the state in the full message.
                text = description or ""
                if "UP" in text:
                    status_key = "up"
                elif "DOWN" in text:
                    status_key = "down"
                else:
                    self.logger.warning(
                        f"Unknown status {status_key} for alert {alert.get('id')}"
                    )
                    status_key = "down"
            status = PingdomProvider.STATUS_MAP.get(status_key, AlertStatus.FIRING)

            # its N/A but maybe in the future we will have it
            severity = PingdomProvider.SEVERITIES_MAP.get(
                alert.get("severity"), AlertSeverity.INFO
            )

            alerts_dtos.append(
                AlertDto(
                    id=alert.get("checkid"),
                    fingerprint=str(alert.get("checkid")),
                    name=check_name,
                    severity=severity,
                    status=status,
                    lastReceived=PingdomProvider._last_received(alert),
                    description=description,
                    charged=alert.get("charged"),
                    source=["pingdom"],
                    username=alert.get("username"),
                    userid=alert.get("userid"),
                    via=alert.get("via"),
                    alert=alert,  # keep the original alert
                )
            )
        return alerts_dtos

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto:
        """
        Convert a Pingdom webhook payload into an AlertDto.

        Payload format:
        https://pingdom.com/resources/webhooks/#Examples-of-webhook-JSON-output-for-uptime-checks

        Args:
            event (dict): the webhook JSON body.
            provider_instance (BaseProvider): optional provider whose logger is
                used for warnings; a module logger is used when it is None.

        Returns:
            AlertDto: the formatted alert.
        """
        import logging

        # provider_instance defaults to None, so it must not be dereferenced blindly.
        logger = getattr(provider_instance, "logger", None) or logging.getLogger(
            __name__
        )

        state = event.get("current_state")
        if state is None:
            logger.warning("'current_state' missing from payload.")
            state = ""
        else:
            state = str(state).lower()

        # map the pingdom status to keep's, fallback if status somehow is missing
        status = PingdomProvider.STATUS_MAP.get(state)
        if status is None:
            long_desc = (event.get("long_description") or "").strip()
            if long_desc == "OK":
                status = AlertStatus.RESOLVED
            else:
                status = AlertStatus.FIRING

        # its N/A but maybe in the future we will have it
        severity = PingdomProvider.SEVERITIES_MAP.get(
            event.get("importance_level"), AlertSeverity.INFO
        )

        alert = AlertDto(
            id=event.get("check_id"),
            fingerprint=str(event.get("check_id")),
            name=event.get("check_name"),
            status=status,
            severity=severity,
            lastReceived=PingdomProvider._last_received(event),
            description=event.get("long_description"),
            source=["pingdom"],
            check_params=event.get("check_params", {}),
            check_type=event.get("check_type", None),
            short_description=event.get("description", None),
            previous_status=event.get("previous_state", None),
            tags=event.get("tags", []),
            version=event.get("version", 1),
            state_changed_utc_time=event.get("state_changed_utc_time", None),
            state_changed_timestamp=event.get("state_changed_timestamp", None),
            custom_message=event.get("custom_message", None),
            first_probe=event.get("first_probe", None),
            second_probe=event.get("second_probe", None),
            alert=event,  # keep the original alert
        )
        return alert
class PlannerProvider(BaseProvider):
    """
    Create tasks in Microsoft Planner.
    """

    PROVIDER_DISPLAY_NAME = "Microsoft Planner"
    MS_GRAPH_BASE_URL = "https://graph.microsoft.com"
    MS_PLANS_URL = urljoin(base=MS_GRAPH_BASE_URL, url="/v1.0/planner/plans")
    MS_TASKS_URL = urljoin(base=MS_GRAPH_BASE_URL, url="/v1.0/planner/tasks")
    MS_AUTH_BASE_URL = "https://login.microsoftonline.com"
    MS_GRAPH_RESOURCE = "https://graph.microsoft.com"

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        """
        Initialise the provider and request an access token right away.

        The token is stored in the headers used by every later Graph request.
        """
        super().__init__(context_manager, provider_id, config)
        self.__access_token = self.__generate_access_token()
        self.__headers = {
            "Authorization": f"Bearer {self.__access_token}",
            "Content-Type": "application/json",
        }

    def __generate_access_token(self):
        """
        Helper method to generate the access token (client-credentials grant).

        Returns:
            str: the access token from the token endpoint response.

        Raises:
            requests.HTTPError: if the token endpoint returns an error status.
            RuntimeError: if the response carries no ``access_token``; otherwise
                every later request would be sent with "Bearer None".
        """
        MS_TOKEN_URL = urljoin(
            base=self.MS_AUTH_BASE_URL,
            url=f"/{self.authentication_config.tenant_id}/oauth2/token",
        )
        request_body = {
            "grant_type": "client_credentials",
            "client_id": self.authentication_config.client_id,
            "client_secret": self.authentication_config.client_secret,
            "resource": self.MS_GRAPH_RESOURCE,
        }

        self.logger.info("Generating planner access token...")
        response = requests.post(url=MS_TOKEN_URL, data=request_body)
        response.raise_for_status()

        access_token = response.json().get("access_token")
        if not access_token:
            raise RuntimeError(
                "Planner token endpoint response did not contain an access_token"
            )
        self.logger.info("Generated planner access token.")
        return access_token

    def dispose(self):
        pass

    def validate_config(self):
        """
        Parse the authentication section of the provider config.
        """
        self.authentication_config = PlannerProviderAuthConfig(
            **self.config.authentication
        )

    def __get_plan_by_id(self, plan_id=""):
        """
        Helper method to fetch the plan details by id.

        Raises:
            requests.HTTPError: if the request returns an error status.
        """
        MS_PLAN_URL = f"{self.MS_PLANS_URL}/{plan_id}"

        self.logger.info(f"Fetching plan by id: {plan_id}")
        response = requests.get(url=MS_PLAN_URL, headers=self.__headers)
        # in case of error response
        response.raise_for_status()
        response_data = response.json()
        self.logger.info(f"Fetched plan by id: {plan_id}")
        return response_data

    def __create_task(self, plan_id="", title="", bucket_id=None):
        """
        Helper method to create a task in Planner.

        Returns:
            dict: the JSON body returned for the created task.

        Raises:
            requests.HTTPError: if the request returns an error status.
        """
        request_body = {"planId": plan_id, "title": title, "bucketId": bucket_id}

        self.logger.info(f"Creating new task with title: {title}")
        response = requests.post(
            url=self.MS_TASKS_URL, headers=self.__headers, json=request_body
        )
        # in case of error response
        response.raise_for_status()
        response_data = response.json()
        self.logger.info(
            "Created new task with id:%s and title:%s",
            response_data["id"],
            response_data["title"],
        )
        return response_data

    def _notify(self, plan_id="", title="", bucket_id=None, **kwargs: dict):
        """
        Create a task in the given plan.

        Args:
            plan_id (str): id of the plan to add the task to (required).
            title (str): title of the new task.
            bucket_id (str | None): optional bucket to place the task in.

        Returns:
            dict: the created task as returned by the API.

        Raises:
            ValueError: if ``plan_id`` is empty.
            requests.HTTPError: if the plan lookup or the task creation fails.
        """
        # An empty id would turn the lookup below into a request against the
        # plans collection URL instead of a single plan.
        if not plan_id:
            raise ValueError("plan_id is required")

        # to verify if the plan with plan_id exists or not
        self.__get_plan_by_id(plan_id=plan_id)

        # create a new task in given plan
        return self.__create_task(plan_id=plan_id, title=title, bucket_id=bucket_id)
class PostgresProvider(BaseProvider):
    """Enrich alerts with data from Postgres."""

    PROVIDER_DISPLAY_NAME = "PostgreSQL"
    PROVIDER_CATEGORY = ["Database"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="connect_to_server",
            description="The user can connect to the server",
            mandatory=True,
            alias="Connect to the server",
        )
    ]
    PROVIDER_METHODS = [
        ProviderMethod(
            name="query",
            func_name="execute_query",
            description="Query the Postgres database",
            type="view",
        )
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        # Last connection opened by __init_connection(); None until first use.
        self.conn = None

    def validate_scopes(self):
        """
        Validates that the user has the required scopes to use the provider.

        Returns:
            dict: ``{"connect_to_server": True}`` if a connection could be
                opened, otherwise the error text under the same key.
        """
        try:
            conn = self.__init_connection()
            conn.close()
            scopes = {
                "connect_to_server": True,
            }
        except Exception as e:
            self.logger.exception("Error validating scopes")
            scopes = {
                "connect_to_server": str(e),
            }
        return scopes

    def execute_query(self, query: str):
        """Provider-method entry point; delegates to ``_query``."""
        return self._query(query)

    def __init_connection(self):
        """
        Generates a Postgres connection.

        The connection is also remembered on ``self.conn``.

        Returns:
            psycopg2 connection object
        """
        conn = psycopg2.connect(
            dbname=self.authentication_config.database,
            user=self.authentication_config.username,
            password=self.authentication_config.password,
            host=self.authentication_config.host,
            port=self.authentication_config.port,
            connect_timeout=10,
        )
        self.conn = conn
        return conn

    def dispose(self):
        """
        Close the last opened connection, if there is one.
        """
        # Nothing was ever opened: avoid logging a spurious AttributeError.
        if self.conn is None:
            return
        try:
            self.conn.close()
        except Exception:
            self.logger.exception("Error closing Postgres connection")

    def validate_config(self):
        """
        Validates required configuration for Postgres's provider.
        """
        self.authentication_config = PostgresProviderAuthConfig(
            **self.config.authentication
        )

    def _query(self, query: str, **kwargs: dict) -> list | tuple:
        """
        Executes a query against the Postgres database.

        Args:
            query (str): the SQL statement to run; it must produce a result set.

        Returns:
            list: all rows returned by the query.

        Raises:
            ValueError: if ``query`` is empty.
        """
        if not query:
            raise ValueError("Query is required")

        conn = self.__init_connection()
        try:
            # The cursor context manager closes the cursor on exit.
            with conn.cursor() as cur:
                cur.execute(query)
                return list(cur.fetchall())
        finally:
            # Always release the connection, also when the query fails.
            conn.close()

    def _notify(self, query: str, **kwargs):
        """
        Executes a statement against the Postgres database and commits it.

        Args:
            query (str): the SQL statement to run.

        Raises:
            ValueError: if ``query`` is empty.
        """
        # notify and query are the same for Postgres
        if not query:
            raise ValueError("Query is required")

        conn = self.__init_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(query)
            conn.commit()
        finally:
            # Always release the connection, also when the statement fails.
            conn.close()
class PosthogProvider(BaseProvider, ProviderHealthMixin):
    """Query data from PostHog analytics."""

    PROVIDER_DISPLAY_NAME = "PostHog"
    PROVIDER_CATEGORY = ["Analytics"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="session_recording:read",
            description="Read PostHog session recordings",
            mandatory=True,
            alias="Read session recordings",
        ),
        ProviderScope(
            name="session_recording_playlist:read",
            description="Read PostHog session recording playlists",
            mandatory=False,
            alias="Read recording playlists",
        ),
        ProviderScope(
            name="project:read",
            description="Read PostHog project data",
            mandatory=True,
            alias="Read project data",
        ),
    ]
    PROVIDER_METHODS = [
        ProviderMethod(
            name="Get Session Recording Domains",
            func_name="get_session_recording_domains",
            scopes=["session_recording:read", "project:read"],
            description="Get a list of domains from session recordings within a time period",
            type="action",
        ),
        ProviderMethod(
            name="Get Session Recordings",
            func_name="get_session_recordings",
            scopes=["session_recording:read", "project:read"],
            description="Get session recordings within a time period",
            type="action",
        ),
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        self.base_url = "https://app.posthog.com/api"
        self.headers = {
            "Authorization": f"Bearer {self.authentication_config.api_key}",
            "Content-Type": "application/json",
        }

    def validate_scopes(self):
        """
        Probe one endpoint per scope and report which ones are accessible.

        Returns:
            dict: scope name -> True, or an error string. If a request raises,
                every scope not yet checked is set to that exception's text.
        """
        scopes = {}
        self.logger.info("Validating scopes")

        project_url = (
            f"{self.base_url}/projects/{self.authentication_config.project_id}"
        )
        # (scope, url, query params, failure message prefix)
        checks = (
            ("project:read", project_url, None, "Failed to access project data"),
            (
                "session_recording:read",
                f"{project_url}/session_recordings",
                {"limit": 1},
                "Failed to access session recordings",
            ),
            (
                "session_recording_playlist:read",
                f"{project_url}/session_recording_playlists",
                None,
                "Failed to access recording playlists",
            ),
        )

        try:
            for scope, url, params, failure in checks:
                response = requests.get(url, headers=self.headers, params=params)
                if response.status_code == 200:
                    scopes[scope] = True
                else:
                    scopes[scope] = f"{failure}: {response.status_code}"
        except Exception as e:
            self.logger.exception("Failed to validate PostHog scopes")
            for scope, _url, _params, _failure in checks:
                if scope not in scopes:
                    scopes[scope] = str(e)

        return scopes

    def validate_config(self):
        """
        Parse the authentication section of the provider config.
        """
        self.authentication_config = PosthogProviderAuthConfig(
            **self.config.authentication
        )

    def _fetch_recordings(self, hours: int, limit: int):
        """
        Fetch up to ``limit`` session recordings from the last ``hours`` hours.

        Returns:
            tuple: ``(recordings, start_timestamp, end_timestamp)`` where the
                timestamps are the ISO strings sent to the API.

        Raises:
            Exception: if the first page request does not return HTTP 200.
        """
        from datetime import timezone

        # The timestamps carry a "Z" suffix, so they must really be UTC.
        end_time = datetime.now(timezone.utc).replace(tzinfo=None)
        start_time = end_time - timedelta(hours=hours)
        start_timestamp = start_time.isoformat() + "Z"
        end_timestamp = end_time.isoformat() + "Z"

        recordings_endpoint = f"{self.base_url}/projects/{self.authentication_config.project_id}/session_recordings"
        params = {
            "date_from": start_timestamp,
            "date_to": end_timestamp,
            "limit": limit,
        }

        response = requests.get(
            recordings_endpoint, params=params, headers=self.headers
        )
        if response.status_code != 200:
            self.logger.error(
                "Failed to fetch session recordings",
                extra={"status_code": response.status_code, "response": response.text},
            )
            raise Exception(
                f"API request failed with status code {response.status_code}: {response.text}"
            )

        data = response.json()
        recordings = data.get("results", [])

        # Follow "next" links until enough recordings were collected.
        while data.get("next") and recordings and len(recordings) < limit:
            response = requests.get(data["next"], headers=self.headers)
            if response.status_code != 200:
                self.logger.error(
                    "Failed to fetch additional session recordings",
                    extra={"status_code": response.status_code},
                )
                break
            data = response.json()
            recordings.extend(data.get("results", []))

        # The last page may overshoot; never return more than was asked for.
        return recordings[:limit], start_timestamp, end_timestamp

    def get_session_recording_domains(
        self,
        hours: int = 24,
        limit: int = 500,
    ):
        """
        Get a list of domains from session recordings within a specified time period.

        Args:
            hours (int): Number of hours to look back (default: 24)
            limit (int): Maximum number of recordings to fetch (default: 500)

        Returns:
            dict: ``unique_domains`` (list), ``domain_counts`` (domain -> number
                of recordings that started on it), ``total_domains_found``
                (recordings with a domain) and ``unique_domains_count``.
        """
        self.logger.info(
            f"Fetching session recording domains for the last {hours} hours"
        )
        recordings, _start, _end = self._fetch_recordings(hours=hours, limit=limit)

        # Keep one entry per recording (not a set) so the counts are real frequencies.
        domains = []
        for recording in recordings:
            domain = urlparse(recording.get("start_url") or "").netloc
            if domain:
                domains.append(domain)
            else:
                self.logger.warning(
                    f"No domain found for recording ID {recording.get('id')}"
                )

        domain_counter = Counter(domains)
        unique_domains = list(domain_counter.keys())

        return {
            "unique_domains": unique_domains,
            "domain_counts": dict(domain_counter),
            "total_domains_found": len(domains),
            "unique_domains_count": len(unique_domains),
        }

    def get_session_recordings(
        self,
        hours: int = 24,
        limit: int = 100,
    ):
        """
        Get session recordings within a specified time period.

        Args:
            hours (int): Number of hours to look back (default: 24)
            limit (int): Maximum number of recordings to fetch (default: 100)

        Returns:
            dict: ``recordings`` (summary per recording), ``total_recordings``
                and ``time_range`` with the queried start/end timestamps.
        """
        self.logger.info(f"Fetching session recordings for the last {hours} hours")
        recordings, start_timestamp, end_timestamp = self._fetch_recordings(
            hours=hours, limit=limit
        )

        # Summarize basic information for each recording
        summary_fields = (
            "id",
            "start_time",
            "end_time",
            "duration",
            "person",
            "start_url",
        )
        recording_summaries = [
            {field: recording.get(field) for field in summary_fields}
            for recording in recordings
        ]

        return {
            "recordings": recording_summaries,
            "total_recordings": len(recording_summaries),
            "time_range": {"start": start_timestamp, "end": end_timestamp},
        }

    def _query(self, query_type="", hours=24, limit=100, **kwargs: dict):
        """
        Query PostHog data.

        Args:
            query_type (str): Type of query ("session_recording_domains" or "session_recordings")
            hours (int): Number of hours to look back
            limit (int): Maximum number of items to fetch
            **kwargs: Additional arguments (unused)

        Returns:
            dict: Query results

        Raises:
            NotImplementedError: for any other ``query_type``.
        """
        if query_type == "session_recording_domains":
            return self.get_session_recording_domains(hours=hours, limit=limit)
        elif query_type == "session_recordings":
            return self.get_session_recordings(hours=hours, limit=limit)
        else:
            raise NotImplementedError(f"Query type {query_type} not implemented")

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass
["calendar-producer-java-otel-api-dd", "kafka", "queue"], "labels.mq_manager": ["mq_manager1", "mq_manager2", "mq_manager3"], }, }, "MQFull (Message queue is full)": { "payload": { "summary": "Message queue is over 90% capacity", "labels": {"severity": "critical", "customer_id": "acme"}, }, "parameters": { "labels.queue": ["queue4"], "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "queue"], "labels.mq_manager": ["mq_manager4"], }, }, "DiskSpaceLow": { "payload": { "summary": "Disk space is below 20%", "labels": { "severity": "warning", }, }, "parameters": { "labels.host": ["host1", "host2", "host3"], "labels.service": [ "calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db", "ftp", "payments", ], "labels.instance": ["instance1", "instance2", "instance3"], }, }, "NetworkLatencyHigh": { "payload": { "summary": "Network latency is higher than normal for customer_id:acme", "labels": { "severity": "info", }, }, "parameters": { "labels.host": ["host1", "host2", "host3"], "labels.service": [ "calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db", ], "labels.instance": ["instance1", "instance2", "instance3"], }, }, } ================================================ FILE: keep/providers/prometheus_provider/prometheus_provider.py ================================================ """ PrometheusProvider is a class that provides a way to read data from Prometheus. 
""" import dataclasses import datetime import os import pydantic import requests from requests.auth import HTTPBasicAuth from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider, ProviderHealthMixin from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class PrometheusProviderAuthConfig: url: pydantic.AnyHttpUrl = dataclasses.field( metadata={ "required": True, "description": "Prometheus server URL", "hint": "https://prometheus-us-central1.grafana.net/api/prom", "validation": "any_http_url", } ) username: str = dataclasses.field( metadata={ "description": "Prometheus username", "sensitive": False, }, default="", ) password: str = dataclasses.field( metadata={ "description": "Prometheus password", "sensitive": True, }, default="", ) verify: bool = dataclasses.field( metadata={ "description": "Verify SSL certificates", "hint": "Set to false to allow self-signed certificates", "sensitive": False, }, default=True, ) class PrometheusProvider(BaseProvider, ProviderHealthMixin): """Get alerts from Prometheus into Keep.""" webhook_description = "This provider takes advantage of configurable webhooks available with Prometheus Alertmanager. 
Use the following template to configure AlertManager:" webhook_template = """route: receiver: "keep" group_by: ['alertname'] group_wait: 15s group_interval: 15s repeat_interval: 1m continue: true receivers: - name: "keep" webhook_configs: - url: '{keep_webhook_api_url}' send_resolved: true http_config: basic_auth: username: api_key password: {api_key}""" SEVERITIES_MAP = { "critical": AlertSeverity.CRITICAL, "error": AlertSeverity.HIGH, "high": AlertSeverity.HIGH, "warning": AlertSeverity.WARNING, "medium": AlertSeverity.WARNING, "info": AlertSeverity.INFO, "low": AlertSeverity.LOW, } PROVIDER_CATEGORY = ["Monitoring"] STATUS_MAP = { "firing": AlertStatus.FIRING, "resolved": AlertStatus.RESOLVED, } PROVIDER_SCOPES = [ ProviderScope( name="connectivity", description="Connectivity Test", mandatory=True ) ] FINGERPRINT_FIELDS = ["fingerprint"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): """ Validates required configuration for Prometheus's provider. """ self.authentication_config = PrometheusProviderAuthConfig( **self.config.authentication ) def validate_scopes(self) -> dict[str, bool | str]: validated_scopes = {"connectivity": True} try: self._get_alerts() except Exception as e: validated_scopes["connectivity"] = str(e) return validated_scopes def _query(self, query): """ Executes a query against the Prometheus server. 
Returns: list | tuple: list of results or single result if single_row is True """ if not query: raise ValueError("Query is required") auth = None if self.authentication_config.username and self.authentication_config.password: auth = HTTPBasicAuth( self.authentication_config.username, self.authentication_config.password ) response = requests.get( f"{self.authentication_config.url}/api/v1/query", params={"query": query}, auth=( auth if self.authentication_config.username and self.authentication_config.password else None ), verify=self.authentication_config.verify, ) if response.status_code != 200: raise Exception(f"Prometheus query failed: {response.content}") return response.json() def _get_alerts(self) -> list[AlertDto]: auth = None if self.authentication_config.username and self.authentication_config.password: auth = HTTPBasicAuth( self.authentication_config.username, self.authentication_config.password ) response = requests.get( f"{self.authentication_config.url}/api/v1/alerts", auth=auth, verify=self.authentication_config.verify, ) response.raise_for_status() if not response.ok: return [] alerts_data = response.json().get("data", {}) alert_dtos = self._format_alert(alerts_data) return alert_dtos @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> list[AlertDto]: # TODO: need to support more than 1 alert per event alert_dtos = [] if isinstance(event, list): return event else: alerts = event.get("alerts", [event]) for alert in alerts: alert_id = alert.get("id", alert.get("labels", {}).get("alertname")) description = alert.get("annotations", {}).pop( "description", None ) or alert.get("annotations", {}).get("summary", alert_id) labels = {k.lower(): v for k, v in alert.pop("labels", {}).items()} annotations = { k.lower(): v for k, v in alert.pop("annotations", {}).items() } service = labels.get("service", annotations.get("service", None)) # map severity and status to keep's format status = alert.pop("state", None) or 
@classmethod
def simulate_alert(cls, **kwargs) -> dict:
    """
    Mock a Prometheus alert.

    Keyword Args:
        alert_type (str): key of the mock alert in `ALERTS`. When missing or
            empty, a random alert type is chosen.
        to_wrap_with_provider_type (bool): when truthy, the payload is
            returned wrapped as
            `{"keep_source_type": "prometheus", "event": payload}`.

    Returns:
        dict: the simulated alert payload, optionally wrapped.
    """
    import copy
    import hashlib
    import json
    import random

    from keep.providers.prometheus_provider.alerts_mock import ALERTS

    alert_type = kwargs.get("alert_type")
    if not alert_type:
        alert_type = random.choice(list(ALERTS.keys()))

    to_wrap_with_provider_type = kwargs.get("to_wrap_with_provider_type")

    # Work on a private copy: the payload is filled in below and handed to the
    # caller, so editing the module-level mock in place would leak state
    # between calls.
    alert_payload = copy.deepcopy(ALERTS[alert_type]["payload"])
    # The default must be a dict because `.items()` is called on it.
    alert_parameters = ALERTS[alert_type].get("parameters", {})

    # now generate some random data
    for parameter, parameter_options in alert_parameters.items():
        # support "labels.some_label" format
        if "." in parameter:
            # nested parameter: only the first two path segments are used
            parameter_path = parameter.split(".")
            section, key = parameter_path[0], parameter_path[1]
            if section not in alert_payload:
                alert_payload[section] = {}
            alert_payload[section][key] = random.choice(parameter_options)
        else:
            alert_payload[parameter] = random.choice(parameter_options)

    annotations = {"summary": alert_payload["summary"]}
    alert_payload["labels"]["alertname"] = alert_type
    alert_payload["status"] = random.choice(
        [AlertStatus.FIRING.value, AlertStatus.RESOLVED.value]
    )
    alert_payload["annotations"] = annotations
    alert_payload["startsAt"] = datetime.datetime.now(
        tz=datetime.timezone.utc
    ).isoformat()
    alert_payload["endsAt"] = "0001-01-01T00:00:00Z"
    alert_payload["generatorURL"] = "http://example.com/graph?g0.expr={}".format(
        alert_type
    )

    # TODO: use BaseProvider's get_alert_fingerprint
    # The fingerprint is the MD5 of the labels serialized with sorted keys.
    fingerprint_src = json.dumps(alert_payload["labels"], sort_keys=True)
    fingerprint = hashlib.md5(fingerprint_src.encode()).hexdigest()
    alert_payload["fingerprint"] = fingerprint

    if to_wrap_with_provider_type:
        return {"keep_source_type": "prometheus", "event": alert_payload}
    return alert_payload
def get_method_parameters_safe(raw_params: list[str]) -> list[str]:
    """
    Normalize a list of method parameter names.

    `self` is dropped, and a name that is a Python keyword with a trailing
    underscore (e.g. `from_`, `class_`) is returned without the underscore.
    Every other name is passed through unchanged.

    Args:
        raw_params (list[str]): parameter names as found in a signature.

    Returns:
        list[str]: the cleaned parameter names, in the original order.
    """

    def _strip_keyword_suffix(name: str) -> str:
        # Only strip the underscore when what remains is a real keyword.
        without_suffix = name[:-1]
        if name.endswith("_") and keyword.iskeyword(without_suffix):
            return without_suffix
        return name

    return [_strip_keyword_suffix(name) for name in raw_params if name != "self"]
"cloudwatch.logs" or "cloudwatch.metrics" actual_provider_type = provider_type_split[ 0 ] # provider type is always the first part module = importlib.import_module( f"keep.providers.{actual_provider_type}_provider.{actual_provider_type}_provider" ) # If the provider type doesn't include a sub-type, e.g. "cloudwatch.logs" if len(provider_type_split) == 1: provider_class = getattr( module, actual_provider_type.title().replace("_", "") + "Provider" ) # If the provider type includes a sub-type, e.g. "cloudwatch.metrics" else: provider_class = getattr( module, actual_provider_type.title().replace("_", "") + provider_type_split[1].title().replace("_", "") + "Provider", ) return provider_class @staticmethod def get_provider( context_manager: ContextManager, provider_id: str, provider_type: str, provider_config: dict, **kwargs, ) -> BaseProvider | BaseTopologyProvider | BaseIncidentProvider: """ Get the instantiated provider class according to the provider type. Args: provider (dict): The provider configuration. Returns: BaseProvider: The provider class. """ provider_class = ProvidersFactory.get_provider_class(provider_type) # we keep a copy of the auth config so we can check if the provider has changed it and we need to update it # an example for that is the Datadog provider that uses OAuth and needs to save the fresh new refresh token. provider_config_copy = copy.deepcopy(provider_config) provider_config: ProviderConfig = ProviderConfig(**provider_config) try: provider = provider_class( context_manager=context_manager, provider_id=provider_id, config=provider_config, ) return provider except TypeError as exc: error_message = f"Configuration problem while trying to initialize the provider {provider_id}. Probably missing provider config, please check the provider configuration." 
logging.getLogger(__name__).error(error_message) raise ProviderConfigurationException(exc) except Exception as exc: raise exc finally: # if the provider has changed the auth config, we need to update it, even if the provider failed to initialize if ( provider_config_copy.get("authentication") != provider_config.authentication ): provider_config_copy["authentication"] = provider_config.authentication secret_manager = SecretManagerFactory.get_secret_manager( context_manager ) secret_manager.write_secret( secret_name=f"{context_manager.tenant_id}_{provider_type}_{provider_id}", secret_value=json.dumps(provider_config_copy), ) @staticmethod def get_provider_required_config(provider_type: str) -> dict: """ Get the provider class from the provider type. Args: provider (dict): The provider configuration. Returns: BaseProvider: The provider class. """ # support for provider types with subtypes e.g. auth0.logs, github.stars # todo: if some day there will be different conf for auth0.logs and auth0.users, this will need to be revisited if "." in provider_type: provider_type = provider_type.split(".")[0] module = importlib.import_module( f"keep.providers.{provider_type}_provider.{provider_type}_provider" ) try: provider_auth_config_class = getattr( module, provider_type.title().replace("_", "") + "ProviderAuthConfig" ) return provider_auth_config_class except (ImportError, AttributeError): logging.getLogger(__name__).debug( f"Provider {provider_type} does not have a provider auth config class" ) return {} def _get_method_param_type(param: inspect.Parameter) -> str: """ Get the type name from a function parameter annotation. Handles generic types like Union by returning the first non-NoneType arg. Falls back to 'str' if it can't determine the type. Args: param (inspect.Parameter): The parameter to get the type from. Returns: str: The type name. 
""" annotation_type = param.annotation if annotation_type is inspect.Parameter.empty: # if no annotation, defaults to str return "str" if isinstance(annotation_type, type): # it's a simple type return annotation_type.__name__ annotation_type_origin = typing.get_origin(annotation_type) annotation_type_args = typing.get_args(annotation_type) if annotation_type_args and annotation_type_origin in [ typing.Union, types.UnionType, ]: # get the first annotation type argument which type is not NoneType arg_type = next( item.__name__ for item in annotation_type_args if item.__name__ != "NoneType" ) return arg_type else: # otherwise fallback to str return "str" def __get_methods(provider_class: BaseProvider) -> list[ProviderMethodDTO]: methods = [] for method in provider_class.PROVIDER_METHODS: params = dict( inspect.signature( provider_class.__dict__.get(method.func_name) ).parameters ) func_params = [] for param in params: if param == "self": continue mandatory = True default = None if getattr(params[param].default, "__name__", None) != "_empty": mandatory = False default = str(params[param].default) expected_values = list(get_args(params[param].annotation)) func_params.append( ProviderMethodParam( name=param, type=ProvidersFactory._get_method_param_type(params[param]), mandatory=mandatory, default=default, expected_values=expected_values, ) ) if "func_params" in method.dict(): if method.func_params: # this should not happen logging.getLogger(__name__).warning( f"Provider {provider_class.__name__} method {method.func_name} already has func_params" ) # remove it, we already adding it via func_params=func_params else: delattr(method, "func_params") methods.append(ProviderMethodDTO(**method.dict(), func_params=func_params)) return methods @staticmethod def get_all_providers(ignore_cache_file: bool = False) -> list[Provider]: """ Get all the providers. Returns: list: All the providers. 
""" logger = logging.getLogger(__name__) # use the cache if exists if ProvidersFactory._loaded_providers_cache: logger.debug("Using cached providers") return ProvidersFactory._loaded_providers_cache if os.path.exists(PROVIDERS_CACHE_FILE) and not ignore_cache_file: logger.info( "Loading providers from cache file", extra={"file": PROVIDERS_CACHE_FILE}, ) with open(PROVIDERS_CACHE_FILE, "r") as f: providers_cache = json.load(f) ProvidersFactory._loaded_providers_cache = [ Provider(**provider) for provider in providers_cache ] logger.info( "Providers loaded from cache file", extra={"file": PROVIDERS_CACHE_FILE}, ) return ProvidersFactory._loaded_providers_cache logger.info("Loading providers") providers = [] blacklisted_providers = [ "base_provider", "mock_provider", "file_provider", "github_workflows_provider", ] for provider_directory in os.listdir( os.path.dirname(os.path.abspath(__file__)) ): # skip files that aren't providers if not provider_directory.endswith("_provider"): continue elif provider_directory in blacklisted_providers: continue # import it try: module = importlib.import_module( f"keep.providers.{provider_directory}.{provider_directory}" ) provider_auth_config_class = getattr( module, provider_directory.title().replace("_", "") + "AuthConfig", None, ) provider_type = provider_directory.replace("_provider", "") provider_class = ProvidersFactory.get_provider_class(provider_type) scopes = ( provider_class.PROVIDER_SCOPES if issubclass(provider_class, BaseProvider) else [] ) can_setup_webhook = ( issubclass(provider_class, BaseProvider) and provider_class.__dict__.get("setup_webhook") is not None ) or ( issubclass(provider_class, BaseIncidentProvider) and provider_class.__dict__.get("setup_incident_webhook") is not None ) webhook_required = provider_class.WEBHOOK_INSTALLATION_REQUIRED supports_webhook = ( issubclass(provider_class, BaseProvider) and provider_class.__dict__.get("webhook_template") is not None ) can_notify = ( issubclass(provider_class, 
BaseProvider) and provider_class.__dict__.get("_notify") is not None ) notify_params = ( None if not can_notify else get_method_parameters_safe( list( dict( inspect.signature( provider_class.__dict__.get("_notify") ).parameters ).keys() ) ) ) can_query = ( issubclass(provider_class, BaseProvider) and provider_class.__dict__.get("_query") is not None ) query_params = ( None if not can_query else get_method_parameters_safe( list( dict( inspect.signature( provider_class.__dict__.get("_query") ).parameters ).keys() ) ) ) config = {} if provider_auth_config_class: for field in fields(provider_auth_config_class): config[field.name] = dict(field.metadata) if field.default is not None: config[field.name]["default"] = field.default provider_description = provider_class.__dict__.get( "provider_description" ) oauth2_url = provider_class.__dict__.get("OAUTH2_URL") docs = provider_class.__doc__ can_fetch_alerts = ( issubclass(provider_class, BaseProvider) and provider_class.__dict__.get("_get_alerts") is not None ) can_fetch_topology = issubclass(provider_class, BaseTopologyProvider) can_fetch_incidents = issubclass(provider_class, BaseIncidentProvider) pulling_available = ( can_fetch_alerts or can_fetch_topology or can_fetch_incidents ) provider_tags = set(provider_class.PROVIDER_TAGS) if can_fetch_topology: provider_tags.add("topology") if can_query and "data" not in provider_tags: provider_tags.add("data") if ( supports_webhook or can_setup_webhook and "alert" not in provider_tags ): provider_tags.add("alert") if can_notify and "ticketing" not in provider_tags: provider_tags.add("messaging") if can_fetch_incidents and "incident" not in provider_tags: provider_tags.add("incident") provider_tags = list(provider_tags) try: provider_methods = ProvidersFactory.__get_methods(provider_class) except Exception as e: logger.warning( f"Could not get provider {provider_directory} methods. 
({str(e)})" ) provider_methods = [] # if the provider has a PROVIDER_DISPLAY_NAME, use it, otherwise use the provider type provider_display_name = getattr( provider_class, "PROVIDER_DISPLAY_NAME", provider_type, ) # Load alert examples if available try: alert_example = provider_class.simulate_alert() # not all providers have this method (yet ^^) except Exception: alert_example = None # Add default fingerprint fields if available if hasattr(provider_class, "FINGERPRINT_FIELDS"): default_fingerprint_fields = provider_class.FINGERPRINT_FIELDS else: default_fingerprint_fields = [] providers.append( Provider( type=provider_type, display_name=provider_display_name, config=config, can_notify=can_notify, can_query=can_query, notify_params=notify_params, query_params=query_params, can_setup_webhook=can_setup_webhook, webhook_required=webhook_required, supports_webhook=supports_webhook, provider_description=provider_description, oauth2_url=oauth2_url, scopes=scopes, docs=docs, methods=provider_methods, tags=provider_tags, alertExample=alert_example, default_fingerprint_fields=default_fingerprint_fields, categories=provider_class.PROVIDER_CATEGORY, coming_soon=provider_class.PROVIDER_COMING_SOON, health=provider_class.has_health_report(), pulling_available=pulling_available, # pulling can't be enabled if it's not available pulling_enabled=pulling_available, ) ) except ModuleNotFoundError: logger.error( f"Cannot import provider {provider_directory}, module not found." ) continue # for some providers that depends on grpc like cilium provider, this might fail on imports not from Keep (such as the docs script) except TypeError as e: logger.warning( f"Cannot import provider {provider_directory}, unexpected error. 
@staticmethod
def get_installed_providers(
    tenant_id: str,
    all_providers: list[Provider] | None = None,
    include_details: bool = True,
    override_readonly: bool = False,
) -> list[Provider]:
    """
    Build the list of providers installed for a tenant.

    Each installed provider row is matched by type against `all_providers`,
    copied, and filled with the installation data from the row. Rows whose
    type is unknown, or whose details cannot be built, are logged and skipped.

    Args:
        tenant_id (str): The tenant id.
        all_providers (list[Provider] | None): Known provider definitions;
            loaded with `get_all_providers()` when None.
        include_details (bool): When True, the provider secret is read and
            merged into the details.
        override_readonly (bool): When True, authentication values are not
            masked even if READ_ONLY_MODE is on.

    Returns:
        list[Provider]: The installed providers.
    """
    if all_providers is None:
        all_providers = ProvidersFactory.get_all_providers()

    installed_rows = get_installed_providers(tenant_id)
    result = []
    context_manager = ContextManager(tenant_id=tenant_id)
    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    mask_authentication = READ_ONLY_MODE and not override_readonly

    for p in installed_rows:
        definition: Provider | None = next(
            (candidate for candidate in all_providers if candidate.type == p.type),
            None,
        )
        if definition is None:
            logger.warning(f"Installed provider {p.type} does not exist anymore?")
            continue

        provider_copy = definition.copy()
        provider_copy.id = p.id
        provider_copy.installed_by = p.installed_by
        provider_copy.installation_time = p.installation_time
        provider_copy.last_pull_time = p.last_pull_time
        provider_copy.provisioned = p.provisioned
        provider_copy.pulling_enabled = p.pulling_enabled
        provider_copy.installed = True
        provider_copy.provider_metadata = p.provider_metadata

        try:
            details = {"name": p.name}
            if include_details:
                secret = secret_manager.read_secret(
                    secret_name=p.configuration_key, is_json=True
                )
                details.update(secret)
            if mask_authentication and "authentication" in details:
                # Only string values are kept, each replaced by "demo".
                details["authentication"] = {
                    key: "demo"
                    for key in details["authentication"]
                    if isinstance(details["authentication"][key], str)
                }
        # Somehow the provider is installed but the secret is missing, probably bug in deletion
        # TODO: solve its root cause
        except Exception as e:
            logger.warning(
                f"Could not get provider {provider_copy.id} auth config from secret manager: {e}"
            )
            continue

        provider_copy.details = details
        provider_copy.validatedScopes = p.validatedScopes
        result.append(provider_copy)

    return result
-> list[Provider]: # get the list of all providers that consume events installed_consumer_providers = get_consumer_providers() initialized_consumer_providers = [] for provider in installed_consumer_providers: try: provider_class = ProvidersFactory.get_installed_provider( tenant_id=provider.tenant_id, provider_id=provider.id, provider_type=provider.type, ) initialized_consumer_providers.append(provider_class) except Exception: logger.warning( f"Could not get provider {provider.id} auth config from secret manager" ) continue return initialized_consumer_providers @staticmethod def get_provider_config( tenant_id: str, provider_id: str, provider_type: str, context_manager: ContextManager | None = None, ) -> dict: context_manager = context_manager or ContextManager(tenant_id=tenant_id) secret_manager = SecretManagerFactory.get_secret_manager(context_manager) provider_from_db = get_provider_by_type_and_id( tenant_id=tenant_id, provider_id=provider_id, provider_type=provider_type ) logger.info( f"Getting provider secret for provider id: {provider_from_db.id}," f" configuration key: {provider_from_db.configuration_key}," f" secret manager type: {secret_manager.__class__.__name__}" ) return secret_manager.read_secret( secret_name=provider_from_db.configuration_key, is_json=True, ) @staticmethod def get_installed_provider( tenant_id: str, provider_id: str, provider_type: str ) -> BaseProvider: """ Get the instantiated provider class according to the provider type. Args: tenant_id (str): The tenant id. provider_id (str): The provider id. provider_type (str): The provider type. Returns: BaseProvider: The instantiated provider class. 
@staticmethod
def get_linked_providers(tenant_id: str) -> list[Provider]:
    """
    Get the linked providers.

    Each row returned by the database helper is read positionally as
    (provider type, provider id, last alert received). A row whose type is
    not among the available providers is represented by a minimal custom
    `Provider` tagged "alert".

    Args:
        tenant_id (str): The tenant id.

    Returns:
        list: The linked providers.
    """
    rows = get_linked_providers(tenant_id)
    available_providers = ProvidersFactory.get_all_providers()

    result = []
    for row in rows:
        provider_type = row[0]
        provider_id = row[1]
        last_alert_received = row[2]

        match: Provider = next(
            (
                candidate
                for candidate in available_providers
                if candidate.type == provider_type
            ),
            None,
        )
        if not match:
            # It means it's a custom provider
            match = Provider(
                display_name=provider_type,
                type=provider_type,
                can_notify=False,
                can_query=False,
                tags=["alert"],
            )

        # Copy so the available-providers entry itself is not modified.
        linked = match.copy()
        linked.linked = True
        linked.id = provider_id
        if last_alert_received:
            as_utc = last_alert_received.replace(tzinfo=datetime.timezone.utc)
            linked.last_alert_received = as_utc.isoformat()
        result.append(linked)

    return result
# Custom JSON encoder for Provider objects, to be used for providers cache
class ProviderEncoder(json.JSONEncoder):
    """
    JSON encoder for objects the default encoder cannot serialize.

    - `ProviderScope` instances are encoded as their attribute dict without
      the `__pydantic_initialised__` entry.
    - dataclasses' `_MISSING_TYPE` sentinel is encoded as `null`.
    - Any other object is encoded through its `dict()` method.
    """

    def default(self, o):
        if isinstance(o, ProviderScope):
            # Build a new dict instead of popping from `o.__dict__`, so that
            # serializing a scope does not modify the scope object itself.
            return {
                key: value
                for key, value in o.__dict__.items()
                if key != "__pydantic_initialised__"
            }
        elif isinstance(o, _MISSING_TYPE):
            return None
        return o.dict()
@staticmethod
def get_installed_providers(
    tenant_id: str, include_details: bool = True
) -> List[ProviderModel]:
    """
    Return the providers installed for a tenant.

    Delegates to `ProvidersFactory.get_installed_providers`, passing it the
    full provider list obtained from `ProvidersService.get_all_providers`.

    Args:
        tenant_id (str): The tenant id.
        include_details (bool): Forwarded to the factory unchanged.

    Returns:
        List[ProviderModel]: The installed providers.
    """
    return ProvidersFactory.get_installed_providers(
        tenant_id,
        ProvidersService.get_all_providers(),
        include_details,
    )
@staticmethod
def prepare_provider(
    provider_id: str,
    provider_name: str,
    provider_type: str,
    provider_config: Dict[str, Any],
    validate_scopes: bool = True,
) -> "BaseProvider":
    """
    Instantiate a provider without registering it for a tenant.

    The provider is created with a context manager whose tenant id is None.
    Its configuration is then written to the secret manager under a freshly
    generated name and deleted again right away; nothing is stored in the
    database.
    NOTE(review): the write/delete round trip looks like a check that the
    secret manager accepts this configuration — confirm.

    Args:
        provider_id (str): Id handed to the provider instance.
        provider_name (str): Stored as `name` in the provider config.
        provider_type (str): Provider type used to resolve the class.
        provider_config (Dict[str, Any]): Stored as `authentication`.
        validate_scopes (bool): When True, run
            `ProvidersService.validate_scopes` on the new instance.

    Returns:
        BaseProvider: The instantiated provider (not a dict, as the previous
        annotation stated).

    Raises:
        HTTPException: 400 when the provider cannot be instantiated; scope
            validation may raise its own HTTPException.
    """
    provider_unique_id = uuid.uuid4().hex
    logger.info(
        "Installing provider",
        extra={
            "provider_id": provider_id,
            "provider_type": provider_type,
        },
    )

    config = {
        "authentication": provider_config,
        "name": provider_name,
    }
    tenant_id = None
    context_manager = ContextManager(tenant_id=tenant_id)
    try:
        provider = ProvidersFactory.get_provider(
            context_manager, provider_id, provider_type, config
        )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

    if validate_scopes:
        ProvidersService.validate_scopes(provider)

    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    secret_name = f"{tenant_id}_{provider_type}_{provider_unique_id}"
    secret_manager.write_secret(
        secret_name=secret_name,
        secret_value=json.dumps(config),
    )

    try:
        secret_manager.delete_secret(
            secret_name=secret_name,
        )
        logger.warning("Secret deleted")
    except Exception:
        # Best effort: a failed clean-up is logged and does not fail the call.
        logger.exception("Failed to delete the secret")

    return provider
@staticmethod
def update_provider(
    tenant_id: str,
    provider_id: str,
    provider_info: Dict[str, Any],
    updated_by: str,
    session: Optional[Session] = None,
    allow_provisioned=False,
) -> Dict[str, Any]:
    """
    Update the configuration of an installed provider.

    The provider is re-instantiated with the new authentication data, its
    scopes are re-validated, the secret is overwritten and the database row
    is updated.

    Args:
        tenant_id (str): The tenant id.
        provider_id (str): Id of the installed provider.
        provider_info (Dict[str, Any]): New authentication values. May also
            carry `pulling_enabled` (bool or "true"/"false" string), which is
            not stored as part of the authentication. This dict is not
            modified.
        updated_by (str): Stored as the provider's `installed_by`.
        session (Optional[Session]): Existing session to reuse, if any.
        allow_provisioned (bool): Allow updating a provisioned provider.

    Returns:
        Dict[str, Any]: `details` (the stored config) and `validatedScopes`.

    Raises:
        HTTPException: 404 when the provider does not exist, 403 when it is
            provisioned and `allow_provisioned` is False, 400 when the
            provider cannot be instantiated with the new configuration.
    """
    with existed_or_new_session(session) as session:
        provider = session.exec(
            select(Provider).where(
                (Provider.tenant_id == tenant_id) & (Provider.id == provider_id)
            )
        ).one_or_none()

        if not provider:
            raise HTTPException(404, detail="Provider not found")

        if provider.provisioned and not allow_provisioned:
            raise HTTPException(403, detail="Cannot update a provisioned provider")

        # Work on a shallow copy so removing `pulling_enabled` does not alter
        # the dict owned by the caller.
        authentication = dict(provider_info)
        pulling_enabled = authentication.pop("pulling_enabled", True)

        # if pulling_enabled is "true" or "false" cast it to boolean
        if isinstance(pulling_enabled, str):
            pulling_enabled = pulling_enabled.lower() == "true"

        provider_config = {
            "authentication": authentication,
            "name": provider.name,
        }

        context_manager = ContextManager(tenant_id=tenant_id)
        try:
            provider_instance = ProvidersFactory.get_provider(
                context_manager, provider_id, provider.type, provider_config
            )
        except Exception as e:
            raise HTTPException(status_code=400, detail=str(e))

        validated_scopes = provider_instance.validate_scopes()

        secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
        secret_manager.write_secret(
            secret_name=provider.configuration_key,
            secret_value=json.dumps(provider_config),
        )

        provider.installed_by = updated_by
        provider.validatedScopes = validated_scopes
        provider.pulling_enabled = pulling_enabled
        session.commit()

        logger.info(
            "Provider updated",
            extra={
                "provider_id": provider_id,
                "provider_type": provider.type,
                "tenant_id": tenant_id,
            },
        )

        return {
            "details": provider_config,
            "validatedScopes": validated_scopes,
        }
@staticmethod
def install_webhook(
    tenant_id: str,
    provider_type: str,
    provider_id: str,
    session: Optional[Session] = None,
) -> bool:
    """
    Install Keep's webhook(s) on an installed provider.

    Calls `setup_incident_webhook` and/or `setup_webhook` when the provider
    class itself defines them. Any extra configuration returned by the hooks
    is merged into the provider's stored authentication and written back to
    the secret manager.

    Args:
        tenant_id (str): The tenant id.
        provider_type (str): The provider type.
        provider_id (str): The installed provider id.
        session (Optional[Session]): Existing session to reuse, if any.

    Returns:
        bool: False when the provider class defines neither hook, True after
        the hooks have run.

    Raises:
        HTTPException: 400 when a hook or the secret update fails.
    """
    context_manager = ContextManager(
        tenant_id=tenant_id,
        workflow_id="",  # this is not in a workflow scope
    )
    secret_manager = SecretManagerFactory.get_secret_manager(context_manager)
    provider_secret_name = f"{tenant_id}_{provider_type}_{provider_id}"
    provider_config = secret_manager.read_secret(provider_secret_name, is_json=True)
    provider_class = ProvidersFactory.get_provider_class(provider_type)

    # Only hooks defined on the class itself count (inherited ones do not).
    has_incident_hook = (
        provider_class.__dict__.get("setup_incident_webhook") is not None
    )
    has_alert_hook = provider_class.__dict__.get("setup_webhook") is not None

    if not has_incident_hook and not has_alert_hook:
        logger.info(
            "Provider does not support webhook installation",
            extra={
                "provider_type": provider_type,
                "provider_id": provider_id,
                "tenant_id": tenant_id,
            },
        )
        return False

    provider = ProvidersFactory.get_provider(
        context_manager, provider_id, provider_type, provider_config
    )
    api_url = config("KEEP_API_URL")
    keep_webhook_api_url = (
        f"{api_url}/alerts/event/{provider_type}?provider_id={provider_id}"
    )
    keep_webhook_incidents_api_url = (
        f"{api_url}/incidents/event/{provider_type}?provider_id={provider_id}"
    )

    with existed_or_new_session(session) as session:
        webhook_api_key = get_or_create_api_key(
            session=session,
            tenant_id=tenant_id,
            created_by="system",
            unique_api_key_id="webhook",
            system_description="Webhooks API key",
        )

    try:
        # Collect the extra config of both hooks; previously the value
        # returned by setup_incident_webhook was overwritten whenever
        # setup_webhook ran as well.
        extra_config = {}
        if has_incident_hook:
            incident_extra_config = provider.setup_incident_webhook(
                tenant_id, keep_webhook_incidents_api_url, webhook_api_key, True
            )
            if incident_extra_config:
                extra_config.update(incident_extra_config)
        if has_alert_hook:
            alert_extra_config = provider.setup_webhook(
                tenant_id, keep_webhook_api_url, webhook_api_key, True
            )
            if alert_extra_config:
                extra_config.update(alert_extra_config)

        if extra_config:
            provider_config["authentication"].update(extra_config)
            secret_manager.write_secret(
                secret_name=provider_secret_name,
                secret_value=json.dumps(provider_config),
            )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

    return True
@staticmethod
def provision_providers(tenant_id: str):
    """
    Provision providers from a directory or env variable.

    The source is selected by the environment: ``KEEP_PROVIDERS_DIRECTORY``
    (a directory of ``.yaml``/``.yml`` files) or ``KEEP_PROVIDERS`` (parsed by
    ``Parser._parse_providers_from_env``). Providers that are already
    installed are updated, new ones are installed (optionally with their
    webhook), and previously provisioned providers that are no longer listed
    are deleted. When neither variable is set, every provisioned provider is
    deleted.

    Args:
        tenant_id (str): The tenant ID.

    Returns:
        ``[]`` when neither source is configured; ``None`` otherwise.

    Raises:
        Exception: if both the env var and the directory are configured.
        FileNotFoundError: if the configured directory does not exist.
    """
    logger = logging.getLogger(__name__)

    provisioned_providers_dir = os.environ.get("KEEP_PROVIDERS_DIRECTORY")
    provisioned_providers_json = os.environ.get("KEEP_PROVIDERS")

    # Get all existing provisioned providers
    provisioned_providers = get_all_provisioned_providers(tenant_id=tenant_id)

    if not (provisioned_providers_dir or provisioned_providers_json):
        logger.info("No providers for provisioning found")
        if provisioned_providers:
            logger.info("Found existing provisioned providers, deleting them")
            for provider in provisioned_providers:
                logger.info(f"Deprovisioning provider {provider.id}")
                ProvidersService.delete_provider(
                    tenant_id=tenant_id,
                    provider_id=provider.id,
                    allow_provisioned=True,
                )
                logger.info(f"Provider {provider.id} deprovisioned successfully")
        return []

    if (
        provisioned_providers_dir is not None
        and provisioned_providers_json is not None
    ):
        raise Exception(
            "Providers provisioned via env var and directory at the same time. Please choose one."
        )

    if provisioned_providers_dir is not None and not os.path.isdir(
        provisioned_providers_dir
    ):
        raise FileNotFoundError(
            f"Directory {provisioned_providers_dir} does not exist"
        )

    ### Provisioning from env var
    if provisioned_providers_json is not None:
        # Avoid circular import
        from keep.parser.parser import Parser

        parser = Parser()
        context_manager = ContextManager(tenant_id=tenant_id)
        parser._parse_providers_from_env(context_manager)
        env_providers = context_manager.providers_context

        # Un-provisioning other providers.
        for provider in provisioned_providers:
            if provider.name not in env_providers:
                try:
                    logger.info(f"Deleting provider {provider.name}")
                    ProvidersService.delete_provider(
                        tenant_id=tenant_id,
                        provider_id=provider.id,
                        allow_provisioned=True,
                    )
                except Exception as e:
                    logger.exception(
                        "Failed to delete provisioned provider that does not exist in the env var",
                        extra={"exception": e},
                    )

        # Loop-invariant, so read it once.
        # NOTE(review): the default here is "true" while directory mode below
        # defaults to "false" — confirm the asymmetry is intentional.
        install_webhook_env = (
            os.environ.get("KEEP_PROVIDERS_INSTALL_WEBHOOKS", "true").lower()
            == "true"
        )

        for provider_name, provider_config in env_providers.items():
            provider_info = provider_config.get("authentication", {})
            install_webhook = provider_config.get(
                "install_webhook", install_webhook_env
            )
            logger.info(f"Provisioning provider {provider_name}")

            # The update path now lives inside the try as well, so a single
            # provider that fails to update no longer aborts the provisioning
            # of every provider after it (directory mode already behaves so).
            try:
                if ProvidersService.is_provider_installed(tenant_id, provider_name):
                    logger.info(
                        f"Provider {provider_name} already installed. Updating it"
                    )
                    installed_provider = get_provider_by_name(
                        tenant_id=tenant_id, provider_name=provider_name
                    )
                    ProvidersService.update_provider(
                        tenant_id=tenant_id,
                        provider_id=installed_provider.id,
                        provider_info=provider_info,
                        updated_by="system",
                        allow_provisioned=True,
                    )
                    continue

                logger.info(f"Installing provider {provider_name}")
                installed_provider = ProvidersService.install_provider(
                    tenant_id=tenant_id,
                    installed_by="system",
                    provider_id=provider_config["type"],
                    provider_name=provider_name,
                    provider_type=provider_config["type"],
                    provider_config=provider_info,
                    provisioned=True,
                    validate_scopes=False,
                )

                if install_webhook:
                    # A webhook failure must not undo a successful install.
                    try:
                        ProvidersService.install_webhook(
                            tenant_id=tenant_id,
                            provider_type=installed_provider["type"],
                            provider_id=installed_provider["id"],
                        )
                        logger.info(f"Webhook installed for {provider_name}")
                    except Exception as e:
                        logger.error(
                            "Error installing webhook for provider from env var",
                            extra={"provider_name": provider_name, "exception": e},
                        )
                else:
                    logger.info(
                        f"Install webhook disabled for {provider_name}; skipping."
                    )

                logger.info(f"Provider {provider_name} provisioned successfully")
            except Exception as e:
                logger.error(
                    "Error provisioning provider from env var",
                    extra={"exception": e},
                )

    ### Provisioning from the directory
    if provisioned_providers_dir is not None:
        installed_providers = []

        # Loop-invariant, so read it once (defaults to "false" in this mode).
        install_webhook_env = (
            os.environ.get("KEEP_PROVIDERS_INSTALL_WEBHOOKS", "false").lower()
            == "true"
        )

        for file in os.listdir(provisioned_providers_dir):
            if file.endswith((".yaml", ".yml")):
                logger.info(f"Provisioning provider from {file}")
                provider_path = os.path.join(provisioned_providers_dir, file)

                try:
                    with open(provider_path, "r") as yaml_file:
                        provider_yaml = cyaml.safe_load(yaml_file.read())
                        provider_name = provider_yaml["name"]
                        provider_type = provider_yaml["type"]
                        provider_config = provider_yaml.get("authentication", {})
                        install_webhook = provider_yaml.get(
                            "install_webhook", install_webhook_env
                        )

                        # Skip if already installed
                        if ProvidersService.is_provider_installed(
                            tenant_id, provider_name
                        ):
                            logger.info(
                                f"Provider {provider_name} already installed. Updating it"
                            )
                            # Add to installed providers list. This is necessary, otherwise the provider
                            # will be un-provisioned on the process un-provisioning outdated providers.
                            installed_providers.append(provider_name)
                            installed_provider = get_provider_by_name(
                                tenant_id=tenant_id, provider_name=provider_name
                            )
                            ProvidersService.update_provider(
                                tenant_id=tenant_id,
                                provider_id=installed_provider.id,
                                provider_info=provider_config,
                                updated_by="system",
                                allow_provisioned=True,
                            )
                            # NOTE(review): this also skips the deduplication
                            # rules below for already-installed providers —
                            # confirm that is intended.
                            continue

                        logger.info(f"Installing provider {provider_name}")
                        installed_provider = ProvidersService.install_provider(
                            tenant_id=tenant_id,
                            installed_by="system",
                            provider_id=provider_type,
                            provider_name=provider_name,
                            provider_type=provider_type,
                            provider_config=provider_config,
                            provisioned=True,
                            validate_scopes=False,
                        )

                        if install_webhook:
                            try:
                                ProvidersService.install_webhook(
                                    tenant_id=tenant_id,
                                    provider_type=installed_provider["type"],
                                    provider_id=installed_provider["id"],
                                )
                                logger.info(f"Webhook installed for {provider_name}")
                            except Exception as e:
                                logger.error(
                                    "Error installing webhook for provider from directory",
                                    extra={
                                        "provider_name": provider_name,
                                        "exception": e,
                                    },
                                )
                        else:
                            logger.info(
                                f"Install webhook disabled for {provider_name}; skipping."
                            )

                        logger.info(
                            f"Provider {provider_name} provisioned successfully"
                        )
                        installed_providers.append(provider_name)

                        # Configure deduplication rules
                        deduplication_rules = provider_yaml.get(
                            "deduplication_rules", {}
                        )
                        if deduplication_rules:
                            logger.info(
                                f"Provisioning deduplication rules for provider {provider_name}"
                            )
                            deduplication_rules_dict: dict[str, dict] = {}
                            for (
                                rule_name,
                                rule_config,
                            ) in deduplication_rules.items():
                                logger.info(
                                    f"Provisioning deduplication rule {rule_name}"
                                )
                                rule_config["name"] = rule_name
                                rule_config["provider_name"] = provider_name
                                rule_config["provider_type"] = provider_type
                                deduplication_rules_dict[rule_name] = rule_config

                            # Provision deduplication rules
                            provision_deduplication_rules(
                                deduplication_rules=deduplication_rules_dict,
                                tenant_id=tenant_id,
                            )
                except Exception as e:
                    logger.error(
                        "Error provisioning provider from directory",
                        extra={"exception": e},
                    )

        # Un-provisioning other providers.
        for provider in provisioned_providers:
            if provider.name not in installed_providers:
                logger.info(
                    f"Deprovisioning provider {provider.name} as its file no longer exists or is outside the providers directory"
                )
                # Guarded like the env-var path: one provider that cannot be
                # deleted should not stop the remaining ones from being cleaned up.
                try:
                    ProvidersService.delete_provider(
                        tenant_id=tenant_id,
                        provider_id=provider.id,
                        allow_provisioned=True,
                    )
                    logger.info(
                        f"Provider {provider.name} deprovisioned successfully"
                    )
                except Exception as e:
                    logger.exception(
                        "Failed to delete provisioned provider that no longer exists in the providers directory",
                        extra={"exception": e},
                    )


@staticmethod
def get_provider_logs(
    tenant_id: str, provider_id: str
) -> List[ProviderExecutionLog]:
    """
    Return the stored execution logs of a provider.

    Raises:
        HTTPException: 404 when ``KEEP_STORE_PROVIDER_LOGS`` is not enabled.
    """
    if not config("KEEP_STORE_PROVIDER_LOGS", cast=bool, default=False):
        raise HTTPException(404, detail="Provider logs are not enabled")

    return get_provider_logs(tenant_id, provider_id)
class PushoverProvider(BaseProvider):
    """Send alert message to Pushover."""

    PROVIDER_DISPLAY_NAME = "Pushover"
    PROVIDER_CATEGORY = ["Collaboration"]

    PUSHOVER_MESSAGES_URL = "https://api.pushover.net/1/messages.json"
    # Upper bound for the HTTP call so a stalled connection cannot block the
    # caller forever (requests has no timeout by default).
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build the typed auth config from the raw ``authentication`` dict."""
        self.authentication_config = PushoverProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(self, message=None, **kwargs: dict):
        """
        Notify alert message to Pushover using the Pushover API
        https://support.pushover.net/i44-example-code-and-pushover-libraries#python

        Args:
            message (str): The content of the message.
            **kwargs: Optional ``sound`` (default ``"pushover"``), ``priority``
                (default ``0``, coerced to ``int``), and ``retry`` / ``expire``
                (defaults ``60`` / ``3600``), which are only sent when
                ``priority`` is ``2``.

        Raises:
            requests.HTTPError: if Pushover answers with an error status.
            requests.Timeout: if Pushover does not answer in time.
        """
        self.logger.debug("Notifying alert message to Pushover")

        priority = int(kwargs.get("priority", 0))
        payload = {
            "token": self.authentication_config.token,
            "user": self.authentication_config.user_key,
            "message": message,
            "sound": kwargs.get("sound", "pushover"),
            "priority": priority,
        }
        if priority == 2:
            # retry/expire are only attached for priority 2 messages.
            payload["retry"] = kwargs.get("retry", 60)
            payload["expire"] = kwargs.get("expire", 3600)

        resp = requests.post(
            self.PUSHOVER_MESSAGES_URL,
            data=payload,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        resp.raise_for_status()

        self.logger.debug("Alert message notified to Pushover")
""" from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_config_exception import ProviderConfigException from keep.exceptions.provider_exception import ProviderException from keep.iohandler.iohandler import IOHandler from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig class PythonProvider(BaseProvider): """Python provider eval python code to get results""" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self.io_handler = IOHandler(context_manager=context_manager) def validate_config(self): pass def _query(self, code: str = "", imports: str = "", **kwargs): """Python provider eval python code to get results Returns: _type_: _description_ """ modules = imports loaded_modules = {} if modules: for module in modules.split(","): try: imported_module = __import__(module, fromlist=[""]) # Add all public attributes from the module to loaded_modules for attr_name in dir(imported_module): if not attr_name.startswith("_"): loaded_modules[attr_name] = getattr( imported_module, attr_name ) # Add the module itself too.. loaded_modules[module] = imported_module except Exception: raise ProviderConfigException( f"{self.__class__.__name__} failed to import library: {module}", provider_id=self.provider_id, ) parsed_code = self.io_handler.parse(code) try: output = eval(parsed_code, loaded_modules) except Exception as e: raise ProviderException(e) return output def dispose(self): """ No need to dispose of anything, so just do nothing. 
""" pass if __name__ == "__main__": # Example usage # Output debug messages import logging from keep.providers.providers_factory import ProvidersFactory logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) python_provider = ProvidersFactory.get_provider( context_manager=context_manager, provider_id="python-keephq", provider_type="python", provider_config={"authentication": {}}, ) # Example query result = python_provider._query(code="1 + 1", imports="keep.api.models.alert") print(result) # Output: 2 ================================================ FILE: keep/providers/quickchart_provider/__init__.py ================================================ ================================================ FILE: keep/providers/quickchart_provider/quickchart_provider.py ================================================ # builtins import dataclasses import datetime from collections import defaultdict import pydantic # third-parties from quickchart import QuickChart # internals from keep.api.core.db import get_alerts_by_fingerprint from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig from keep.providers.providers_factory import ProvidersFactory def get_date_key(date: datetime.datetime, time_unit: str) -> str: if isinstance(date, str): date = datetime.datetime.fromisoformat(date) if time_unit == "Minutes": return f"{date.hour}:{date.minute}:{date.second}" elif time_unit == "Hours": return f"{date.hour}:{date.minute}" else: return f"{date.day}/{date.month}/{date.year}" @pydantic.dataclasses.dataclass class QuickchartProviderAuthConfig: api_key: str = dataclasses.field( metadata={ "required": False, "description": "Quickchart API Key", "sensitive": True, }, default=None, ) 
class QuickchartProvider(BaseProvider):
    """Render the history of an alert fingerprint as QuickChart images."""

    PROVIDER_DISPLAY_NAME = "QuickChart"
    PROVIDER_CATEGORY = ["Developer Tools"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build the typed auth config from the raw ``authentication`` dict."""
        self.authentication_config = QuickchartProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        pass

    def _notify(
        self,
        fingerprint: str,
        status: str | None = None,
        chartConfig: dict | None = None,
    ) -> dict:
        """
        Chart how often the alert with ``fingerprint`` was received over time.

        Args:
            fingerprint: Fingerprint of the alerts to load.
            status: Optional status filter passed to the alerts query.
            chartConfig: Optional QuickChart config that replaces the default
                line chart.

        Returns:
            ``{"chart_url": ..., "counter_url": ...}``. Both values are empty
            strings when no alerts were found, so the shape is the same on
            every path.
        """
        db_alerts = get_alerts_by_fingerprint(
            tenant_id=self.context_manager.tenant_id,
            fingerprint=fingerprint,
            limit=False,
            status=status,
        )
        alerts = convert_db_alerts_to_dto_alerts(db_alerts)

        if not alerts:
            self.logger.warning(
                "No alerts found for this fingerprint",
                extra={
                    "tenant_id": self.context_manager.tenant_id,
                    "fingerprint": fingerprint,
                },
            )
            return {"chart_url": "", "counter_url": ""}

        # Parse every timestamp once instead of once for min and once for max.
        received_times = [
            datetime.datetime.fromisoformat(alert.lastReceived) for alert in alerts
        ]
        min_last_received = min(received_times)
        max_last_received = max(received_times)

        title = f"First: {str(min_last_received)} | Last: {str(max_last_received)} | Total: {len(alerts)}"

        time_difference = (
            max_last_received - min_last_received
        ).total_seconds() * 1000  # Convert to milliseconds
        time_unit = "Days"
        if time_difference < 3600000:  # less than one hour
            time_unit = "Minutes"
        elif time_difference < 86400000:  # less than one day
            time_unit = "Hours"

        categories_by_status = []
        raw_chart_data = defaultdict(dict)

        for alert in reversed(alerts):
            date_key = get_date_key(alert.lastReceived, time_unit)
            # Separate name: do not shadow the `status` filter argument.
            alert_status = alert.status
            bucket = raw_chart_data[date_key]
            bucket[alert_status] = bucket.get(alert_status, 0) + 1

            if alert_status not in categories_by_status:
                categories_by_status.append(alert_status)

        chart_data = [{"date": key, **value} for key, value in raw_chart_data.items()]

        # Generate chart using QuickChart
        return self.generate_chart_image(
            chart_data, categories_by_status, len(alerts), title, chartConfig
        )

    def __get_total_alerts_gaugae(self, counter: int):
        """Return the short URL of a radial gauge showing ``counter``."""
        qc = QuickChart()

        if self.authentication_config.api_key:
            qc.key = self.authentication_config.api_key

        qc.width = 500
        qc.height = 300
        qc.config = {
            "type": "radialGauge",
            "data": {"datasets": [{"data": [counter]}]},
            "options": {
                "centerArea": {"fontSize": 25, "fontWeight": "bold"},
            },
        }
        chart_url = qc.get_short_url()
        return chart_url

    def generate_chart_image(
        self,
        chart_data,
        categories_by_status,
        total_alerts: int,
        title: str,
        config: dict | None = None,
    ) -> dict:
        """
        Create the line chart and the total-alerts gauge.

        Args:
            chart_data: One dict per bucket: ``{"date": label, <status>: count}``.
            categories_by_status: Statuses to draw, one dataset each.
            total_alerts: Number shown in the gauge.
            title: Chart title.
            config: Optional QuickChart config used instead of the default.

        Returns:
            ``{"chart_url": ..., "counter_url": ...}`` with QuickChart short URLs.
        """
        qc = QuickChart()

        if self.authentication_config.api_key:
            qc.key = self.authentication_config.api_key

        qc.width = 800
        qc.height = 400
        qc.config = config or {
            "type": "line",
            "data": {
                "labels": [data["date"] for data in chart_data],
                "datasets": [
                    {
                        "fill": False,
                        "label": category,
                        "lineTension": 0.4,
                        "borderWidth": 3,
                        # A status that never occurred in a bucket counts as 0.
                        "data": [data.get(category, 0) for data in chart_data],
                    }
                    for category in categories_by_status
                ],
            },
            "options": {
                "title": {
                    "display": True,
                    "position": "top",
                    "fontSize": 14,
                    "padding": 10,
                    "text": title,
                },
                "scales": {
                    "xAxes": [{"type": "category"}],
                    "yAxes": [{"ticks": {"beginAtZero": True}}],
                },
            },
        }
        chart_url = qc.get_short_url()
        counter_url = self.__get_total_alerts_gaugae(total_alerts)

        return {"chart_url": chart_url, "counter_url": counter_url}
class RedmineProvider(BaseProvider):
    """Enrich alerts with Redmine tickets."""

    PROVIDER_DISPLAY_NAME = "Redmine"
    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="Authenticated with Redmine API",
            mandatory=True,
            alias="Redmine API Access Key",
        ),
    ]
    PROVIDER_TAGS = ["ticketing"]
    PROVIDER_CATEGORY = ["Ticketing"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        # Cache for the resolved base URL; set before super().__init__ runs.
        self._host = None
        super().__init__(context_manager, provider_id, config)

    def validate_scopes(self):
        """
        Validate that the provider has the required scopes.

        Calls ``/users/current.json`` with the configured API key.

        Returns:
            ``{"authenticated": True}`` on HTTP 200, otherwise
            ``{"authenticated": {"status_code": ..., "error": ...}}``.
            Network-level failures are reported the same way (with
            ``status_code`` ``None``) instead of propagating.
        """
        # first, validate user/api token are correct:
        try:
            resp = requests.get(
                f"{self.__redmine_url}/users/current.json",
                headers=self.__get_headers(),
            )
            resp.raise_for_status()
        except HTTPError as e:
            self.logger.error(
                f"HTTPError while validating scope for {self.provider_id}",
                extra={"error": str(e)},
            )
            status_code = e.response.status_code if e.response is not None else None
            return {"authenticated": {"status_code": status_code, "error": str(e)}}
        except requests.exceptions.RequestException as e:
            # DNS failure, refused connection, timeout, ...: no response exists.
            self.logger.error(
                f"Request error while validating scope for {self.provider_id}",
                extra={"error": str(e)},
            )
            return {"authenticated": {"status_code": None, "error": str(e)}}

        if resp.status_code == 200:
            return {"authenticated": True}

        # A non-error status other than 200. The body may not be JSON, and its
        # keys must not be passed directly as `extra` (they could clash with
        # reserved LogRecord attributes), so it is wrapped under one key.
        try:
            error_body = resp.json()
        except ValueError:
            error_body = resp.text
        self.logger.error(
            f"Failed to validate scope for {self.provider_id}",
            extra={"response": error_body},
        )
        return {
            "authenticated": {
                "status_code": resp.status_code,
                "error": error_body,
            }
        }

    def validate_config(self):
        """Build the typed auth config from the raw ``authentication`` dict."""
        self.authentication_config = RedmineProviderAuthConfig(
            **self.config.authentication
        )

    @property
    def __redmine_url(self):
        """Base URL of the Redmine instance, without a trailing slash."""
        # if not the first time, return the cached host
        if self._host:
            return self._host.rstrip("/")

        # str() so the checks below work whether the validated host is a plain
        # string or a URL object.
        host = str(self.authentication_config.host)

        # if the user explicitly supplied a host with http/https, use it
        if host.startswith(("http://", "https://")):
            self._host = host
            return host.rstrip("/")

        # otherwise, try to use https:
        try:
            # Certificate verification is off on purpose: this only probes
            # whether the host speaks TLS at all.
            requests.get(
                f"https://{host}",
                verify=False,
            )
            self.logger.debug("Using https")
            self._host = f"https://{host}"
            return self._host.rstrip("/")
        except requests.exceptions.SSLError:
            self.logger.debug("Using http")
            self._host = f"http://{host}"
            return self._host.rstrip("/")
        # should happen only if the user supplied invalid host, so just let validate_config fail
        except Exception:
            return host.rstrip("/")

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def __get_headers(self):
        """
        Helper method to build the auth header for redmine api requests.
        """
        return {
            "Content-Type": "application/json",
            "X-Redmine-API-Key": self.authentication_config.api_access_key,
        }

    def __build_payload_from_kwargs(self, kwargs: dict):
        """
        Flatten list values into comma-separated strings.

        List items are converted with ``str`` first, so lists of ids (ints)
        work as well as lists of strings. Other values pass through as-is.
        """
        params = dict()
        for param, value in kwargs.items():
            if isinstance(value, list):
                params[param] = ",".join(map(str, value))
            else:
                params[param] = value
        return params

    def _notify(
        self,
        project_id: str,
        subject: str,
        priority_id: str,
        description: str = "",
        **kwargs: dict,
    ):
        """
        Create an issue in Redmine.

        Args:
            project_id: Target project.
            subject: Issue subject.
            priority_id: Issue priority.
            description: Issue description.
            **kwargs: Additional issue fields sent as-is. The explicit
                arguments above win over same-named entries in ``kwargs``.

        Returns:
            The parsed JSON body of Redmine's response.

        Raises:
            Exception: if Redmine answers with an error status.
        """
        self.logger.info("Creating an issue in redmine")
        payload = self.__build_payload_from_kwargs(
            kwargs={
                **kwargs,
                "subject": subject,
                "description": description,
                "project_id": project_id,
                "priority_id": priority_id,
            }
        )
        resp = requests.post(
            f"{self.__redmine_url}/issues.json",
            headers=self.__get_headers(),
            json={"issue": payload},
        )
        try:
            resp.raise_for_status()
        except HTTPError as e:
            self.logger.error("Error While creating Redmine Issue")
            raise Exception(f"Failed to create issue: {str(e)}") from e
        self.logger.info(
            "Successfully created a Redmine Issue",
            extra={"status_code": resp.status_code},
        )
        return resp.json()
""" import dataclasses import pydantic import requests from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig @pydantic.dataclasses.dataclass class ResendProviderAuthConfig: api_key: str = dataclasses.field( metadata={ "required": True, "description": "Resend API key", "hint": "https://resend.com/api-keys", "sensitive": True, } ) class ResendProvider(BaseProvider): """Send email using the Resend API.""" PROVIDER_DISPLAY_NAME = "Resend" PROVIDER_CATEGORY = ["Collaboration"] RESEND_API_URL = "https://api.resend.com" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): self.authentication_config = ResendProviderAuthConfig( **self.config.authentication ) def _notify(self, _from: str, to: str, subject: str, html: str, **kwargs) -> dict: """ Send an email using the Resend API. Args: _from (str): From email address to (str): To email address subject (str): Email subject html (str): Email body """ self.logger.info( "Sending email using Resend API", extra={ "from": _from, "to": to, "subject": subject, }, ) # until https://github.com/resendlabs/resend-python/pull/37/files is merged response = requests.post( f"{self.RESEND_API_URL}/emails", json={ "from": _from, "to": to, "subject": subject, "html": html, **kwargs, }, headers={ "Accept": "application/json", "Authorization": f"Bearer {self.authentication_config.api_key}", }, ) if response.status_code != 200: error = response.json() raise Exception("Failed to send email: " + error["message"]) return response.json() def dispose(self): """ No need to dispose of anything, so just do nothing. 
""" pass if __name__ == "__main__": import os resend_api_key = os.environ.get("RESEND_API_KEY") context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Initalize the provider and provider config config = ProviderConfig( id="resend-test", authentication={"api_key": resend_api_key}, ) provider = ResendProvider(context_manager, provider_id="resend-test", config=config) response = provider.notify( "onboarding@resend.dev", "youremail@gmail.com", "Hello World from Keep!", "Test with HTML", ) print(response) ================================================ FILE: keep/providers/rollbar_provider/rollbar_provider.py ================================================ """ RollbarProvider is a class that allows to install webhooks and get alerts in Rollbar. """ import dataclasses import datetime from typing import List from urllib.parse import urljoin import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class RollbarProviderAuthConfig: """ RollbarProviderAuthConfig is a class that allows to authenticate in Rollbar. 
""" rollbarAccessToken: str = dataclasses.field( metadata={ "required": True, "description": "Project Access Token", "sensitive": True, }, default=None, ) class RollbarProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Rollbar" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="User is Authenticated", ), ] SEVERITIES_MAP = { "warning": AlertSeverity.WARNING, "error": AlertSeverity.HIGH, "info": AlertSeverity.INFO, "critical": AlertSeverity.CRITICAL, "debug": AlertSeverity.LOW, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def validate_config(self): """ Validate the configuration of the provider. """ self.authentication_config = RollbarProviderAuthConfig( **self.config.authentication ) def __get_url(self, path: str): """ Get the URL for the request. """ return urljoin("https://api.rollbar.com/api/1/", path) def __get_headers(self): """ Get the headers for the request. """ return { "X-Rollbar-Access-Token": self.authentication_config.rollbarAccessToken, "accept": "application/json; charset=utf-8", "content-type": "application/json", } def validate_scopes(self) -> dict[str, bool | str]: """ Validate the scopes of the provider. 
""" try: response = requests.get( self.__get_url("items"), headers=self.__get_headers() ) if response.status_code == 200: scopes = {"authenticated": True} else: self.logger.error( "Unable to read projects from Rollbar, statusCode: %s", response.status_code, ) scopes = { "authenticated": f"Unable to read projects from Rollbar, statusCode: {response.status_code}" } except Exception as e: self.logger.error("Error validating scopes for Rollbar: %s", e) scopes = {"authenticated": f"Error validating scopes for Rollbar: {e}"} return scopes def __get_occurences(self) -> List[AlertDto]: try: response = requests.get( self.__get_url("instances"), headers=self.__get_headers() ) if not response.ok: self.logger.error( "Failed to get occurrences from Rollbar: %s", response.json() ) raise Exception("Could not get occurrences from Rollbar") return [ AlertDto( id=alert["id"], name=alert["project_id"], environment=alert["data"]["environment"], event_id=alert["data"]["uuid"], language=alert["data"]["language"], message=alert["data"]["body"]["message"]["body"], host=alert["data"]["server"]["host"], pid=alert["data"]["server"]["pid"], severity=RollbarProvider.SEVERITIES_MAP[alert["data"]["level"]], lastReceived=datetime.datetime.fromtimestamp( alert["timestamp"] ).isoformat(), ) for alert in response.json()["result"]["instances"] ] except Exception as e: self.logger.error("Error getting occurrences from Rollbar: %s", e) raise Exception(f"Error getting occurrences from Rollbar: {e}") def _get_alerts(self) -> List[AlertDto]: alerts = [] try: self.logger.info("Collecting alerts (occurrences) from Rollbar") occurences_alert = self.__get_occurences() alerts.extend(occurences_alert) except Exception as e: self.logger.error("Error getting occurrences from Rollbar: %s", e) return alerts @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto: item_data = event["data"]["item"] occurrence_data = event["data"]["occurrence"] return AlertDto( 
id=str(item_data["id"]), name=event["event_name"], severity=RollbarProvider.SEVERITIES_MAP[occurrence_data["level"]], lastReceived=datetime.datetime.fromtimestamp( item_data["last_occurrence_timestamp"] ).isoformat(), environment=item_data["environment"], service="Rollbar", source=[occurrence_data["framework"]], url=event["data"]["url"], message=occurrence_data["body"]["message"]["body"], description=item_data["title"], event_id=str(occurrence_data["uuid"]), labels={"level": item_data["level"]}, fingerprint=item_data["hash"], ) def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): self.logger.info("Setting up webhook for Rollbar") self.logger.info("Enabling Webhook in Rollbar") try: response = requests.put( self.__get_url("notifications/webhook"), headers=self.__get_headers(), json={ "enabled": True, "url": f"{keep_api_url}?api_key={api_key}", }, ) if response.ok: response = requests.post( self.__get_url("notifications/webhook/rules"), headers=self.__get_headers(), json={ { "trigger": "occurrence", } }, ) if response.ok: self.logger.info("Created occurrence rule in Rollbar") else: self.logger.error( "Failed to enable webhook in Rollbar: %s", response.json() ) raise Exception("Failed to enable webhook in Rollbar") self.logger.info("Webhook enabled in Rollbar") except Exception as e: self.logger.error("Error setting up webhook for Rollbar: %s", e) raise Exception(f"Error setting up webhook for Rollbar: {e}") if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os rollbar_host = os.environ.get("ROLLBAR_HOST") if rollbar_host is None: raise Exception("ROLLBAR_HOST is not set") config = ProviderConfig( description="Rollbar Provider", authentication={ "rollbarAccessToken": rollbar_host, }, ) provider = RollbarProvider( context_manager, provider_id="rollbar", 
class S3Provider(BaseProvider):
    """Provider that lists an S3 bucket and downloads its text-based files."""

    PROVIDER_DISPLAY_NAME = "AWS S3"
    PROVIDER_CATEGORY = ["Cloud Infrastructure"]
    # Only objects whose key ends with one of these suffixes are downloaded.
    VALID_EXTENSIONS = (".yaml", ".json", ".xml", ".csv", ".yml")

    def dispose(self):
        """Nothing to release."""
        pass

    def _get_client(self):
        """Build a boto3 S3 client from the configured credentials.

        Both credentials default to None in the auth config, in which case
        boto3 falls back to its own credential resolution.
        """
        return boto3.client(
            "s3",
            aws_access_key_id=self.authentication_config.access_key,
            aws_secret_access_key=self.authentication_config.secret_access_key,
        )

    def validate_config(self):
        """
        Parse the authentication config and verify the credentials.

        Raises:
            ProviderException: if listing the account's buckets fails.
        """
        self.authentication_config = S3ProviderAuthConfig(**self.config.authentication)
        # List all S3 buckets to validate the credentials
        s3_client = self._get_client()
        try:
            s3_client.list_buckets()
        except Exception as e:
            raise ProviderException(f"Failed to list S3 buckets: {e}")

    def _query(self, bucket: str, **kwargs: dict):
        """
        Query bucket for files.
        Download only yaml, json, xml and csv files.

        Args:
            bucket (str): name of the bucket to read.

        Returns:
            list[str]: UTF-8 decoded contents of every matching object that
            could be downloaded. Objects that fail to download are logged
            and skipped.

        Raises:
            ProviderException: if the bucket's objects cannot be listed.
        """
        s3_client = self._get_client()

        # list_objects_v2 returns a single page per call, so walk every page
        # with the paginator instead of reading only the first one.
        try:
            paginator = s3_client.get_paginator("list_objects_v2")
            objects = [
                obj
                for page in paginator.paginate(Bucket=bucket)
                for obj in page.get("Contents", [])
            ]
        except Exception as e:
            raise ProviderException(f"Failed to list objects in bucket: {e}")

        files = []
        for obj in objects:
            key = obj.get("Key")
            if not key or not key.endswith(self.VALID_EXTENSIONS):
                continue
            try:
                downloaded = s3_client.get_object(Bucket=bucket, Key=key)
                files.append(downloaded.get("Body").read().decode("utf-8"))
            except Exception as e:
                self.logger.exception(
                    "Failed to download object from S3: %s", str(e)
                )
        return files
nothing. """ pass ================================================ FILE: keep/providers/sendgrid_provider/__init__.py ================================================ ================================================ FILE: keep/providers/sendgrid_provider/sendgrid_provider.py ================================================ """ SendGridProvider is a class that implements the SendGrid API and allows email sending through Keep. """ import dataclasses import logging import pydantic from python_http_client.exceptions import ForbiddenError, UnauthorizedError from sendgrid import SendGridAPIClient from sendgrid.helpers.mail import Mail from keep.contextmanager.contextmanager import ContextManager from keep.functions import cyaml from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory logger = logging.getLogger(__name__) @pydantic.dataclasses.dataclass class SendgridProviderAuthConfig: """ SendGrid authentication configuration. """ api_key: str = dataclasses.field( metadata={ "required": True, "description": "SendGrid API key", "hint": "https://sendgrid.com/docs/ui/account-and-settings/api-keys/", "sensitive": True, } ) from_email: str = dataclasses.field( metadata={ "required": True, "description": "From email address", "hint": "e.g. 
class SendgridProvider(BaseProvider):
    """Send email using the SendGrid API."""

    PROVIDER_DISPLAY_NAME = "SendGrid"
    PROVIDER_CATEGORY = ["Collaboration"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="email.send",
            description="Send emails using SendGrid",
            mandatory=True,
            documentation_url="https://sendgrid.com/docs/API_Reference/api_v3.html",
            alias="Email Sender",
        ),
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into SendgridProviderAuthConfig."""
        self.authentication_config = SendgridProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Check the ``email.send`` scope by sending a real test email.

        The test email is sent from the configured ``from_email`` address to
        that same address.

        Returns:
            dict: scope name mapped to ``True`` on success or to an error
            string describing why validation failed.
        """
        scopes = {}
        self.logger.info("Validating scopes")
        try:
            sg = SendGridAPIClient(self.authentication_config.api_key)
            # Validate email.send scope by attempting to send a test email
            if any(scope.name == "email.send" for scope in self.PROVIDER_SCOPES):
                try:
                    test_email = Mail(
                        from_email=self.authentication_config.from_email,
                        to_emails=self.authentication_config.from_email,
                        subject="Test Email for Scope Validation",
                        html_content="This is a test email for validating SendGrid email.send scope",
                    )
                    response = sg.send(test_email)
                    if response.status_code >= 400:
                        raise Exception(
                            f"Failed to validate email.send scope: {response.body}"
                        )
                    scopes["email.send"] = True
                except UnauthorizedError:
                    self.logger.warning(
                        "Failed to validate email.send scope: Unauthorized"
                    )
                    scopes["email.send"] = (
                        "Unauthorized: Invalid API key or insufficient permissions."
                    )
                except ForbiddenError:
                    self.logger.warning(
                        "Failed to validate email.send scope: Forbidden"
                    )
                    scopes["email.send"] = (
                        "Forbidden: Insufficient permissions to send email."
                    )
                except Exception as e:
                    self.logger.warning(f"Failed to validate email.send scope: {e}")
                    scopes["email.send"] = str(e)
        except Exception as e:
            # Client construction failed: report the same error for every scope.
            self.logger.error(f"Failed to validate scopes: {e}")
            for scope in self.PROVIDER_SCOPES:
                scopes[scope.name] = str(e)
        self.logger.info("Scopes validated", extra=scopes)
        return scopes

    def _notify(self, to: str | list[str], subject: str, html: str, **kwargs) -> dict:
        """
        Send an email using the SendGrid API.

        Args:
            to (str | list[str]): To email address or list of email addresses
            subject (str): Email subject
            html (str): Email body
            **kwargs: forwarded unchanged to ``sendgrid.helpers.mail.Mail``.

        Returns:
            dict: ``status_code``, ``body`` (decoded when it is bytes) and the
            response ``headers`` whose values are plain scalars.

        Raises:
            Exception: if the request fails or SendGrid answers with a
                status code >= 400.
        """
        _from = self.authentication_config.from_email
        self.logger.info(
            "Sending email using SendGrid API",
            extra={
                "from": _from,
                "to": to,
                "subject": subject,
            },
        )
        if isinstance(to, str):
            to_emails = [to]
        else:
            to_emails = to
        message = Mail(
            from_email=_from,
            to_emails=to_emails,
            subject=subject,
            html_content=html,
            **kwargs,
        )
        try:
            sg = SendGridAPIClient(self.authentication_config.api_key)
            response = sg.send(message)
        except UnauthorizedError as e:
            self.logger.error(
                "Unauthorized: Invalid API key or insufficient permissions."
            )
            raise Exception(
                "Failed to send email: Unauthorized. Please check your API key and permissions."
            ) from e
        except ForbiddenError as e:
            self.logger.error("Forbidden: Insufficient permissions to send email.")
            raise Exception(
                "Failed to send email: Forbidden. Your API key does not have the necessary permissions."
            ) from e
        except Exception as e:
            self.logger.error(f"Exception occurred: {e}")
            raise Exception(f"Failed to send email: {str(e)}") from e

        # Checked outside the try block: raising inside it sent this error
        # through the generic handler above, which logged it a second time
        # and produced "Failed to send email: Failed to send email: ...".
        if response.status_code >= 400:
            self.logger.error(
                f"Failed to send email to {to} with subject {subject}: {response.body}"
            )
            raise Exception(f"Failed to send email: {response.body}")

        self.logger.info(f"Email sent to {to} with subject {subject}")
        return {
            "status_code": response.status_code,
            "body": (
                response.body.decode("utf-8")
                if isinstance(response.body, bytes)
                else response.body
            ),
            # Keep only header values that are plain scalars.
            "headers": {
                k: v
                for k, v in response.headers.items()
                if isinstance(v, (str, int, float, bool, type(None)))
            },
        }

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass
"user_8675309", "ip_address": "198.51.100.42", "geo": { "country_code": "US", "city": "San Francisco", "region": "CA", }, }, "request": { "url": "https://api.example.com/users/profile", "method": "GET", "headers": [ ["Accept", "application/json"], [ "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", ], ], }, "contexts": { "browser": { "name": "Chrome", "version": "121.0.0.0", "type": "browser", }, "client_os": { "name": "Mac OS X", "version": "10.15.7", "type": "os", }, }, "tags": [ ["browser", "Chrome 121.0.0.0"], ["error.type", "NetworkError"], ["http.status_code", "504"], ["environment", "production"], ], }, } }, "server_overload": { "payload": { "id": "4616132098", "project": "frontend-app", "project_name": "frontend-app", "project_slug": "frontend-app", "logger": "javascript", "level": "error", "culprit": "submitOrder at checkout.js:178", "message": "Order submission failed: Server responded with 503 Service Unavailable - System under heavy load", "url": "https://keep-dr.sentry.io/issues/4616132098/", "event": { "event_id": "b723cf8e01c640b597831fb1710e3415", "level": "error", "title": "Order submission failed", "type": "default", "logentry": { "formatted": "Order submission failed: Server responded with 503 Service Unavailable - System under heavy load", "message": None, }, "logger": "javascript", "platform": "javascript", "timestamp": 1709991385.873, "environment": "production", "user": { "id": "user_2468101", "ip_address": "203.0.113.25", "geo": {"country_code": "GB", "city": "London", "region": "ENG"}, }, "request": { "url": "https://api.example.com/orders/submit", "method": "POST", "data": {"order_id": "ORD-12345", "total": 299.99}, "headers": [ ["Content-Type", "application/json"], [ "User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 17_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Mobile/15E148 Safari/604.1", ], ], }, "contexts": { "browser": 
{ "name": "Mobile Safari", "version": "17.3.1", "type": "browser", }, "client_os": {"name": "iOS", "version": "17.3.1", "type": "os"}, }, "tags": [ ["browser", "Mobile Safari 17.3.1"], ["error.type", "ApiError"], ["http.status_code", "503"], ["environment", "production"], ], }, } }, "database_timeout": { "payload": { "id": "4616132099", "project": "frontend-app", "project_name": "frontend-app", "project_slug": "frontend-app", "logger": "javascript", "level": "error", "culprit": "loadProductCatalog at products.js:89", "message": "Failed to load product catalog: Server responded with 502 Bad Gateway - Database connection timeout", "url": "https://keep-dr.sentry.io/issues/4616132099/", "event": { "title": "Failed to load product catalog", "event_id": "c634de9f01c640b597831fb1710e3416", "level": "error", "type": "default", "logentry": { "formatted": "Failed to load product catalog: Server responded with 502 Bad Gateway - Database connection timeout", "message": None, }, "logger": "javascript", "platform": "javascript", "timestamp": 1709991485.873, "environment": "production", "user": { "id": "user_1357924", "ip_address": "192.0.2.78", "geo": {"country_code": "DE", "city": "Berlin", "region": "BE"}, }, "request": { "url": "https://api.example.com/catalog/products", "method": "GET", "headers": [ ["Accept", "application/json"], [ "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", ], ], }, "contexts": { "browser": { "name": "Edge", "version": "120.0.0.0", "type": "browser", }, "client_os": {"name": "Windows", "version": "10", "type": "os"}, }, "tags": [ ["browser", "Edge 120.0.0.0"], ["error.type", "ApiError"], ["http.status_code", "502"], ["environment", "production"], ], }, } }, } ================================================ FILE: keep/providers/sentry_provider/sentry_provider.py ================================================ """ SentryProvider is a class that provides a way 
class SentryProvider(BaseProvider):
    """Enrich alerts with data from Sentry."""

    SENTRY_DEFAULT_API = "https://sentry.io/api/0"
    PROVIDER_SCOPES = [
        ProviderScope(
            "event:read",
            description="Read events and issues",
            mandatory=True,
            documentation_url="https://docs.sentry.io/api/events/list-a-projects-issues/?original_referrer=https%3A%2F%2Fdocs.sentry.io%2Fapi%2F",
        ),
        ProviderScope(
            "project:read",
            description="Read projects in organization",
            mandatory=True,
            documentation_url="https://docs.sentry.io/api/projects/list-your-projects/?original_referrer=https%3A%2F%2Fdocs.sentry.io%2Fapi%2F",
        ),
        ProviderScope(
            "project:write",
            description="Write permission for projects in organization",
            mandatory=False,
            mandatory_for_webhook=True,
        ),
    ]
    # Timeout (seconds) passed to requests when pulling events/projects.
    DEFAULT_TIMEOUT = 600
    PROVIDER_CATEGORY = ["Monitoring"]
    # Sentry level -> Keep severity
    SEVERITIES_MAP = {
        "fatal": AlertSeverity.CRITICAL,
        "error": AlertSeverity.HIGH,
        "warning": AlertSeverity.WARNING,
        "info": AlertSeverity.INFO,
        "debug": AlertSeverity.LOW,
    }
    # Sentry issue status / webhook action -> Keep status
    STATUS_MAP = {
        "resolved": AlertStatus.RESOLVED,
        "unresolved": AlertStatus.FIRING,
        "ignored": AlertStatus.SUPPRESSED,
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        self.sentry_org_slug = self.config.authentication.get("organization_slug")
        self.project_slug = self.config.authentication.get("project_slug")
        # Fall back to the public API when no URL is configured (or it is empty).
        self.sentry_api = (
            self.config.authentication.get("api_url") or self.SENTRY_DEFAULT_API
        )

    @property
    def __headers(self) -> dict:
        """Authorization header built from the configured API key."""
        return {"Authorization": f"Bearer {self.authentication_config.api_key}"}

    def get_events_url(self, project, date="14d"):
        """
        Build the organization events URL used by `_query`.

        Args:
            project: value placed in the `query` parameter.
            date: value of `statsPeriod`, for example "14d".

        Returns:
            str: the full URL.
        """
        # Assembled from adjacent literals: the previous backslash
        # continuation inside the f-string leaked a space and the next line's
        # indentation into the URL, right after `per_page=50`.
        return (
            f"{self.sentry_api}/organizations/{self.sentry_org_slug}/events/"
            "?field=title&field=event.type&field=project&field=user.display"
            "&field=timestamp&field=replayId&per_page=50"
            f"&query={project}&referrer=api.discover.query-table"
            f"&sort=-timestamp&statsPeriod={date}"
        )

    def dispose(self):
        return

    def validate_config(self):
        """Validates required configuration for Sentry's provider."""
        self.authentication_config = SentryProviderAuthConfig(
            **self.config.authentication
        )
        # Tokens containing "sntryu_" are rejected as user-based tokens.
        if "sntryu_" in self.authentication_config.api_key:
            raise ProviderConfigException(
                "Invalid user-based token provided instead of API token",
                self.provider_id,
            )

    def _query(self, project: str, time: str = "14d", **kwargs: dict):
        """
        Query Sentry using the given query

        Args:
            project (str): project name
            time (str): time range, for example: 14d

        Returns:
            list[tuple] | list[dict]: results of the query

        Raises:
            requests.HTTPError: if Sentry answers with an error status.
        """
        # The token lives under `api_key` in the auth config; reading
        # `api_token` here always raised KeyError. Reuse the shared headers.
        response = requests.get(
            self.get_events_url(project, time),
            headers=self.__headers,
            params={"limit": 100},
            timeout=SentryProvider.DEFAULT_TIMEOUT,
        )
        response.raise_for_status()
        events = response.json()
        return events.get("data")  # returns a list of events

    def get_template(self):
        pass

    def get_parameters(self):
        return {}

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Probe the Sentry API once per declared scope.

        Returns:
            dict[str, bool | str]: scope name mapped to ``True`` when the
            probe succeeded, otherwise to the ``detail`` of Sentry's error
            response (or a short explanation).
        """
        validated_scopes = {}
        # Slug discovered while validating event:read; reused for
        # project:write when no project is configured.
        project_slug = None
        for scope in self.PROVIDER_SCOPES:
            if scope.name == "event:read":
                if self.project_slug:
                    response = requests.get(
                        f"{self.sentry_api}/projects/{self.sentry_org_slug}/{self.project_slug}/issues/",
                        headers=self.__headers,
                    )
                    if not response.ok:
                        response_json = response.json()
                        validated_scopes[scope.name] = response_json.get("detail")
                        continue
                else:
                    projects_response = requests.get(
                        f"{self.sentry_api}/projects/",
                        headers=self.__headers,
                    )
                    if not projects_response.ok:
                        response_json = projects_response.json()
                        validated_scopes[scope.name] = response_json.get("detail")
                        continue
                    projects = projects_response.json()
                    if not projects:
                        # Nothing to probe against; previously an IndexError.
                        validated_scopes[scope.name] = (
                            "No projects found to validate the scope against"
                        )
                        continue
                    project_slug = projects[0].get("slug")
                    response = requests.get(
                        f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/issues/",
                        headers=self.__headers,
                    )
                    if not response.ok:
                        response_json = response.json()
                        validated_scopes[scope.name] = response_json.get("detail")
                        continue
                validated_scopes[scope.name] = True
            elif scope.name == "project:read":
                response = requests.get(
                    f"{self.sentry_api}/projects/",
                    headers=self.__headers,
                )
                if not response.ok:
                    response_json = response.json()
                    validated_scopes[scope.name] = response_json.get("detail")
                    continue
                validated_scopes[scope.name] = True
            elif scope.name == "project:write":
                response = requests.post(
                    f"{self.sentry_api}/projects/{self.sentry_org_slug}/{self.project_slug or project_slug}/plugins/webhooks/",
                    headers=self.__headers,
                )
                if not response.ok:
                    response_json = response.json()
                    validated_scopes[scope.name] = response_json.get("detail")
                    continue
                validated_scopes[scope.name] = True
        return validated_scopes

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | list[AlertDto]:
        """
        Convert a Sentry webhook payload into an AlertDto.

        The event body is read from ``event["event"]`` or, failing that,
        from ``event["data"]["event"]``. Note that the incoming dicts are
        mutated (several keys are popped).

        Raises:
            Exception: if no event body is found in the payload.
        """
        logger = logging.getLogger(__name__)
        logger.debug(
            "Formatting Sentry alert",
            extra={
                "event": event,
            },
        )

        event_data: dict = event.get("event", {})
        if not event_data:
            event_data = event.get("data", {}).get("event", {})
        if not event_data:
            raise Exception("Failed to get event data")

        # Tags arrive as [key, value] pairs.
        tags_as_dict = {v[0]: v[1] for v in event_data.get("tags", [])}

        # Remove duplicate keys
        event_data.pop("id", None)
        tags_as_dict.pop("id", None)

        last_received = (
            datetime.datetime.fromtimestamp(
                event_data.get("received"), tz=datetime.timezone.utc
            )
            if "received" in event_data
            else datetime.datetime.now(tz=datetime.timezone.utc)
        )

        # map severity and status to keep's format
        severity = event.pop("level", tags_as_dict.get("level", "")).lower()
        severity = SentryProvider.SEVERITIES_MAP.get(severity, AlertSeverity.INFO)
        status = event.get("action")
        status = SentryProvider.STATUS_MAP.get(status, AlertStatus.FIRING)

        # https://docs.sentry.io/product/integrations/integration-platform/webhooks/issue-alerts/#dataeventissue_url
        url = event_data.pop("url", event.get("url"))
        if "web_url" in event_data:
            url = event_data["web_url"]
        elif "issue_url" in event_data:
            url = event_data["issue_url"]
        elif "url" in tags_as_dict and not url:
            url = tags_as_dict["url"]

        exceptions = event_data.get("exception", {}).get("values", [])
        for exception in exceptions:
            # Make sure every exception entry carries a "stacktrace" key.
            if isinstance(exception, dict) and "stacktrace" not in exception:
                exception["stacktrace"] = False

        logger.debug("Formatted Sentry alert", extra={"event": event})

        # Quotes are stripped from the name and message.
        name = event_data.get("title", "").replace("'", "").replace('"', "")
        message = (
            event_data.get("metadata", {})
            .get("value", "")
            .replace("'", "")
            .replace('"', "")
        )

        # Validate URL
        if url:
            try:
                result = urlparse(url)
                if not all([result.scheme, result.netloc]):
                    url = None
            except Exception:
                url = None

        return AlertDto(
            id=event_data.pop("event_id"),
            name=name,
            status=status,
            lastReceived=str(last_received),
            service=tags_as_dict.get("server_name"),
            source=["sentry"],
            environment=event_data.pop(
                "environment", tags_as_dict.pop("environment", "unknown")
            ),
            message=message,
            description=event.get("culprit", ""),
            pushed=True,
            severity=severity,
            url=url,
            fingerprint=event.get("id"),
            tags=tags_as_dict,
            exceptions=exceptions,
        )

    def setup_webhook(
        self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
    ):
        """
        Install Keep's webhook in Sentry.

        For the configured project (or every project returned by the API
        when none is configured): append Keep's URL to the webhooks plugin,
        enable the plugin, and create an alert rule that fires the webhook
        on every event unless a rule with the expected name already exists.

        Raises:
            ProviderConfigException: if ``keep_api_url`` points to localhost.
            Exception: if projects cannot be listed or a plugin request fails.
        """
        self.logger.info("Setting up Sentry webhook")

        # cannot install webhook with localhost
        if (
            "0.0.0.0" in keep_api_url
            or "127.0.0.1" in keep_api_url
            or "localhost" in keep_api_url
        ):
            raise ProviderConfigException(
                provider_id=self.provider_id,
                message="Cannot setup webhook with localhost, please use a public url",
            )

        if self.project_slug:
            project_slugs = [self.project_slug]
        else:
            # Get all projects if no project slug was given
            projects_response = requests.get(
                f"{self.sentry_api}/projects/",
                headers=self.__headers,
            )
            if not projects_response.ok:
                raise Exception("Failed to get projects")
            project_slugs = [
                project.get("slug") for project in projects_response.json()
            ]

        for project_slug in project_slugs:
            self.logger.info(f"Setting up webhook for project {project_slug}")
            webhooks_request = requests.get(
                f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/plugins/webhooks/",
                headers=self.__headers,
            )
            webhooks_request.raise_for_status()
            webhooks_response = webhooks_request.json()
            # Get existing urls so we won't override anything
            config = next(
                iter(
                    [
                        c
                        for c in webhooks_response.get("config")
                        if c.get("name") == "urls"
                    ]
                )
            )
            existing_webhooks_value: str = config.get("value", "") or ""
            existing_webhooks = existing_webhooks_value.split("\n")
            # tb: this is a resolution to a bug i pushed somewhere in the beginning of sentry provider
            # TODO: remove this in the future
            if f"{keep_api_url}?api_key={api_key}" in existing_webhooks:
                existing_webhooks.remove(f"{keep_api_url}?api_key={api_key}")
            # This means we already installed in that project
            if f"{keep_api_url}&api_key={api_key}" in existing_webhooks:
                # TODO: we might got here but did not create the alert, we should fix that in the future
                #   e.g. make sure the alert exists and if not create it.
                self.logger.info(
                    f"Keep webhook already exists for project {project_slug}"
                )
                continue
            existing_webhooks.append(f"{keep_api_url}&api_key={api_key}")
            # Update the webhooks urls
            update_response = requests.put(
                f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/plugins/webhooks/",
                headers=self.__headers,
                json={"urls": "\n".join(existing_webhooks)},
            )
            update_response.raise_for_status()
            # Enable webhooks plugin for project
            requests.post(
                f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/plugins/webhooks/",
                headers=self.__headers,
            ).raise_for_status()
            # TODO: make sure keep alert does not exist and if it doesnt create it.
            alert_rule_name = f"Keep Alert Rule - {project_slug}"
            alert_rules_response = requests.get(
                f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/rules/",
                headers=self.__headers,
            ).json()
            alert_rule_exists = next(
                iter(
                    [
                        alert_rule
                        for alert_rule in alert_rules_response
                        if alert_rule.get("name") == alert_rule_name
                    ]
                ),
                None,
            )
            if not alert_rule_exists:
                alert_payload = {
                    "conditions": [
                        {
                            "id": "sentry.rules.conditions.every_event.EveryEventCondition",
                        },
                    ],
                    "filters": [],
                    "actions": [
                        {
                            "service": "webhooks",
                            "id": "sentry.rules.actions.notify_event_service.NotifyEventServiceAction",
                            "name": "Send a notification via webhooks",
                        },
                    ],
                    "actionMatch": "any",
                    "filterMatch": "any",
                    "frequency": 5,
                    "name": alert_rule_name,
                    "projects": [project_slug],
                    "status": "active",
                }
                try:
                    requests.post(
                        f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/rules/",
                        headers=self.__headers,
                        json=alert_payload,
                    ).raise_for_status()
                except Exception as e:
                    # don't raise because we want to continue to the next project
                    # TODO: identify the case where its "rule already exists" and raise for other errors
                    self.logger.error(
                        f"Failed to create alert rule for project {project_slug}",
                        extra={"error": e},
                    )
                    continue
                self.logger.info(f"Sentry webhook setup complete for {project_slug}")
            else:
                self.logger.info(f"Sentry webhook already exists for {project_slug}")
        self.logger.info("Sentry webhook setup complete")

    def __get_issues(self, project_slug: str) -> dict:
        """
        Get all issues for a project

        Args:
            project_slug (str): project slug

        Raises:
            Exception: if failed to get issues

        Returns:
            dict: issues by id
        """
        issues_response = requests.get(
            f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/issues/?query=*",
            headers=self.__headers,
        )
        if not issues_response.ok:
            raise Exception(issues_response.json())
        return {issue["id"]: issue for issue in issues_response.json()}

    def _get_alerts(self) -> list[AlertDto]:
        """
        Pull events from Sentry and convert them to alerts.

        Events are read for the configured project, or for every project
        returned by the API when none is configured. Each event's status is
        taken from the issue whose id equals the event's ``groupID``.

        Returns:
            list[AlertDto]: formatted events; empty when no events could be
            fetched.

        Raises:
            Exception: if the configured project's events, the project list
                or a project's issues cannot be fetched.
        """
        all_events_by_project = {}
        all_issues_by_project = {}
        if self.authentication_config.project_slug:
            response = requests.get(
                f"{self.sentry_api}/projects/{self.sentry_org_slug}/{self.project_slug}/events/",
                headers=self.__headers,
                timeout=SentryProvider.DEFAULT_TIMEOUT,
            )
            if not response.ok:
                raise Exception(response.json())
            all_events_by_project[self.project_slug] = response.json()
            all_issues_by_project[self.project_slug] = self.__get_issues(
                self.project_slug
            )
        else:
            projects_response = requests.get(
                f"{self.sentry_api}/projects/",
                headers=self.__headers,
                timeout=SentryProvider.DEFAULT_TIMEOUT,
            )
            if not projects_response.ok:
                raise Exception("Failed to get projects")
            projects = projects_response.json()
            for project in projects:
                project_slug = project.get("slug")
                response = requests.get(
                    f"{self.sentry_api}/projects/{self.sentry_org_slug}/{project_slug}/events/",
                    headers=self.__headers,
                    timeout=SentryProvider.DEFAULT_TIMEOUT,
                )
                if not response.ok:
                    # Skip projects we cannot read instead of failing the pull.
                    error = response.json()
                    self.logger.warning(
                        "Failed to get events for project",
                        extra={"project_slug": project_slug, **error},
                    )
                    continue
                all_events_by_project[project_slug] = response.json()
                all_issues_by_project[project_slug] = self.__get_issues(project_slug)

        if not all_events_by_project:
            # We didn't manage to get any events for some reason
            self.logger.warning("Failed to get events from all projects")
            return []

        # format issues
        formatted_issues = []
        for project in all_events_by_project:
            for event in all_events_by_project[project]:
                event_id = event.pop("id")
                fingerprint = event.get("groupID")
                related_issue = all_issues_by_project.get(project, {}).get(
                    fingerprint, {}
                )
                tags = {tag["key"]: tag["value"] for tag in event.pop("tags", [])}
                # One minute is added to the event's creation time.
                last_received = datetime.datetime.fromisoformat(
                    event.get("dateCreated")
                ) + datetime.timedelta(minutes=1)
                # format severity and status
                severity = SentryProvider.SEVERITIES_MAP.get(
                    tags.get("level"), AlertSeverity.INFO
                )
                status = related_issue.get("status", event.get("event.type", None))
                status = SentryProvider.STATUS_MAP.get(status, AlertStatus.FIRING)
                formatted_issues.append(
                    AlertDto(
                        id=event_id,
                        name=event.pop("title"),
                        description=event.pop("culprit", ""),
                        message=event.get("message", ""),
                        status=status,
                        lastReceived=last_received.isoformat(),
                        environment=tags.get("environment", "unknown"),
                        severity=severity,
                        url=event.pop("permalink", None),
                        project=project,
                        source=["sentry"],
                        fingerprint=fingerprint,
                        tags=tags,
                        payload=event,
                    )
                )
        return formatted_issues
""" import os import dataclasses import hashlib import json import uuid from datetime import datetime, timezone import pydantic import requests from requests.auth import HTTPBasicAuth from keep.api.models.db.topology import TopologyServiceInDto from keep.api.models.incident import IncidentDto, IncidentStatus, IncidentSeverity from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseTopologyProvider, BaseIncidentProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.models.provider_method import ProviderMethod from keep.validation.fields import HttpsUrl @pydantic.dataclasses.dataclass class ServicenowProviderAuthConfig: """ServiceNow authentication configuration.""" service_now_base_url: HttpsUrl = dataclasses.field( metadata={ "required": True, "description": "The base URL of the ServiceNow instance", "sensitive": False, "hint": "https://dev12345.service-now.com", "validation": "https_url", } ) username: str = dataclasses.field( metadata={ "required": True, "description": "The username of the ServiceNow user", "sensitive": False, } ) password: str = dataclasses.field( metadata={ "required": True, "description": "The password of the ServiceNow user", "sensitive": True, } ) # @tb: based on this https://www.servicenow.com/community/developer-blog/oauth-2-0-with-inbound-rest/ba-p/2278926 client_id: str = dataclasses.field( metadata={ "required": False, "description": "The client ID to use OAuth 2.0 based authentication", "sensitive": False, }, default="", ) client_secret: str = dataclasses.field( metadata={ "required": False, "description": "The client secret to use OAuth 2.0 based authentication", "sensitive": True, }, default="", ) ticket_creation_url: str = dataclasses.field( metadata={ "required": False, "description": "URL for creating new tickets", "sensitive": False, "hint": 
class ServicenowProvider(BaseTopologyProvider, BaseIncidentProvider):
    """Manage ServiceNow tickets and incidents with bidirectional activity sync.

    Requests are authenticated with HTTP basic auth (username/password) unless
    both ``client_id`` and ``client_secret`` are configured; in that case an
    OAuth 2.0 access token is fetched once in ``__init__`` and sent as a bearer
    token instead.
    """

    PROVIDER_CATEGORY = ["Ticketing", "Incident Management"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="itil",
            description="The user can read/write tickets from the table",
            documentation_url="https://docs.servicenow.com/bundle/sandiego-platform-administration/page/administer/roles/reference/r_BaseSystemRoles.html",
            mandatory=True,
            alias="Read from database",
        )
    ]
    PROVIDER_TAGS = ["ticketing"]
    PROVIDER_DISPLAY_NAME = "Service Now"
    FINGERPRINT_FIELDS = ["number"]

    # ServiceNow incident state mapping
    # https://docs.servicenow.com/bundle/sandiego-it-service-management/page/product/incident-management/reference/r_IncidentStates.html
    INCIDENT_STATUS_MAP = {
        "1": IncidentStatus.FIRING,  # New
        "2": IncidentStatus.ACKNOWLEDGED,  # In Progress
        "3": IncidentStatus.ACKNOWLEDGED,  # On Hold
        "6": IncidentStatus.RESOLVED,  # Resolved
        "7": IncidentStatus.RESOLVED,  # Closed
        "8": IncidentStatus.RESOLVED,  # Canceled
    }

    # ServiceNow impact to severity mapping
    INCIDENT_SEVERITY_MAP = {
        "1": IncidentSeverity.CRITICAL,  # High
        "2": IncidentSeverity.WARNING,  # Medium
        "3": IncidentSeverity.LOW,  # Low
    }

    PROVIDER_METHODS = [
        ProviderMethod(
            name="Get Incidents",
            func_name="get_incidents",
            scopes=["itil"],
            description="Fetch all incidents from ServiceNow",
            type="view",
        ),
        ProviderMethod(
            name="Get Incident Activities",
            func_name="get_incident_activities",
            scopes=["itil"],
            description="Get work notes and comments from a ServiceNow incident",
            type="view",
        ),
        ProviderMethod(
            name="Add Incident Activity",
            func_name="add_incident_activity",
            scopes=["itil"],
            description="Add a work note or comment to a ServiceNow incident",
            type="action",
        ),
    ]

    # Format of the timestamp strings parsed by _format_incident.
    _TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        """
        Initialize the provider and, when OAuth credentials are configured,
        fetch an access token using the password grant.

        Raises:
            ProviderException: If the OAuth token endpoint returns an error.
        """
        super().__init__(context_manager, provider_id, config)
        self._access_token = None

        if (
            self.authentication_config.client_id
            and self.authentication_config.client_secret
        ):
            url = f"{self.authentication_config.service_now_base_url}/oauth_token.do"
            payload = {
                "grant_type": "password",
                "username": self.authentication_config.username,
                "password": self.authentication_config.password,
                "client_id": self.authentication_config.client_id,
                "client_secret": self.authentication_config.client_secret,
            }
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
                "Accept": "application/json",
            }
            # Bounded wait so an unreachable instance cannot hang construction.
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=10,
            )
            if response.ok:
                self._access_token = response.json().get("access_token")
            else:
                self.logger.error(
                    "Failed to get access token",
                    extra={
                        "response": response.text,
                        "status_code": response.status_code,
                        "provider_id": self.provider_id,
                    },
                )
                raise ProviderException(
                    f"Failed to get OAuth access token from ServiceNow: {response.status_code}, {response.text}."
                    " Please check your ServiceNow logs, information about this error should be there."
                )

    def _get_auth(self):
        """Get authentication tuple or None if using OAuth."""
        if self._access_token:
            return None
        return (
            self.authentication_config.username,
            self.authentication_config.password,
        )

    def _get_headers(self):
        """Get request headers including auth token if available."""
        headers = {"Content-Type": "application/json", "Accept": "application/json"}
        if self._access_token:
            headers["Authorization"] = f"Bearer {self._access_token}"
        return headers

    def _failure_log_extra(self, response) -> dict:
        """Build the structured log context for a failed topology request."""
        return {
            "tenant_id": self.context_manager.tenant_id,
            "status_code": response.status_code,
            "response_body": response.text,
            "using_access_token": self._access_token is not None,
            "provider_id": self.provider_id,
        }

    def validate_scopes(self):
        """
        Validates that the user has the required scopes to use the provider.

        Returns:
            dict: ``{"itil": True}`` when the configured user has the ITIL role
            (or validation is skipped via
            ``KEEP_SERVICENOW_PROVIDER_SKIP_SCOPE_VALIDATION=true``); otherwise
            ``{"itil": "<error description>"}``.
        """
        # Optional scope validation skipping
        if (
            os.environ.get(
                "KEEP_SERVICENOW_PROVIDER_SKIP_SCOPE_VALIDATION", "false"
            ).lower()
            == "true"
        ):
            return {"itil": True}

        try:
            self.logger.info("Validating ServiceNow scopes")
            url = f"{self.authentication_config.service_now_base_url}/api/now/table/sys_user_role?sysparm_query=user_name={self.authentication_config.username}"
            response = requests.get(
                url,
                headers=self._get_headers(),
                auth=self._get_auth(),
                verify=False,
                timeout=10,
            )

            try:
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                self.logger.exception(f"Failed to get roles from ServiceNow: {e}")
                return {"itil": str(e)}

            # raise_for_status() passed, so this is a successful response.
            roles = response.json().get("result") or []
            roles_names = [role.get("name") for role in roles]
            if "itil" in roles_names:
                self.logger.info("User has ITIL role")
                scopes = {
                    "itil": True,
                }
            else:
                self.logger.info("User does not have ITIL role")
                scopes = {
                    "itil": "This user does not have the ITIL role",
                }
        except Exception as e:
            self.logger.exception("Error validating scopes")
            scopes = {
                "itil": str(e),
            }
        return scopes

    def validate_config(self):
        """Parse the raw authentication config into ServicenowProviderAuthConfig."""
        self.authentication_config = ServicenowProviderAuthConfig(
            **self.config.authentication
        )

    def _query(
        self,
        table_name: str,
        incident_id: str = None,
        sysparm_limit: int = 100,
        sysparm_offset: int = 0,
        **kwargs: dict,
    ):
        """
        Query ServiceNow for records.

        Args:
            table_name (str): The name of the table to query.
            incident_id (str): Optional record id appended to the table URL to
                fetch a single record.
            sysparm_limit (int): The maximum number of records to return
                (a falsy value falls back to 100).
            sysparm_offset (int): The offset to start from.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The ``result`` member of the JSON response, or an empty list when
            the request fails.
        """
        request_url = f"{self.authentication_config.service_now_base_url}/api/now/table/{table_name}"
        if incident_id:
            request_url = f"{request_url}/{incident_id}"

        # Pass the caller's offset through: forcing it to 0 made every page
        # request return the first page, so pagination never advanced.
        params = {
            "sysparm_offset": sysparm_offset or 0,
            "sysparm_limit": sysparm_limit or 100,
        }

        response = requests.get(
            request_url,
            headers=self._get_headers(),
            auth=self._get_auth(),
            params=params,
            verify=False,
            timeout=10,
        )

        if not response.ok:
            self.logger.error(
                f"Failed to query {table_name}",
                extra={"status_code": response.status_code, "response": response.text},
            )
            return []

        return response.json().get("result", [])

    # -------------------------------------------------------------------------
    # Incident pulling (BaseIncidentProvider)
    # -------------------------------------------------------------------------

    @staticmethod
    def _get_incident_id(incident_number: str) -> uuid.UUID:
        """Create a deterministic UUID from the ServiceNow incident number."""
        md5 = hashlib.md5()
        md5.update(incident_number.encode("utf-8"))
        return uuid.UUID(md5.hexdigest())

    def _get_incidents(self) -> list[IncidentDto]:
        """
        Pull incidents from the ServiceNow incident table.

        Pages through the table 100 records at a time until a short or empty
        page is returned. Records that fail to format are logged and skipped.
        """
        self.logger.info("Pulling incidents from ServiceNow")

        all_incidents = []
        offset = 0
        limit = 100

        while True:
            raw_incidents = self._query(
                table_name="incident",
                sysparm_limit=limit,
                sysparm_offset=offset,
            )
            if not raw_incidents:
                break

            for incident in raw_incidents:
                try:
                    dto = self._format_incident({"incident": incident})
                    if dto:
                        all_incidents.append(dto)
                except Exception:
                    self.logger.exception(
                        "Failed to format ServiceNow incident",
                        extra={"sys_id": incident.get("sys_id")},
                    )

            # A short page means there is nothing left to fetch.
            if len(raw_incidents) < limit:
                break
            offset += limit

        self.logger.info(
            "Finished pulling incidents from ServiceNow",
            extra={"count": len(all_incidents)},
        )
        return all_incidents

    @staticmethod
    def _parse_timestamp(value) -> datetime | None:
        """Parse a ServiceNow timestamp string as UTC; None if empty or invalid."""
        if not value:
            return None
        try:
            return datetime.strptime(
                value, ServicenowProvider._TIMESTAMP_FORMAT
            ).replace(tzinfo=timezone.utc)
        except (ValueError, TypeError):
            return None

    @staticmethod
    def _reference_display(field):
        """Return display_value (or value) of a reference dict; other values as-is."""
        if isinstance(field, dict):
            return field.get("display_value", "") or field.get("value", "")
        return field

    @staticmethod
    def _reference_value(field):
        """Return the 'value' of a reference dict, or None when it is not a dict."""
        if isinstance(field, dict):
            return field.get("value")
        return None

    @staticmethod
    def _format_incident(
        event: dict, provider_instance: "ServicenowProvider" = None
    ) -> IncidentDto | list[IncidentDto]:
        """
        Convert a raw ServiceNow incident record into an IncidentDto.

        Args:
            event: Dict holding the raw record under the ``"incident"`` key.
            provider_instance: Unused.

        Returns:
            The IncidentDto, or an empty list when the record has no number.
        """
        incident = event.get("incident", {})
        number = incident.get("number")
        if not number:
            return []

        incident_id = ServicenowProvider._get_incident_id(number)

        # Unknown states default to FIRING, unknown impacts to INFO.
        state = str(incident.get("incident_state") or incident.get("state", "1"))
        status = ServicenowProvider.INCIDENT_STATUS_MAP.get(
            state, IncidentStatus.FIRING
        )
        impact = str(incident.get("impact", "3"))
        severity = ServicenowProvider.INCIDENT_SEVERITY_MAP.get(
            impact, IncidentSeverity.INFO
        )

        # Parse timestamps; resolved_at takes precedence over closed_at.
        creation_time = ServicenowProvider._parse_timestamp(
            incident.get("sys_created_on", "")
        )
        resolved_at = incident.get("resolved_at", "")
        closed_at = incident.get("closed_at", "")
        end_time = ServicenowProvider._parse_timestamp(
            resolved_at if resolved_at else closed_at
        )

        # Extract service info from the assignment group or category
        assignment_group = ServicenowProvider._reference_display(
            incident.get("assignment_group", "")
        )
        category = incident.get("category", "")
        service = assignment_group or category or "unknown"

        title = incident.get("short_description") or incident.get("number", "")
        description = incident.get("description", "")
        assignee = ServicenowProvider._reference_display(
            incident.get("assigned_to", "")
        )

        return IncidentDto(
            id=incident_id,
            user_generated_name=f"SN-{title}-{number}",
            status=status,
            severity=severity,
            creation_time=creation_time,
            start_time=creation_time,
            end_time=end_time,
            description=description,
            assignee=assignee if assignee else None,
            alert_sources=["servicenow"],
            alerts_count=0,
            services=[service] if service != "unknown" else [],
            is_predicted=False,
            is_candidate=False,
            fingerprint=number,
        )

    # -------------------------------------------------------------------------
    # Incident activity sync (bidirectional)
    # -------------------------------------------------------------------------

    def get_incident_activities(
        self,
        incident_id: str,
        limit: int = 50,
    ) -> list[dict]:
        """
        Fetch work notes and comments from a ServiceNow incident via sys_journal_field.

        Args:
            incident_id: The incident number (e.g. INC0010001) or sys_id.
            limit: Maximum number of activity records to return.

        Returns:
            List of activity dicts with keys: sys_id, type, content,
            created_at, created_by (newest first). Empty when the incident
            cannot be resolved or the request fails.
        """
        self.logger.info(
            "Fetching incident activities",
            extra={"incident_id": incident_id},
        )

        # First resolve the sys_id if we got an incident number
        sys_id = self._resolve_incident_sys_id(incident_id)
        if not sys_id:
            self.logger.warning(
                "Could not resolve incident sys_id",
                extra={"incident_id": incident_id},
            )
            return []

        # Query the journal field table for work_notes and comments
        url = (
            f"{self.authentication_config.service_now_base_url}"
            f"/api/now/table/sys_journal_field"
        )
        params = {
            "sysparm_query": (
                f"element_id={sys_id}"
                f"^name=incident"
                f"^elementINwork_notes,comments"
                f"^ORDERBYDESCsys_created_on"
            ),
            "sysparm_limit": limit,
            "sysparm_fields": "sys_id,element,value,sys_created_on,sys_created_by",
        }

        response = requests.get(
            url,
            headers=self._get_headers(),
            auth=self._get_auth(),
            params=params,
            verify=False,
            timeout=15,
        )

        if not response.ok:
            self.logger.error(
                "Failed to fetch incident activities",
                extra={
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
            return []

        results = response.json().get("result", [])
        activities = [
            {
                "sys_id": record.get("sys_id"),
                "type": record.get("element"),  # work_notes or comments
                "content": record.get("value"),
                "created_at": record.get("sys_created_on"),
                "created_by": record.get("sys_created_by"),
            }
            for record in results
        ]

        self.logger.info(
            "Fetched incident activities",
            extra={"incident_id": incident_id, "count": len(activities)},
        )
        return activities

    def add_incident_activity(
        self,
        incident_id: str,
        content: str,
        activity_type: str = "work_notes",
    ) -> dict:
        """
        Add a work note or comment to a ServiceNow incident.

        Args:
            incident_id: The incident number (e.g. INC0010001) or sys_id.
            content: The text content to add.
            activity_type: Either 'work_notes' or 'comments'. Defaults to 'work_notes'.

        Returns:
            The updated incident record from ServiceNow.

        Raises:
            ProviderException: If activity_type is invalid, the incident cannot
                be resolved, or ServiceNow rejects the update.
        """
        if activity_type not in ("work_notes", "comments"):
            raise ProviderException(
                f"Invalid activity_type '{activity_type}'. Must be 'work_notes' or 'comments'."
            )

        self.logger.info(
            "Adding incident activity",
            extra={
                "incident_id": incident_id,
                "activity_type": activity_type,
            },
        )

        sys_id = self._resolve_incident_sys_id(incident_id)
        if not sys_id:
            raise ProviderException(
                f"Could not resolve incident sys_id for '{incident_id}'"
            )

        url = (
            f"{self.authentication_config.service_now_base_url}"
            f"/api/now/table/incident/{sys_id}"
        )
        payload = {activity_type: content}

        response = requests.patch(
            url,
            headers=self._get_headers(),
            auth=self._get_auth(),
            data=json.dumps(payload),
            verify=False,
            timeout=15,
        )

        if not response.ok:
            self.logger.error(
                "Failed to add incident activity",
                extra={
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
            raise ProviderException(
                f"Failed to add activity to incident: {response.status_code}"
            )

        result = response.json().get("result", {})
        self.logger.info(
            "Added incident activity",
            extra={"incident_id": incident_id, "sys_id": sys_id},
        )
        return result

    def _resolve_incident_sys_id(self, incident_id: str) -> str | None:
        """
        Resolve an incident number or sys_id to a sys_id.

        If the input is 32 hexadecimal characters (ignoring dashes) it is
        assumed to already be a sys_id and is returned unchanged. Otherwise it
        is treated as an incident number and looked up in the incident table.

        Returns:
            The sys_id, or None when the input is empty or no record matches.
        """
        if not incident_id:
            return None

        # If it looks like a sys_id (32-char hex), return as-is
        clean = incident_id.replace("-", "")
        if len(clean) == 32 and all(c in "0123456789abcdef" for c in clean.lower()):
            return incident_id

        # Otherwise, query by number
        url = (
            f"{self.authentication_config.service_now_base_url}"
            f"/api/now/table/incident"
        )
        params = {
            "sysparm_query": f"number={incident_id}",
            "sysparm_fields": "sys_id",
            "sysparm_limit": 1,
        }

        response = requests.get(
            url,
            headers=self._get_headers(),
            auth=self._get_auth(),
            params=params,
            verify=False,
            timeout=10,
        )

        if response.ok:
            results = response.json().get("result", [])
            if results:
                return results[0].get("sys_id")

        return None

    # -------------------------------------------------------------------------
    # Topology pulling (existing functionality)
    # -------------------------------------------------------------------------

    def pull_topology(self) -> tuple[list[TopologyServiceInDto], dict]:
        """
        Build the service topology from the ServiceNow CMDB.

        Reads configuration items (cmdb_ci), relationship types (cmdb_rel_type)
        and relationships (cmdb_rel_ci). Only a failure to read the CMDB items
        aborts the pull; missing relationship data just yields services without
        dependencies.

        Returns:
            A tuple of (list of TopologyServiceInDto, empty dict).
        """
        # TODO: in scale, we'll need to use pagination around here
        headers = self._get_headers()
        auth = self._get_auth()
        base_url = self.authentication_config.service_now_base_url
        topology = []
        self.logger.info(
            "Pulling topology", extra={"tenant_id": self.context_manager.tenant_id}
        )

        self.logger.info("Pulling CMDB items")
        fields = [
            "name",
            "sys_id",
            "ip_address",
            "mac_address",
            "owned_by.name",
            "manufacturer.name",  # Retrieve the name of the manufacturer
            "short_description",
            "environment",
        ]

        # Set parameters for the request
        cmdb_params = {
            "sysparm_fields": ",".join(fields),
            "sysparm_query": "active=true",
        }
        cmdb_response = requests.get(
            f"{base_url}/api/now/table/cmdb_ci",
            headers=headers,
            auth=auth,
            params=cmdb_params,
        )

        if not cmdb_response.ok:
            self.logger.info(
                "Failed to pull topology with cmdb_params, retrying with no params.",
                extra=self._failure_log_extra(cmdb_response),
            )
            # Retry without params, may happen because of lack of permissions.
            # The following code is tolerant to missing data.
            cmdb_response = requests.get(
                f"{base_url}/api/now/table/cmdb_ci",
                headers=headers,
                auth=auth,
            )
            if not cmdb_response.ok:
                self.logger.error(
                    "Failed to pull topology without params.",
                    extra=self._failure_log_extra(cmdb_response),
                )
                return topology, {}

        cmdb_data = cmdb_response.json().get("result", [])
        self.logger.info(
            "Pulling CMDB items completed", extra={"len_of_cmdb_items": len(cmdb_data)}
        )

        self.logger.info("Pulling relationship types")
        relationship_types = {}
        rel_type_response = requests.get(
            f"{base_url}/api/now/table/cmdb_rel_type",
            auth=auth,
            headers=headers,
        )
        if not rel_type_response.ok:
            # Log the response that actually failed, not the CMDB one.
            self.logger.error(
                "Failed to get topology types",
                extra=self._failure_log_extra(rel_type_response),
            )
        else:
            rel_type_json = rel_type_response.json()
            for result in rel_type_json.get("result", []):
                relationship_types[result.get("sys_id")] = result.get("sys_name")
            self.logger.info("Pulling relationship types completed")

        self.logger.info("Pulling relationships")
        relationships = {}
        rel_response = requests.get(
            f"{base_url}/api/now/table/cmdb_rel_ci",
            auth=auth,
            headers=headers,
        )
        if not rel_response.ok:
            self.logger.error(
                "Failed to get topology relationships",
                extra=self._failure_log_extra(rel_response),
            )
        else:
            rel_json = rel_response.json()
            for relationship in rel_json.get("result", []):
                # Reference fields are dicts; anything else (e.g. an empty
                # string) carries no id.
                parent_id = self._reference_value(relationship.get("parent"))
                child_id = self._reference_value(relationship.get("child"))
                relationship_type_id = self._reference_value(relationship.get("type"))
                relationship_type = relationship_types.get(relationship_type_id)

                # A relationship without both ends cannot describe a dependency.
                if parent_id is None or child_id is None:
                    continue
                relationships.setdefault(parent_id, {})[child_id] = relationship_type
            self.logger.info("Pulling relationships completed")

        self.logger.info("Mixing up all topology data")
        for entity in cmdb_data:
            sys_id = entity.get("sys_id")
            owned_by = entity.get("owned_by.name")
            environment = entity.get("environment")
            if environment is None:
                environment = ""
            topology_service = TopologyServiceInDto(
                source_provider_id=self.provider_id,
                service=sys_id,
                display_name=entity.get("name"),
                description=entity.get("short_description"),
                environment=environment,
                team=owned_by,
                dependencies=relationships.get(sys_id, {}),
                ip_address=entity.get("ip_address"),
                mac_address=entity.get("mac_address"),
            )
            topology.append(topology_service)

        self.logger.info(
            "Topology pulling completed",
            extra={
                "tenant_id": self.context_manager.tenant_id,
                "len_of_topology": len(topology),
                "using_access_token": self._access_token is not None,
                "provider_id": self.provider_id,
            },
        )
        return topology, {}

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(self, table_name: str, payload: dict = {}, **kwargs: dict):
        """
        Create a ticket in ServiceNow, or fetch an existing one.

        Args:
            table_name (str): The name of the table to create the ticket in.
            payload (dict): The ticket payload (read only, never mutated).
            ticket_id (str): The ticket ID (optional; when given, the existing
                ticket is fetched instead of creating a new one).
            fingerprint (str): The fingerprint of the ticket (optional, only
                used together with ticket_id).

        Returns:
            The ticket record; for new tickets a ``link`` to the ticket is added.

        Raises:
            ProviderException: If table_name is empty or the instance appears
                to be hibernating.
            requests.exceptions.HTTPError: If ServiceNow returns an error status.
        """
        if not table_name:
            raise ProviderException("Table name is required")

        # In ServiceNow tables are lower case
        table_name = table_name.lower()

        # TODO - this could be separated into a ServicenowUpdateProvider once we support
        if "ticket_id" in kwargs:
            ticket_id = kwargs.pop("ticket_id")
            # The fingerprint is optional, so do not fail when it is absent.
            fingerprint = kwargs.pop("fingerprint", None)
            return self._notify_update(table_name, ticket_id, fingerprint)

        # otherwise, create the ticket
        url = f"{self.authentication_config.service_now_base_url}/api/now/table/{table_name}"
        # HTTP request
        response = requests.post(
            url,
            auth=self._get_auth(),
            headers=self._get_headers(),
            data=json.dumps(payload),
            verify=False,
        )

        if response.status_code == 201:  # HTTP status code for "Created"
            resp = response.json()
            self.logger.info(f"Created ticket: {resp}")
            result = resp.get("result")
            # Add link to ticket
            result["link"] = (
                f"{self.authentication_config.service_now_base_url}/now/nav/ui/classic/params/target/{table_name}.do%3Fsys_id%3D{result['sys_id']}"
            )
            return result
        # if the instance is down due to hibernation you'll get 200 instead of 201
        elif response.status_code == 200:
            raise ProviderException(
                "ServiceNow instance is down, you need to restart the instance."
            )
        else:
            self.logger.info(f"Failed to create ticket: {response.text}")
            response.raise_for_status()

    def _notify_update(self, table_name: str, ticket_id: str, fingerprint: str):
        """
        Fetch an existing ticket and tag it with the given fingerprint.

        Args:
            table_name (str): The table the ticket lives in.
            ticket_id (str): The id of the ticket to fetch.
            fingerprint (str): Stored under ``"fingerprint"`` in the result.

        Returns:
            The ticket record with the fingerprint added.

        Raises:
            ProviderException: If the instance appears to be hibernating.
            requests.exceptions.HTTPError: If ServiceNow returns an error status.
        """
        url = f"{self.authentication_config.service_now_base_url}/api/now/table/{table_name}/{ticket_id}"
        # Basic auth is used when there is NO access token; the shared helpers
        # keep that rule in one place.
        response = requests.get(
            url,
            auth=self._get_auth(),
            headers=self._get_headers(),
            verify=False,
        )
        if response.status_code == 200:
            resp = response.text
            # a hibernating instance answers 200 with an HTML page instead of JSON
            if "Want to find out why instances hibernate?" in resp:
                raise ProviderException(
                    "ServiceNow instance is down, you need to restart the instance."
                )
            # else, we are ok
            else:
                resp = json.loads(resp)
                self.logger.info("Updated ticket", extra={"resp": resp})
                resp = resp.get("result")
                resp["fingerprint"] = fingerprint
                return resp
        else:
            self.logger.info("Failed to update ticket", extra={"resp": response.text})
            response.raise_for_status()
service_now_username, "password": service_now_password, }, ) provider = ServicenowProvider( context_manager, provider_id="servicenow", config=config ) def mock_get(*args, **kwargs): """ Mock topology responses using json files. """ class MockResponse: def __init__(self): self.ok = True self.status_code = 200 self.url = args[0] def json(self): if "cmdb_ci" in self.url: with open( os.path.join(os.path.dirname(__file__), "cmdb_ci.json") ) as f: return json.load(f) elif "cmdb_rel_type" in self.url: with open( os.path.join(os.path.dirname(__file__), "cmdb_rel_type.json") ) as f: return json.load(f) elif "cmdb_rel_ci" in self.url: with open( os.path.join(os.path.dirname(__file__), "cmdb_rel_ci.json") ) as f: return json.load(f) return {} return MockResponse() if mock_real_requests_with_json_data: with patch("requests.get", side_effect=mock_get): r = provider.pull_topology() else: r = provider.pull_topology() print(r) ================================================ FILE: keep/providers/signalfx_provider/__init__.py ================================================ ================================================ FILE: keep/providers/signalfx_provider/alerts_mock.py ================================================ ALERTS = { "simulate": { "payload": { "severity": "Critical", "statusExtended": "anomalous", "detectorUrl": "https://app.signalfx.com/#/detector/XXXX", "incidentId": "1234", "originatingMetric": "sf.org.log.numMessagesDroppedThrottle", "detectOnCondition": "when(A < threshold(1))", "messageBody": 'Rule "logs" in detector "logs" cleared at Thu, 29 Feb 2024 11:48:32 GMT.\n\nCurrent signal value for sf.org.log.numMessagesDroppedThrottle: 0\n\nSignal details:\n{sf_metric=sf.org.log.numMessagesDroppedThrottle, orgId=XXXX}', "inputs": { "A": { "value": "0", "fragment": "data(...A')", "_S2": {"value": "1", "fragment": "threshold(1)"}, }, "rule": "logs", "description": "The value of sf.org.log.numMessagesDroppedThrottle is below 1.", "messageTitle": "Manually resolved: 
@pydantic.dataclasses.dataclass
class SignalfxProviderAuthConfig:
    """
    Signalfx authentication configuration.

    Only ``sf_token`` is marked as required. Per the field descriptions,
    ``email``, ``password`` and ``org_id`` are needed only for webhook setup.
    Each field's ``metadata`` describes it (description, hint, whether it is
    required or sensitive).
    """

    # Name of the webhook integration managed by Keep. It has no type
    # annotation, so it is a plain class attribute rather than a config field.
    KEEP_SIGNALFX_WEBHOOK_INTEGRATION_NAME = "keep-signalfx-webhook-integration"

    # SignalFx token.
    sf_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "SignalFX token",
            "hint": "https://dev.splunk.com/observability/docs/administration/authtokens/",
            "sensitive": True,
        },
        default="",
    )
    # SignalFx realm (see the hint for the URL pattern); defaults to "eu0".
    realm: str = dataclasses.field(
        metadata={
            "required": False,
            "description": "SignalFX Realm",
            "sensitive": False,
            "hint": "https://api.{{realm}}.signalfx.com e.g. eu0",
        },
        default="eu0",
    )
    # SignalFx email; only needed for webhook setup.
    email: str = dataclasses.field(
        metadata={
            "required": False,
            "description": "SignalFX email. Required for setup webhook.",
            "sensitive": True,
            "hint": "https://dev.splunk.com/observability/reference/api/sessiontokens/latest",
        },
        default="",
    )
    # SignalFx password; only needed for webhook setup.
    password: str = dataclasses.field(
        metadata={
            "required": False,
            "description": "SignalFX password. Required for setup webhook.",
            "sensitive": True,
            "hint": "https://dev.splunk.com/observability/reference/api/sessiontokens/latest",
        },
        default="",
    )
    # SignalFx organization ID; only needed for webhook setup.
    org_id: str = dataclasses.field(
        metadata={
            "required": False,
            "description": "SignalFX organization ID. Required for setup webhook.",
            "sensitive": False,
            "hint": "https://dev.splunk.com/observability/reference/api/sessiontokens/latest",
        },
        default="",
    )
validate_config(self): self.authentication_config = SignalfxProviderAuthConfig( **self.config.authentication ) def dispose(self): pass def _get_alerts(self): headers = self._get_headers() # should also consider /v2/event/find but it looks like the same scehme # https://dev.splunk.com/observability/reference/api/retrieve_events_v2/latest#endpoint-retrieve-events-using-query response = requests.get(f"{self.api_url}/v2/incident", headers=headers) response.raise_for_status() incidents = response.json() # Map SignalFx alert data to AlertDto objects alerts = [] # TODO: incident may have more than one alert? for incident in incidents: try: alerts.append(self._format_alert_get_alert(incident)) except Exception as e: self.logger.error(f"Failed to format SignalFx alert: {e}") pass return alerts @staticmethod def sanitize_url(url: str) -> str: # SignalFx URLs are not always properly formatted # e.g. 'https://app.eu0.signalfx.com/#/detector/YYYYYY/edit?incidentId=XXXXX&is=manually resolved' # so Pyatnadic will raise an error if the URL is not properly formatted # remove the # from the URL parsed_url = urlparse(url.replace("#", "")) # quote the query quoted_query = quote(parsed_url.query) # reassemble the URL url = url.replace(parsed_url.query, quoted_query) return url def _format_alert_get_alert(self, incident: dict) -> AlertDto: # there is difference between webhook payload (_format_alert) # and alerts from API (get_alert) so we need to handle it separately last_alert = incident.get("events")[-1] severity = SignalfxProvider.SEVERITIES_MAP.get( incident.pop("severity").lower(), AlertSeverity.INFO ) status = SignalfxProvider.STATUS_MAP.get( incident.pop("anomalyState").lower(), AlertStatus.FIRING ) incident_id = incident.pop("incidentId") detector_id = incident.pop("detectorId") url = f"https://app.eu0.signalfx.com/#/detector/{detector_id}/edit?incidentId%3D{incident_id}" name = incident.pop("detectLabel") description = incident.pop("displayBody") lastReceived = 
def setup_webhook(
    self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
):
    """
    Create (or update) the Keep webhook integration in SignalFx and attach it
    as a notification to every rule of every detector.

    The flow has three steps:
      1. Log in with email/password/org id to obtain a session access token.
      2. Look up an existing integration by name and PUT over it, or POST a
         new one.
      3. Walk all detectors and append the webhook to each rule that does not
         reference it yet, then PUT the modified detector back.

    Args:
        tenant_id (str): Not used by this method.
        keep_api_url (str): URL SignalFx should deliver alerts to.
        api_key (str): Keep API key, sent by SignalFx as HTTP basic auth
            (``api_key:<key>`` base64-encoded).
        setup_alerts (bool): Not used by this method.

    Returns:
        None: always. Failures are logged and abort the setup early.
    """
    # see: https://dev.splunk.com/observability/reference/api/integrations/latest#endpoint-create-integration
    self.logger.info("Setting up SignalFx webhook integration")
    email = self.config.authentication.get("email")
    password = self.config.authentication.get("password")
    org_id = self.config.authentication.get("org_id")
    # all are required for webhook setup
    if not email or not password or not org_id:
        self.logger.error(
            "SignalFx email, password and organization ID are required for webhook setup"
        )
        return None

    # 1. First - get a session token. The integration and detector calls
    #    below are made with this user session token rather than with the
    #    org access token returned by _get_headers().
    #    https://dev.splunk.com/observability/reference/api/sessiontokens/latest
    headers = self._get_headers()
    session_payload = {
        "email": email,
        "password": password,
        "organizationId": org_id,
    }
    response = requests.post(
        f"{self.api_url}/v2/session",
        headers=headers,
        json=session_payload,
    )
    try:
        response.raise_for_status()
    # catch any HTTP errors
    except requests.exceptions.HTTPError as e:
        self.logger.error(
            f"Failed to get SignalFx session token: {e.response.text}"
        )
        return None

    # this is the token we need to setup the webhook
    # see: https://dev.splunk.com/observability/reference/api/sessiontokens/latest
    session_access_token = response.json().get("accessToken")
    if not session_access_token:
        # Without a token every following call would be sent unauthenticated.
        self.logger.error(
            "Failed to get SignalFx session token: no accessToken in response"
        )
        return None

    # 2. Now let's check if the webhook integration already exists
    response = requests.get(f"{self.api_url}/v2/integration", headers=headers)
    try:
        response.raise_for_status()
    # catch any HTTP errors
    except requests.exceptions.HTTPError as e:
        self.logger.error(
            f"Failed to get SignalFx webhook integration: {e.response.text}"
        )
        return None

    integration_id = None
    integrations = response.json().get("results", [])
    for integration in integrations:
        # check if the webhook integration already exists
        if (
            integration.get("name")
            == SignalfxProviderAuthConfig.KEEP_SIGNALFX_WEBHOOK_INTEGRATION_NAME
        ):
            # the integration already exists, let's patch it
            self.logger.info("SignalFx webhook integration already exists")
            integration_id = integration.get("id")
            break

    auth_header = f"api_key:{api_key}"
    auth_header = base64.b64encode(auth_header.encode()).decode()
    webhook_payloads = {
        "name": SignalfxProviderAuthConfig.KEEP_SIGNALFX_WEBHOOK_INTEGRATION_NAME,
        "type": "Webhook",
        "enabled": True,
        "url": keep_api_url,
        # authentication with Keep api key
        "headers": {
            "Authorization": f"Basic {auth_header}",
        },
    }
    # From here on, authenticate with the session token.
    headers = {
        "X-SF-TOKEN": session_access_token,
    }
    # if integration_id is set, we need to update the existing integration
    if integration_id:
        # update the existing integration
        response = requests.put(
            f"{self.api_url}/v2/integration/{integration_id}",
            headers=headers,
            json=webhook_payloads,
        )
    else:
        response = requests.post(
            f"{self.api_url}/v2/integration",
            headers=headers,
            json=webhook_payloads,
        )

    # Check the HTTP status BEFORE touching the body: an error response is
    # not guaranteed to be JSON, and parsing it first would raise instead of
    # reaching the error handling below.
    try:
        response.raise_for_status()
    # catch any HTTP errors
    except requests.exceptions.HTTPError as e:
        self.logger.error(
            f"Failed to create SignalFx webhook integration: {e.response.text}"
        )
        return None

    if not integration_id:
        # keep the integration id for later
        integration_id = response.json().get("id")

    self.logger.info("SignalFx webhook integration setup complete")

    # 3. Now subscribe webhook to all detectors
    #    https://docs.splunk.com/observability/en/admin/notif-services/webhook.html
    response = requests.get(f"{self.api_url}/v2/detector", headers=headers)
    try:
        response.raise_for_status()
    # catch any HTTP errors
    except requests.exceptions.HTTPError as e:
        self.logger.error(f"Failed to get SignalFx detectors: {e.response.text}")
        return None

    detectors = response.json().get("results", [])
    # subscribe the webhook to all detectors
    for detector in detectors:
        self.logger.info(
            "Updating SignalFx detector",
            extra={
                "detector_id": detector.get("id"),
                "detector_name": detector.get("name"),
            },
        )
        detector_id = detector.get("id")
        rules = detector.get("rules", [])
        detector_updated = False
        for rule in rules:
            notifications = rule.get("notifications", [])
            keep_installed = integration_id in [
                notification.get("credentialId") for notification in notifications
            ]
            if not keep_installed:
                # add the webhook as a notification to the rule
                self.logger.info(
                    "Adding SignalFx webhook to detector rule",
                    extra={
                        "rule_id": rule.get("id"),
                        "rule_name": rule.get("name"),
                    },
                )
                # Mutates the list that lives inside `detector`, so the PUT
                # below sends the updated rules.
                notifications.append(
                    {
                        "credentialId": integration_id,
                        "type": "Webhook",
                    }
                )
                detector_updated = True

        # if at least one rule was updated, update the detector
        if detector_updated:
            # update the detector
            # https://dev.splunk.com/observability/reference/api/detectors/latest#endpoint-update-single-detector
            self.logger.info(
                "Updating SignalFx detector",
                extra={
                    "detector_id": detector_id,
                    "detector_name": detector.get("name"),
                },
            )
            response = requests.put(
                f"{self.api_url}/v2/detector/{detector_id}",
                headers=headers,
                json=detector,
            )
            try:
                response.raise_for_status()
                self.logger.info(
                    "SignalFx detector updated",
                    extra={
                        "detector_id": detector_id,
                        "detector_name": detector.get("name"),
                    },
                )
            # catch any HTTP errors
            except requests.exceptions.HTTPError as e:
                self.logger.error(
                    f"Failed to subscribe SignalFx detector {detector_id} to webhook: {e.response.text}"
                )
                return None

    self.logger.info("SignalFx webhook integration setup complete")
class Signl4Provider(BaseProvider):
    """Trigger SIGNL4 alerts."""

    PROVIDER_DISPLAY_NAME = "SIGNL4"
    PROVIDER_CATEGORY = ["Incident Management"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="signl4:create",
            description="Create SIGNL4 alerts",
            mandatory=True,
            alias="Create alerts",
        ),
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse ``config.authentication`` into a ``Signl4ProviderAuthConfig``."""
        self.authentication_config = Signl4ProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Check the ``signl4:create`` scope by actually sending a test alert.

        Returns:
            dict: ``{"signl4:create": True}`` on success, otherwise the scope
            maps to the error message of the failure.
        """
        scopes = {}
        self.logger.info("Validating scopes")
        try:
            # NOTE: this creates a real alert in the SIGNL4 team.
            self._notify(
                user="John Doe",
                title="Simple test alert from Keep",
                message="Simple alert showing context with name: John Doe. Please ignore.",
            )
            scopes["signl4:create"] = True
        except Exception as e:
            self.logger.exception("Failed to create SIGNL4 alert")
            scopes["signl4:create"] = str(e)
        return scopes

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(
        self,
        title: str | None = None,
        message: str | None = None,
        user: str | None = None,
        s4_external_id: str | None = None,
        s4_status: S4Status = S4Status.NEW,
        s4_service: str | None = None,
        s4_location: str | None = None,
        s4_alerting_scenario: S4AlertingScenario = S4AlertingScenario.DEFAULT,
        s4_filtering: bool = False,
        **kwargs: dict,
    ):
        """
        Create a SIGNL4 alert.

        Alert / Incident is created via the SIGNL4 Webhook API
        (https://connect.signl4.com/webhook/docs/index.html).

        Args:
            title (str): Alert title.
            message (str): Alert message.
            user (str): User name.
            s4_external_id (str): External ID.
            s4_status (S4Status): Alert status.
            s4_service (str): Service name.
            s4_location (str): Location.
            s4_alerting_scenario (S4AlertingScenario): Alerting scenario.
            s4_filtering (bool): Filtering.
            **kwargs (dict): Additional alert data, merged into the payload
                last (so it can override the fields above).

        Raises:
            Exception: if the request fails or SIGNL4 answers with anything
                other than HTTP 201.
        """
        # Alert data
        alert_data = {
            "title": title,
            "message": message,
            "user": user,
            "X-S4-ExternalID": s4_external_id,
            "X-S4-Status": s4_status,
            "X-S4-Service": s4_service,
            "X-S4-Location": s4_location,
            "X-S4-AlertingScenario": s4_alerting_scenario,
            "X-S4-Filtering": s4_filtering,
            "X-S4-SourceSystem": "Keep",
            **kwargs,
        }

        # SIGNL4 webhook URL (the secret is part of the path)
        webhook_url = (
            "https://connect.signl4.com/webhook/"
            + self.authentication_config.signl4_integration_secret
        )

        try:
            result = requests.post(url=webhook_url, json=alert_data)

            if result.status_code == 201:
                # Success
                self.logger.info(result.text)
            else:
                # Error. No exception is active at this point, so log with
                # error() - exception() would attach an empty traceback.
                self.logger.error("Error: " + str(result.status_code))
                raise Exception("Error: " + str(result.status_code))
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not intercepted and logged.
        except Exception:
            self.logger.exception("Failed to create SIGNL4 alert")
            raise
class Site24X7Provider(BaseProvider):
    """Install Webhooks and receive alerts from Site24x7."""

    PROVIDER_SCOPES = [
        ProviderScope(
            name="authenticated",
            description="User is Authenticated",
            mandatory=True,
            mandatory_for_webhook=True,
            alias="Rules Reader",
        ),
        ProviderScope(
            name="valid_tld",
            description="TLD is amongst the list [.com | .eu | .com.cn | .in | .com.au | .jp]",
            mandatory=True,
            mandatory_for_webhook=True,
            alias="Valid TLD",
        ),
    ]
    PROVIDER_CATEGORY = ["Monitoring"]

    # Site24x7 monitor status -> Keep severity
    SEVERITIES_MAP = {
        "DOWN": AlertSeverity.WARNING,
        "TROUBLE": AlertSeverity.HIGH,
        "UP": AlertSeverity.INFO,
        "CRITICAL": AlertSeverity.CRITICAL,
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """
        Dispose the provider.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for Site24x7 provider.
        """
        self.authentication_config = Site24X7ProviderAuthConfig(
            **self.config.authentication
        )

    def __get_url(self, paths: List[str] = None, query_params: dict = None, **kwargs):
        """
        Helper method to build the url for Site24x7 api requests.

        Args:
            paths: Path segments appended after ``/api/``; joined with "/".
            query_params: Optional query parameters, url-encoded onto the URL.
            **kwargs: Accepted and ignored.

        Example:
            paths = ["integration", "webhooks"]
            query_params = {"key": "value"}
            url = self.__get_url(paths, query_params)
            # url = https://www.site24x7<tld>/api/integration/webhooks?key=value
        """
        # `None` instead of a shared mutable `[]` default.
        paths = paths or []

        url = urljoin(
            f"https://www.site24x7{self.authentication_config.zohoAccountTLD}/api/",
            "/".join(str(path) for path in paths),
        )

        # add query params
        if query_params:
            url = f"{url}?{urlencode(query_params)}"

        return url

    def __get_headers(self):
        """
        Getting the access token from Zoho API using the permanent refresh token.

        Returns:
            dict: ``Authorization: Bearer <access token>`` header.

        Raises:
            Exception: if the token endpoint response carries no access token.
        """
        data = {
            "client_id": self.authentication_config.zohoClientId,
            "client_secret": self.authentication_config.zohoClientSecret,
            "refresh_token": self.authentication_config.zohoRefreshToken,
            "grant_type": "refresh_token",
        }
        token_payload = requests.post(
            f"https://accounts.zoho{self.authentication_config.zohoAccountTLD}/oauth/v2/token",
            data=data,
        ).json()

        access_token = (
            token_payload.get("access_token")
            if isinstance(token_payload, dict)
            else None
        )
        if not access_token:
            # Surface a readable error instead of a bare KeyError. Only the
            # "error" field is included so no credential material is echoed.
            reason = (
                token_payload.get("error", "no access_token in response")
                if isinstance(token_payload, dict)
                else "unexpected response format"
            )
            raise Exception(f"Failed to obtain Zoho access token: {reason}")

        return {
            "Authorization": f"Bearer {access_token}",
        }

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Validate the provider scopes by listing monitors.

        Returns:
            dict: ``authenticated`` is True on HTTP 200, otherwise a description
            of the failure; ``valid_tld`` is always reported as True.
        """
        try:
            response = requests.get(
                self.__get_url(paths=["monitors"]), headers=self.__get_headers()
            )
        except Exception as e:
            # Token exchange or the request itself failed: report it as a scope
            # error instead of letting validation blow up.
            self.logger.exception("Error while authenticating user")
            return {"authenticated": str(e), "valid_tld": True}

        if response.status_code == 401:
            authentication_scope = response.json()
            self.logger.error(
                "Failed to authenticate user",
                extra={"response": authentication_scope},
            )
        elif response.status_code == 200:
            authentication_scope = True
            self.logger.info("Authenticated user successfully")
        else:
            authentication_scope = (
                f"Error while authenticating user, {response.status_code}"
            )
            self.logger.error(
                "Error while authenticating user",
                extra={"status_code": response.status_code},
            )
        return {"authenticated": authentication_scope, "valid_tld": True}

    def setup_webhook(
        self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
    ):
        """
        Register a webhook named "KeepWebhook" in Site24x7 that posts to Keep.

        Args:
            tenant_id (str): Not used by this method.
            keep_api_url (str): URL the webhook will call.
            api_key (str): Keep API key, sent as the ``X-API-KEY`` header.
            setup_alerts (bool): Not used by this method.

        Raises:
            Exception: with Site24x7's ``message`` when creation fails.
        """
        webhook_data = {
            "method": "P",
            "down_alert": True,
            "is_poller_webhook": False,
            "type": 8,
            "alert_tags_id": [],
            "custom_headers": [{"name": "X-API-KEY", "value": api_key}],
            "url": keep_api_url,
            "timeout": 30,
            "selection_type": 0,
            "send_in_json_format": True,
            "auth_method": "B",
            "trouble_alert": True,
            "critical_alert": True,
            "send_incident_parameters": True,
            "service_status": 0,
            "name": "KeepWebhook",
            "manage_tickets": False,
        }
        response = requests.post(
            self.__get_url(paths=["integration/webhooks"]),
            json=webhook_data,
            headers=self.__get_headers(),
        )
        if not response.ok:
            response_json = response.json()
            self.logger.error(
                "Error while creating webhook",
                extra={
                    "response": response_json,
                },
            )
            raise Exception(response_json["message"])
        else:
            self.logger.info("Webhook created successfully")

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto:
        """
        Map a Site24x7 webhook payload onto an ``AlertDto``.

        Missing keys fall back to empty strings; a missing ``STATUS`` is
        treated as "DOWN". A status outside ``SEVERITIES_MAP`` yields
        ``severity=None``.
        """
        return AlertDto(
            url=event.get("MONITORURL", ""),
            lastReceived=event.get("INCIDENT_TIME_ISO", ""),
            description=event.get("INCIDENT_REASON", ""),
            name=event.get("MONITORNAME", ""),
            id=event.get("MONITOR_ID", ""),
            severity=Site24X7Provider.SEVERITIES_MAP.get(event.get("STATUS", "DOWN")),
        )

    def _get_alerts(self) -> list[AlertDto]:
        """
        Fetch the alert log from Site24x7.

        Returns:
            list[AlertDto]: one entry per item in the response's ``data`` list.

        Raises:
            Exception: when the API does not answer with HTTP 200.
        """
        response = requests.get(
            self.__get_url(paths=["alert_logs"]), headers=self.__get_headers()
        )
        if response.status_code == 200:
            alerts = []
            response = response.json()
            for alert in response["data"]:
                alerts.append(
                    AlertDto(
                        name=alert["display_name"],
                        title=alert["msg"],
                        startedAt=alert["sent_time"],
                    )
                )
            return alerts
        else:
            self.logger.error(
                "Failed to get alerts", extra={"response": response.json()}
            )
            raise Exception("Could not get alerts")
class SlackProvider(BaseProvider):
    """Send alert message to Slack."""

    PROVIDER_DISPLAY_NAME = "Slack"
    OAUTH2_URL = os.environ.get("SLACK_OAUTH2_URL")
    SLACK_CLIENT_ID = os.environ.get("SLACK_CLIENT_ID")
    SLACK_CLIENT_SECRET = os.environ.get("SLACK_CLIENT_SECRET")
    SLACK_API = "https://slack.com/api"
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """
        Parse the authentication config and require at least one credential.

        Raises:
            Exception: if neither a webhook url nor an access token is set.
        """
        self.authentication_config = SlackProviderAuthConfig(
            **self.config.authentication
        )
        if (
            not self.authentication_config.webhook_url
            and not self.authentication_config.access_token
        ):
            raise Exception("Slack webhook url OR Slack access token is required")

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    @staticmethod
    def oauth2_logic(**payload) -> dict:
        """
        Logic for handling oauth2 callback.

        Exchanges the callback ``code`` for an access token via
        ``oauth.v2.access``.

        Args:
            payload (dict): The payload from the oauth2 callback.

        Returns:
            dict: The provider configuration: ``access_token`` and, when the
            response names a team, ``provider_name``.

        Raises:
            Exception: if no code was provided or Slack rejects the exchange.
        """
        code = payload.get("code")
        if not code:
            raise Exception("No code provided")
        exchange_request_payload = {
            **payload,
            "client_id": SlackProvider.SLACK_CLIENT_ID,
            "client_secret": SlackProvider.SLACK_CLIENT_SECRET,
        }
        response = requests.post(
            f"{SlackProvider.SLACK_API}/oauth.v2.access",
            data=exchange_request_payload,
        )
        response_json = response.json()
        if not response.ok or not response_json.get("ok"):
            raise Exception(
                response_json.get("error"),
            )
        new_provider_info = {"access_token": response_json.get("access_token")}

        team_name = response_json.get("team", {}).get("name")
        if team_name:
            # replacing dots to prevent problems in workflows
            new_provider_info["provider_name"] = team_name.replace(".", "")

        return new_provider_info

    def _notify_reaction(self, channel: str, emoji: str, timestamp: str):
        """
        Add an emoji reaction to a message via ``reactions.add``.

        Args:
            channel (str): Channel of the message.
            emoji (str): Reaction name.
            timestamp (str): Timestamp of the message to react to.

        Returns:
            dict: The parsed Slack response.

        Raises:
            ProviderException: without an access token or on an HTTP error.
        """
        if not self.authentication_config.access_token:
            raise ProviderException("Access token is required to notify reaction")

        self.logger.info(
            "Notifying reaction to Slack using",
            extra={
                "emoji": emoji,
                "channel": channel,
                "timestamp": timestamp,
            },
        )
        payload = {
            "channel": channel,
            "token": self.authentication_config.access_token,
            "name": emoji,
            "timestamp": timestamp,
        }
        response = requests.post(
            f"{SlackProvider.SLACK_API}/reactions.add",
            data=payload,
        )
        if not response.ok:
            raise ProviderException(
                f"Failed to notify reaction to Slack: {response.text}"
            )
        self.logger.info("Reaction notified to Slack")
        return response.json()

    def _notify(
        self,
        message="",
        blocks=[],
        channel="",
        slack_timestamp="",
        thread_timestamp="",
        attachments=[],
        username="",
        notification_type="message",
        **kwargs: dict,
    ):
        """
        Notify alert message to Slack using the Slack Incoming Webhook API
        (https://api.slack.com/messaging/webhooks) or, when only an access
        token is configured, the Web API (chat.postMessage / chat.update).

        Args:
            message (str): The content of the message.
            blocks (list): The blocks of the message.
            channel (str): The channel to send the message
            slack_timestamp (str): The timestamp of the message to update
            thread_timestamp (str): The timestamp of the thread to send the message
            attachments (list): The attachments of the message.
            username (str): The username of the message.
            notification_type (str): The type of notification. "reaction" adds
                ``message`` as an emoji reaction to ``thread_timestamp``.

        Returns:
            dict | None: ``{"slack_timestamp": ...}`` for messages, the Slack
            response for reactions, or None if no credential is configured.

        Raises:
            ProviderException: on missing content/channel or a Slack failure.
        """
        if notification_type == "reaction":
            return self._notify_reaction(
                channel=channel,
                emoji=message,
                timestamp=thread_timestamp,
            )

        notify_data = None
        self.logger.info(
            f"Notifying message to Slack using {'webhook' if self.authentication_config.webhook_url else 'access token'}",
            extra={
                "slack_message": message,
                "blocks": blocks,
                "channel": channel,
            },
        )
        if not message:
            if not blocks and not attachments:
                raise ProviderException(
                    "Message is required - see for example https://github.com/keephq/keep/blob/main/examples/workflows/slack_basic.yml#L16"
                )
        payload = OrderedDict(
            {
                "channel": channel,
            }
        )
        if message:
            payload["text"] = message
        if blocks:
            payload["blocks"] = (
                json.dumps(blocks)
                if isinstance(blocks, dict) or isinstance(blocks, list)
                else blocks
            )
        if attachments:
            # A value that is not a dict/list (e.g. an already serialised
            # string) is passed through as-is. This previously fell back to
            # `blocks`, which dropped or replaced such attachments.
            payload["attachments"] = (
                json.dumps(attachments)
                if isinstance(attachments, dict) or isinstance(attachments, list)
                else attachments
            )
        if username:
            payload["username"] = username

        if self.authentication_config.webhook_url:
            # If attachments are present, we need to send them as the payload with nothing else
            # Also, do not encode the payload as json, but as x-www-form-urlencoded
            # Only reference I found for it is: https://getkeep.slack.com/services/B082F60L9GX?added=1 and
            # https://stackoverflow.com/questions/42993602/slack-chat-postmessage-attachment-gives-no-text
            if payload.get("attachments", None):
                payload["attachments"] = attachments
                response = requests.post(
                    self.authentication_config.webhook_url,
                    data={"payload": json.dumps(payload)},
                    headers={"Content-Type": "application/x-www-form-urlencoded"},
                )
            else:
                response = requests.post(
                    self.authentication_config.webhook_url,
                    json=payload,
                )
            if not response.ok:
                raise ProviderException(
                    f"{self.__class__.__name__} failed to notify alert message to Slack: {response.text}"
                )
            # Incoming webhooks do not return the message ts; use "now".
            notify_data = {"slack_timestamp": utcnowtimestamp()}
        elif self.authentication_config.access_token:
            if not channel:
                raise ProviderException("Channel is required (E.g. C12345)")
            self.logger.info(
                "Adding access token to payload",
                extra={
                    "tenant_id": self.context_manager.tenant_id,
                    "workflow_id": self.context_manager.workflow_id,
                    "provider_id": self.provider_id,
                    "access_token_truncated": self.authentication_config.access_token[
                        :5
                    ],
                },
            )
            payload["token"] = self.authentication_config.access_token
            if slack_timestamp == "" and thread_timestamp == "":
                self.logger.info("Sending a new message to Slack")
                method = "chat.postMessage"
            else:
                self.logger.info(f"Updating Slack message with ts: {slack_timestamp}")
                if slack_timestamp:
                    # Edit an existing message in place.
                    payload["ts"] = slack_timestamp
                    method = "chat.update"
                else:
                    # Reply inside the given thread.
                    method = "chat.postMessage"
                    payload["thread_ts"] = thread_timestamp

            # The JSON request below takes the raw attachments structure.
            if payload.get("attachments", None):
                payload["attachments"] = attachments

            if "token" not in payload:
                self.logger.warning(
                    "Token is not in payload, adding it",
                    extra={
                        "tenant_id": self.context_manager.tenant_id,
                        "workflow_id": self.context_manager.workflow_id,
                        "provider_id": self.provider_id,
                    },
                )
                payload["token"] = self.authentication_config.access_token

            response = requests.post(
                f"{SlackProvider.SLACK_API}/{method}",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {self.authentication_config.access_token}",
                },
            )

            response_json = response.json()
            # The Web API reports failures as HTTP 200 with ok=false.
            if not response.ok or not response_json.get("ok"):
                raise ProviderException(
                    f"Failed to notify alert message to Slack: {response_json.get('error')}"
                )
            notify_data = {"slack_timestamp": response_json["ts"]}
        self.logger.info("Message notified to Slack")
        return notify_data
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os slack_webhook_url = os.environ.get("SLACK_WEBHOOK_URL") # Initalize the provider and provider config context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) access_token = os.environ.get("SLACK_ACCESS_TOKEN") webhook_url = os.environ.get("SLACK_WEBHOOK_URL") if access_token: config = { "authentication": {"access_token": access_token}, } elif webhook_url: config = { "authentication": {"webhook_url": webhook_url}, } # you need some creds else: raise Exception("please provide either access token or webhook url") provider = ProvidersFactory.get_provider( context_manager, provider_id="slack-keephq", provider_type="slack", provider_config=config, ) provider.notify( channel="C04P7QSG692", attachments=[ { "fallback": "Plain-text summary of the attachment.", "color": "#2eb886", "title": "Slack API Documentation", "title_link": "https://api.slack.com/", "text": "Optional text that appears within the attachment", "footer": "Slack API", "footer_icon": "https://platform.slack-edge.com/img/default_application_icon.png", } ], ) ================================================ FILE: keep/providers/smtp_provider/__init__.py ================================================ ================================================ FILE: keep/providers/smtp_provider/smtp_provider.py ================================================ """ SMTP Provider is a class that provides the functionality to send emails using SMTP protocol. 
class SmtpProvider(BaseProvider):
    """Send emails through an SMTP server."""

    PROVIDER_SCOPES = [
        ProviderScope(
            name="send_email",
            description="Send email using SMTP protocol",
            mandatory=True,
            alias="Send Email",
        )
    ]
    PROVIDER_CATEGORY = ["Collaboration"]
    PROVIDER_TAGS = ["messaging"]
    PROVIDER_DISPLAY_NAME = "SMTP"

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        pass

    def validate_config(self):
        """Parse ``config.authentication`` into an ``SmtpProviderAuthConfig``."""
        self.authentication_config = SmtpProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Validate that the scopes provided are correct.

        Opens a connection (including TLS and login, when configured) and
        closes it again.

        Returns:
            dict: ``{"send_email": True}`` or the error message on failure.
        """
        try:
            smtp = self.generate_smtp_client()
            smtp.quit()
            return {"send_email": True}
        except Exception as e:
            return {"send_email": str(e)}

    def generate_smtp_client(self):
        """
        Generate an SMTP client.

        Returns:
            SMTP | SMTP_SSL: a connected client, logged in when both a username
            and a password are configured. The caller must close it.

        Raises:
            Exception: for an unknown encryption value, plus whatever
                connecting, STARTTLS or login raise.
        """
        smtp_username = self.authentication_config.smtp_username
        smtp_password = self.authentication_config.smtp_password
        smtp_server = self.authentication_config.smtp_server
        smtp_port = self.authentication_config.smtp_port
        encryption = self.authentication_config.encryption

        if encryption == "SSL":
            smtp = SMTP_SSL(smtp_server, smtp_port)
        elif encryption == "TLS" or encryption == "None":
            smtp = SMTP(smtp_server, smtp_port)
        else:
            raise Exception(f"Invalid encryption: {encryption}")

        try:
            if encryption == "TLS":
                smtp.starttls()

            if smtp_username and smtp_password:
                smtp.login(smtp_username, smtp_password)
        except Exception:
            # Do not leak the open socket when the handshake or login fails.
            smtp.close()
            raise

        return smtp

    def send_email(
        self,
        from_email: str,
        from_name: str,
        to_email: str | list,
        subject: str,
        body: str = None,
        html: str = None,
    ):
        """
        Send an email using SMTP protocol.

        Args:
            from_email (str): Sender address.
            from_name (str): Sender display name; "" to send the bare address.
            to_email (str | list): One recipient or a list of recipients.
            subject (str): Subject line.
            body (str): Plain-text content, used when ``html`` is not given.
            html (str): HTML content; takes precedence over ``body``.

        Raises:
            ValueError: if neither ``body`` nor ``html`` is provided.
        """
        msg = MIMEMultipart()
        if from_name == "":
            msg["From"] = from_email
        else:
            msg["From"] = f"{from_name} <{from_email}>"

        if isinstance(to_email, str):
            msg["To"] = to_email
        else:
            msg["To"] = ", ".join(to_email)
        msg["Subject"] = subject

        # Prefer HTML content if provided, otherwise use plain text
        if html:
            msg.attach(MIMEText(html, "html"))
        elif body:
            msg.attach(MIMEText(body, "plain"))
        else:
            raise ValueError("Either 'body' or 'html' must be provided")

        smtp = self.generate_smtp_client()
        try:
            smtp.sendmail(from_email, to_email, msg.as_string())
        except Exception:
            # Release the connection without a QUIT round-trip, which could
            # itself fail and mask the original error.
            smtp.close()
            raise
        smtp.quit()

    def _notify(
        self,
        from_email: str,
        from_name: str,
        to_email: str | list,
        subject: str,
        body: str = None,
        html: str = None,
        **kwargs
    ):
        """
        Send an email using SMTP protocol.

        Returns:
            dict: ``from``, ``to`` and ``subject`` of the sent mail, plus
            ``html`` / ``body`` when they were provided.
        """
        self.send_email(from_email, from_name, to_email, subject, body, html)
        # Return the notification details
        result = {"from": from_email, "to": to_email, "subject": subject}
        if html:
            result["html"] = html
        if body:
            result["body"] = body
        return result
""" import dataclasses import typing import pydantic from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization from snowflake.connector import connect from snowflake.connector.connection import SnowflakeConnection from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig from keep.providers.providers_factory import ProvidersFactory @pydantic.dataclasses.dataclass class SnowflakeProviderAuthConfig: user: str = dataclasses.field( metadata={"required": True, "description": "Snowflake user"} ) account: str = dataclasses.field( metadata={"required": True, "description": "Snowflake account"} ) pkey: str = dataclasses.field( metadata={ "required": True, "description": "Snowflake private key", "sensitive": True, } ) pkey_passphrase: typing.Optional[str] = dataclasses.field( metadata={ "required": False, "description": "Snowflake password", "sensitive": True, }, default=None, ) class SnowflakeProvider(BaseProvider): """Enrich alerts with data from Snowflake.""" PROVIDER_DISPLAY_NAME = "Snowflake" PROVIDER_CATEGORY = ["Database"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._client = None @property def client(self) -> SnowflakeConnection: if self._client is None: self._client = self.__generate_client() return self._client def __generate_client(self) -> SnowflakeConnection: """ Generates a Snowflake connection. Returns: SnowflakeConnection: The connection to Snowflake. 
def dispose(self):
    """
    Close the Snowflake connection, if one was ever opened.

    Any error raised while closing is logged and swallowed so that disposing
    a provider never fails the caller.
    """
    # Inspect the cached attribute directly: the `client` property creates a
    # connection on first access, so using it here would connect to Snowflake
    # only to close the connection again.
    if self._client is None:
        return
    try:
        self._client.close()
    except Exception:
        self.logger.exception("Error closing Snowflake connection")
    finally:
        # Drop the reference so a later `client` access builds a fresh connection
        self._client = None
ProviderConfigException: private key """ self.authentication_config = SnowflakeProviderAuthConfig( **self.config.authentication ) def _query(self, query: str, **kwargs: dict): """ Query snowflake using the given query Args: query (str): query to execute Returns: list[tuple] | list[dict]: results of the query """ cursor = self.client.cursor() return cursor.execute(query.format(**kwargs)).fetchall() if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os snowflake_private_key = os.environ.get("SNOWFLAKE_PRIVATE_KEY") snowflake_account = os.environ.get("SNOWFLAKE_ACCOUNT") config = { "id": "snowflake-prod", "authentication": { "user": "dbuser", "account": snowflake_account, "pkey": snowflake_private_key, }, } provider = ProvidersFactory.get_provider( context_manager, provider_id="snowflake", provider_type="snowflake", provider_config=config, ) result = provider.query( "select * from {table} limit 10", table="TEST_DB.PUBLIC.CUSTOMERS" ) print(result) ================================================ FILE: keep/providers/splunk_provider/__init__.py ================================================ ================================================ FILE: keep/providers/splunk_provider/splunk_provider.py ================================================ import dataclasses import datetime import json import logging import time from xml.etree.ElementTree import ParseError import pydantic from splunklib.binding import AuthenticationError, HTTPError from splunklib.client import connect from keep.api.models.alert import AlertDto, AlertSeverity from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory 
import ProvidersFactory from keep.validation.fields import NoSchemeUrl, UrlPort @pydantic.dataclasses.dataclass class SplunkProviderAuthConfig: api_key: str = dataclasses.field( metadata={ "required": True, "description": "Splunk API Key", "sensitive": True, } ) host: NoSchemeUrl = dataclasses.field( metadata={ "description": "Splunk Host (default is localhost)", "validation": "no_scheme_url", }, default="localhost", ) port: UrlPort = dataclasses.field( metadata={"description": "Splunk Port (default is 8089)", "validation": "port"}, default=8089, ) verify: bool = dataclasses.field( metadata={ "description": "Enable SSL verification", "hint": "An `https` protocol will be used if enabled.", "type": "switch", }, default=True, ) username: str = dataclasses.field( metadata={ "description": "The username connected with the API key/token provided.", "required": False, }, default="", ) class SplunkProvider(BaseProvider): """Pull alerts and query incidents from Splunk.""" PROVIDER_DISPLAY_NAME = "Splunk" PROVIDER_SCOPES = [ ProviderScope( name="list_all_objects", description="The user can get all the alerts", mandatory=True, alias="List all Alerts", ), ProviderScope( name="edit_own_objects", description="The user can edit and add webhook to saved_searches", mandatory=True, alias="Needed to connect to webhook", ), ] FINGERPRINT_FIELDS = ["exception", "logger", "service"] PROVIDER_CATEGORY = ["Monitoring"] SEVERITIES_MAP = { "LOW": AlertSeverity.LOW, "INFO": AlertSeverity.INFO, "WARNING": AlertSeverity.WARNING, "ERROR": AlertSeverity.HIGH, "CRITICAL": AlertSeverity.CRITICAL, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def __debug_fetch_users_response(self): try: import requests from splunklib.client import PATH_USERS response = requests.get( f"https://{self.authentication_config.host}:{self.authentication_config.port}/services/{PATH_USERS}", headers={ "Authorization": 
def validate_scopes(self) -> dict[str, bool | str]:
    """
    Check which of PROVIDER_SCOPES are granted to the configured Splunk user.

    Connects to Splunk with the configured token, collects the capabilities
    (own and imported) of every role of the relevant user(s) and matches them
    against the scope names.

    Returns:
        dict[str, bool | str]: scope name mapped to True when granted,
        "NOT_FOUND" when missing, or an error marker (AUTHENTICATION_ERROR,
        HTTP_ERROR (<status>), CONNECTION_REFUSED, PARSE_ERROR,
        UNKNOWN_ERROR) applied to every scope when validation itself failed.
    """
    self.logger.info("Validating scopes for Splunk provider")
    tenant_id = self.context_manager.tenant_id

    def _all_scopes(marker: str) -> dict[str, bool | str]:
        # Same marker for every scope: used when validation could not run
        return {scope.name: marker for scope in self.PROVIDER_SCOPES}

    validated_scopes = {}
    try:
        # Log only non-secret connection details; the auth config object
        # carries the API key and must not end up in log records.
        self.logger.debug(
            "Connecting to Splunk",
            extra={
                "host": self.authentication_config.host,
                "port": self.authentication_config.port,
                "verify": self.authentication_config.verify,
                "username": self.authentication_config.username,
                "tenant_id": tenant_id,
            },
        )
        service = connect(
            token=self.authentication_config.api_key,
            host=self.authentication_config.host,
            port=self.authentication_config.port,
            scheme="https" if self.authentication_config.verify else "http",
            verify=self.authentication_config.verify,
        )
        self.logger.debug(
            "Connected to Splunk",
            extra={"service": service, "tenant_id": tenant_id},
        )

        if not self.authentication_config.verify:
            self.logger.warning(
                "SSL verification is disabled - connection is not secure",
                extra={
                    "host": self.authentication_config.host,
                    "tenant_id": tenant_id,
                },
            )

        all_permissions = set()
        t = time.time()

        # A token is coupled to a user, so that user's permissions are what
        # matters. When no username is configured the code falls back to the
        # union of ALL users' capabilities, which may over-report scopes.
        if self.authentication_config.username:
            self.logger.info(
                "Validating scopes for Splunk provider with username",
                extra={
                    "username": self.authentication_config.username,
                    "tenant_id": tenant_id,
                },
            )
            users = [service.users[self.authentication_config.username]]
        else:
            self.logger.info(
                "Validating scopes for Splunk provider without username",
                extra={"tenant_id": tenant_id},
            )
            users_count = len(service.users)
            if users_count > 1:
                self.logger.warning(
                    "Splunk provider has more than one user",
                    extra={
                        "users_count": users_count,
                        "tenant_id": tenant_id,
                    },
                )
            users = service.users

        for user in users:
            for role_name in user.content["roles"]:
                perms = self.__get_role_capabilities(
                    role_name=role_name, service=service
                )
                all_permissions.update(perms)

        for scope in self.PROVIDER_SCOPES:
            if scope.name in all_permissions:
                validated_scopes[scope.name] = True
            else:
                validated_scopes[scope.name] = "NOT_FOUND"

        self.logger.info(
            "Validated scopes for Splunk provider",
            extra={
                "tenant_id": tenant_id,
                "time": time.time() - t,
            },
        )
    except AuthenticationError:
        self.logger.exception(
            "Error authenticating to Splunk",
            extra={"tenant_id": tenant_id},
        )
        validated_scopes = _all_scopes("AUTHENTICATION_ERROR")
    except HTTPError as e:
        self.logger.exception(
            "Error connecting to Splunk",
            extra={"tenant_id": tenant_id},
        )
        self.logger.debug(
            "Splunk error response",
            extra={
                "body": e.body,
                "status": e.status,
                "headers": e.headers,
                "tenant_id": tenant_id,
            },
        )
        validated_scopes = _all_scopes(
            "HTTP_ERROR ({status})".format(status=e.status)
        )
    except ConnectionRefusedError:
        self.logger.exception(
            "Error connecting to Splunk",
            extra={"tenant_id": tenant_id},
        )
        validated_scopes = _all_scopes("CONNECTION_REFUSED")
    except ParseError:
        self.logger.exception(
            "Error parsing XML",
            extra={"tenant_id": tenant_id},
        )
        # Only when debugging: re-fetch the users endpoint raw so the
        # unparsable payload is visible in the logs.
        if self.logger.getEffectiveLevel() == logging.DEBUG:
            response = self.__debug_fetch_users_response()
            if response is not None:
                self.logger.debug(
                    "Raw users response",
                    extra={
                        "url": response.url,
                        "status": response.status_code,
                        "text": response.text,
                    },
                )
        validated_scopes = _all_scopes("PARSE_ERROR")
    except Exception as e:
        self.logger.exception("Error validating scopes", extra={"error": str(e)})
        validated_scopes = _all_scopes("UNKNOWN_ERROR")

    return validated_scopes
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> AlertDto:
    """
    Convert a Splunk webhook event into an AlertDto.

    Fields are looked up in the event's `result` (or `_result`) mapping, with
    the JSON document in its `_raw` attribute used as a fallback for message,
    exception class, stacktrace and log level.

    Args:
        event (dict): the webhook payload; must contain `sid` and
            `results_link`, and `search_name` when no message is available.
        provider_instance (BaseProvider): unused here.

    Returns:
        AlertDto: the formatted alert with its fingerprint set.
    """
    result: dict = event.get("result", event.get("_result", {}))

    raw = None
    try:
        raw = result.get("_raw", "{}")
        raw_dict = json.loads(raw)
    except Exception as e:
        logging.getLogger(__name__).warning(
            "Error parsing _raw attribute from event",
            # log the value that actually failed to parse (it lives in `result`)
            extra={"err": e, "_raw": raw},
        )
        raw_dict = {}
    if not isinstance(raw_dict, dict):
        # valid JSON that is not an object (list, string, ...) has no fields
        raw_dict = {}

    # export k8s specifics
    kubernetes = {}
    for key in result:
        if key.startswith("kubernetes"):
            kubernetes[key.replace("kubernetes.", "")] = result[key]

    message = result.get("message")
    name = message
    if not name:
        # `search_name` is only required when no message is available
        name = (
            raw_dict["message"] if "message" in raw_dict else event["search_name"]
        )

    service = result.get("service")
    environment = result.get("environment", result.get("env", "undefined"))
    exception = event.get(
        "exception",
        result.get(
            "exception",
            result.get("exception_class"),
        ),
    ) or raw_dict.get("exception_class", "")
    result["exception_class"] = exception

    # override stacktrace with _raw stacktrace if it doesnt exist in result
    stacktrace = result.get("stacktrace", raw_dict.get("stacktrace", ""))
    result["stacktrace"] = stacktrace

    severity = result.get("log_level", raw_dict.get("log_level", "INFO"))
    logger = event.get("logger", result.get("logger"))

    # Explicit fields are merged over the raw event so keys present in both
    # (e.g. "exception", "logger") no longer raise "multiple values for
    # keyword argument".
    alert_fields = {
        **event,
        "id": event["sid"],
        "name": name,
        "source": ["splunk"],
        "url": event["results_link"],
        "lastReceived": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        # normalise case and fall back to INFO for levels missing from the map
        "severity": SplunkProvider.SEVERITIES_MAP.get(
            str(severity).upper(), AlertSeverity.INFO
        ),
        "status": "firing",
        "message": message,
        "service": service,
        "environment": environment,
        "exception": exception,
        "logger": logger,
        "kubernetes": kubernetes,
    }
    alert = AlertDto(**alert_fields)

    # NOTE(review): `exception` falls back to "" rather than None, so the
    # ["name"] branch is only reached when _raw carries an explicit null
    # exception_class and there is no logger. Left unchanged to keep existing
    # fingerprints stable.
    alert.fingerprint = SplunkProvider.get_alert_fingerprint(
        alert,
        (
            SplunkProvider.FINGERPRINT_FIELDS
            if (exception is not None or logger is not None)
            else ["name"]
        ),
    )
    return alert
""" import dataclasses import json import pydantic import requests from requests import HTTPError from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_config_exception import ProviderConfigException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.validation.fields import HttpsUrl @pydantic.dataclasses.dataclass class SquadcastProviderAuthConfig: service_region: str = dataclasses.field( metadata={ "required": True, "description": "Service region: EU/US", "hint": "https://apidocs.squadcast.com/#intro", "sensitive": False, } ) refresh_token: str | None = dataclasses.field( metadata={ "required": False, "description": "Squadcast Refresh Token", "hint": "https://support.squadcast.com/docs/squadcast-public-api", "sensitive": True, }, default=None, ) webhook_url: HttpsUrl | None = dataclasses.field( metadata={ "required": False, "description": "Incident webhook url", "hint": "https://support.squadcast.com/integrations/incident-webhook-incident-webhook-api", "sensitive": True, "validation": "https_url", }, default=None, ) class SquadcastProvider(BaseProvider): """Create incidents and notes using the Squadcast API.""" PROVIDER_DISPLAY_NAME = "Squadcast" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Incident Management"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="The user can connect to the client", mandatory=False, alias="Connect to the client", ), ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_scopes(self): """ Validates that the user has the required scopes to use the provider. 
""" refresh_headers = { "content-type": "application/json", "X-Refresh-Token": f"{self.authentication_config.refresh_token}", } resp = requests.get( f"{self.__get_endpoint('auth')}/oauth/access-token", headers=refresh_headers ) try: resp.raise_for_status() scopes = { "authenticated": True, } except Exception as e: self.logger.exception("Error validating scopes") scopes = { "authenticated": str(e), } return scopes def __get_endpoint(self, endpoint: str): if endpoint == "auth": return ("https://auth.eu.squadcast.com", "https://auth.squadcast.com")[ self.authentication_config.service_region == "US" ] elif endpoint == "api": return ("https://api.eu.squadcast.com", "https://api.squadcast.com")[ self.authentication_config.service_region == "US" ] def validate_config(self): self.authentication_config = SquadcastProviderAuthConfig( **self.config.authentication ) if ( not self.authentication_config.refresh_token and not self.authentication_config.webhook_url ): raise ProviderConfigException( "SquadcastProvider requires either refresh_token or webhook_url", provider_id=self.provider_id, ) def _create_incidents( self, headers: dict, message: str, description: str, tags: dict = {}, priority: str = "", status: str = "", event_id: str = "", additional_json: str = "", ): body = json.dumps( { "message": message, "description": description, "tags": tags, "priority": priority, "status": status, "event_id": event_id, } ) # append body to additional_json we are doing this way because we don't want to override the core body fields try: additional_fields = json.loads(additional_json) if additional_json else {} core_fields = json.loads(body) body = json.dumps({**additional_fields, **core_fields}) except json.JSONDecodeError as e: raise ProviderConfigException( f"Invalid additional_json format: {str(e)}", provider_id=self.provider_id ) return requests.post( self.authentication_config.webhook_url, data=body, headers=headers ) def _crete_notes( self, headers: dict, message: str, 
def _notify(
    self,
    notify_type: str,
    message: str = "",
    description: str = "",
    incident_id: str = "",
    priority: str = "",
    tags: dict = {},
    status: str = "",
    event_id: str = "",
    attachments: list = [],
    additional_json: str = "",
    **kwargs,
) -> dict:
    """
    Create an incident or notes using the Squadcast API.

    Args:
        notify_type (str): "incident" or "notes".
        message (str): incident message or note text (required).
        description (str): incident description (required for "incident").
        incident_id (str): target incident (required for "notes").
        priority (str): incident priority.
        tags (dict): incident tags. Only passed through, never mutated.
        status (str): incident status.
        event_id (str): incident event id.
        attachments (list): note attachments. Only passed through.
        additional_json (str): JSON object string merged under the core
            incident fields.

    Returns:
        dict: the decoded JSON response of the create call.

    Raises:
        Exception: on invalid arguments, when the access token cannot be
            obtained, or when the create call returns an HTTP error.
    """
    self.logger.info(
        f"Creating {notify_type} using SquadcastProvider",
        # fixed key: using the value as the key could collide with reserved
        # LogRecord attribute names
        extra={"notify_type": notify_type},
    )

    # Validate the request before any network traffic
    if notify_type == "incident":
        if message == "" or description == "":
            raise Exception(
                f'message: "{message}" and description: "{description}" cannot be empty'
            )
    elif notify_type == "notes":
        if message == "" or incident_id == "":
            raise Exception(
                f'message: "{message}" and incident_id: "{incident_id}" cannot be empty'
            )
    else:
        raise Exception(
            "notify_type is a mandatory field, expected: incident | notes"
        )

    # NOTE(review): when only webhook_url is configured refresh_token is None
    # and the header below carries the literal string "None"; confirm whether
    # the incident webhook needs a bearer token at all.
    refresh_headers = {
        "content-type": "application/json",
        "X-Refresh-Token": f"{self.authentication_config.refresh_token}",
    }
    api_key_resp = requests.get(
        f"{self.__get_endpoint('auth')}/oauth/access-token", headers=refresh_headers
    )
    try:
        api_key_resp.raise_for_status()
        access_token = api_key_resp.json()["data"]["access_token"]
    except (HTTPError, KeyError, TypeError, ValueError) as e:
        # surface auth failures explicitly instead of an opaque KeyError
        raise Exception(f"Failed to obtain Squadcast access token: {str(e)}") from e

    headers = {
        "content-type": "application/json",
        "Authorization": f"Bearer {access_token}",
    }

    if notify_type == "incident":
        resp = self._create_incidents(
            headers=headers,
            message=message,
            description=description,
            tags=tags,
            priority=priority,
            status=status,
            event_id=event_id,
            additional_json=additional_json,
        )
    else:
        resp = self._crete_notes(
            headers=headers,
            message=message,
            incident_id=incident_id,
            attachments=attachments,
        )

    try:
        resp.raise_for_status()
        return resp.json()
    except HTTPError as e:
        raise Exception(f"Failed to create issue: {str(e)}") from e
""" pass if __name__ == "__main__": import os squadcast_api_key = os.environ.get("SQUADCAST_API_KEY") context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Initalize the provider and provider config config = ProviderConfig( authentication={"api_key": squadcast_api_key}, ) provider = SquadcastProvider( context_manager, provider_id="squadcast-test", config=config ) response = provider.notify( description="test", ) print(response) ================================================ FILE: keep/providers/ssh_provider/__init__.py ================================================ ================================================ FILE: keep/providers/ssh_provider/ssh_provider.py ================================================ """ SshProvider is a class that provides a way to execute SSH commands and get the output. """ import dataclasses import io import typing import pydantic from paramiko import AutoAddPolicy, RSAKey, SSHClient from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope from keep.providers.providers_factory import ProvidersFactory from keep.validation.fields import NoSchemeUrl, UrlPort @pydantic.dataclasses.dataclass class SshProviderAuthConfig: """SSH authentication configuration.""" host: NoSchemeUrl = dataclasses.field( metadata={ "required": True, "description": "SSH hostname", "validation": "no_scheme_url", } ) user: str = dataclasses.field( metadata={"required": True, "description": "SSH user"} ) port: UrlPort = dataclasses.field( default=22, metadata={"required": False, "description": "SSH port", "validation": "port"}, ) pkey: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "SSH private key", "sensitive": True, "type": "file", "name": "pkey", "file_type": "text/plain, application/x-pem-file, application/x-putty-private-key, " + 
"application/x-ed25519-key, application/pkcs8, application/octet-stream", "config_sub_group": "private_key", "config_main_group": "authentication", }, ) password: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "SSH password", "sensitive": True, "config_sub_group": "password", "config_main_group": "authentication", }, ) @pydantic.root_validator def check_password_or_pkey(cls, values): password, pkey = values.get("password"), values.get("pkey") if password is None and pkey is None: raise ValueError("either password or private key must be provided") return values class SshProvider(BaseProvider): """Enrich alerts with data from SSH.""" PROVIDER_DISPLAY_NAME = "SSH" PROVIDER_CATEGORY = ["Cloud Infrastructure", "Developer Tools"] PROVIDER_SCOPES = [ ProviderScope( name="ssh_access", description="The provided credentials grant access to the SSH server", ), ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) self._client = None @property def client(self): if self._client is None: self._client = self.__generate_client() return self._client def __generate_client(self) -> SSHClient: """ Generates a paramiko SSH connection. Returns: SSHClient: The connection to the SSH server. 
def dispose(self):
    """
    Closes the SSH connection, if one was ever opened.

    Errors raised while closing are logged and swallowed.
    """
    # Check the cached attribute instead of the `client` property: the
    # property connects on first access, so using it here would open an SSH
    # session only to close it immediately.
    if self._client is None:
        return
    try:
        self._client.close()
    except Exception as e:
        self.logger.error("Error closing SSH connection", extra={"error": str(e)})
    finally:
        # Forget the closed client so a later `client` access reconnects
        self._client = None
""" stdin, stdout, stderr = self.client.exec_command(command.format(**kwargs)) stdout.channel.set_combine_stderr(True) return stdout.readlines() if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os user = os.environ.get("SSH_USERNAME") or "root" password = os.environ.get("SSH_PASSWORD") host = os.environ.get("SSH_HOST") or "1.1.1.1" pkey = os.environ.get("SSH_PRIVATE_KEY") config = { "authentication": { "user": user, "pkey": pkey, "host": host, }, } provider = ProvidersFactory.get_provider( context_manager, provider_id="ssh", provider_type="ssh", provider_config=config ) result = provider.query(command="df -h") print(result) ================================================ FILE: keep/providers/statuscake_provider/__init__.py ================================================ ================================================ FILE: keep/providers/statuscake_provider/statuscake_provider.py ================================================ """ Statuscake is a class that provides a way to read alerts from the Statuscake API and install webhook in StatuCake """ import dataclasses from typing import List from urllib.parse import urlencode, urljoin import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class StatuscakeProviderAuthConfig: """ StatuscakeProviderAuthConfig is a class that holds the authentication information for the StatuscakeProvider. 
""" api_key: str = dataclasses.field( metadata={ "required": True, "description": "Statuscake API Key", "sensitive": True, }, default=None, ) class StatuscakeProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "Statuscake" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] PROVIDER_SCOPES = [ ProviderScope( name="alerts", description="Read alerts from Statuscake", ) ] SEVERITIES_MAP = { "high": AlertSeverity.HIGH, } STATUS_MAP = { "Up": AlertStatus.RESOLVED, "Down": AlertStatus.FIRING, } FINGERPRINT_FIELDS = ["test_id"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def __get_url(self, paths: List[str] = [], query_params: dict = None, **kwargs): """ Helper method to build the url for StatucCake api requests. """ host = "https://api.statuscake.com/v1/" url = urljoin( host, "/".join(str(path) for path in paths), ) # add query params if query_params: url = f"{url}?{urlencode(query_params)}" return url def validate_scopes(self): """ Validate that the user has the required scopes to use the provider """ self.logger.info("Validating scopes for Statuscake provider") try: response = requests.get( url=self.__get_url(paths=["uptime"]), headers=self.__get_auth_headers(), ) if response.status_code == 200: self.logger.info("Successfully validated scopes for Statuscake") scopes = {"alerts": True} else: self.logger.error( "Unable to read alerts from Statuscake, statusCode: %s", response.status_code, ) scopes = { "alerts": f"Unable to read alerts from Statuscake, statusCode: {response.status_code}" } except Exception as e: self.logger.error("Error validating scopes for Statuscake: %s", e) scopes = {"alerts": f"Error validating scopes for Statuscake: {e}"} return scopes def validate_config(self): self.logger.info("Validating configuration for Statuscake provider") self.authentication_config = StatuscakeProviderAuthConfig( 
def __get_paginated_data(self, paths: list, query_params: dict = {}):
    """
    Fetch every page of a StatusCake list endpoint and return the merged items.

    Args:
        paths (list): path segments of the endpoint, e.g. ["uptime"].
        query_params (dict): extra query parameters; a `page` entry is added
            per request and the passed dict itself is never modified.

    Returns:
        list: the concatenated `data` arrays of all pages.

    Raises:
        Exception: when a page request is not successful (carrying the
            response text) or the payload lacks the expected keys.
    """
    data = []
    try:
        page = 1
        while True:
            self.logger.info(f"Getting page: {page} for {paths}")
            response = requests.get(
                url=self.__get_url(
                    paths=paths, query_params={**(query_params or {}), "page": page}
                ),
                headers=self.__get_auth_headers(),
            )

            if not response.ok:
                raise Exception(response.text)

            payload = response.json()
            data.extend(payload["data"])

            # `>=` rather than `==`: a reported page_count of 0 (or any value
            # below the current page) must end the loop instead of paging
            # forever.
            if page >= payload["metadata"]["page_count"]:
                break
            page += 1

        self.logger.info(
            f"Successfully got {len(data)} items from {paths}",
            extra={"data": data},
        )
        return data

    except Exception as e:
        self.logger.error(
            f"Error while getting {paths}", extra={"exception": str(e)}
        )
        raise
contact_group_name, }, ) if response.status_code != 201: raise Exception(response.text) self.logger.info("Successfully created contact group") return response.json()["data"]["new_id"] except Exception as e: self.logger.error( "Error while creating contact group", extra={"exception": str(e)} ) raise e def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): # Getting all the contact groups self.logger.info("Attempting to install webhook in statuscake") keep_api_url = f"{keep_api_url}&api_key={api_key}" contact_group_name = f"Keep-{self.provider_id}" self.logger.info("Getting contact groups for webhook setup") contact_groups = self.__get_paginated_data(paths=["contact-groups"]) for contact_group in contact_groups: if contact_group["name"] == contact_group_name: self.logger.info( "Webhook already exists, updating the ping_url, just for safe measures" ) contact_group_id = contact_group["id"] self.__update_contact_group( contact_group_id=contact_group_id, keep_api_url=keep_api_url ) break else: self.logger.info("Creating a new contact group") contact_group_id = self.__create_contact_group( contact_group_name=contact_group_name, keep_api_url=keep_api_url ) alerts_to_update = ["heartbeat", "uptime", "pagespeed", "ssl"] self.logger.info(f"Updating alerts for types: {alerts_to_update}") for alert_type in alerts_to_update: self.logger.info(f"Processing {alert_type} alerts") alerts = self.__get_paginated_data(paths=[alert_type]) for alert in alerts: if contact_group_id not in alert["contact_groups"]: alert["contact_groups"].append(contact_group_id) try: self.__update_alert( data={"contact_groups[]": alert["contact_groups"]}, paths=[alert_type, alert["id"]], ) except Exception: self.logger.exception( "Error while updating alert", extra={ "alert_type": alert_type, "alert_id": alert.get("id"), }, ) self.logger.info("Webhook setup completed successfully") def __update_alert(self, data: dict, paths: list): try: 
def __get_heartbeat_alerts_dto(self) -> list[AlertDto]:
    """
    Fetch all heartbeat checks from Statuscake and convert them to AlertDto.

    The raw check status is translated the same way the uptime checks are:
    "up" (case-insensitive) becomes RESOLVED, anything else FIRING. The
    source is passed as a list, matching the other DTO builders of this
    provider.

    Returns:
        list[AlertDto]: One DTO per heartbeat check.

    Raises:
        Exception: Propagated from the paginated fetch, or KeyError when a
            check has no "id" or "name".
    """
    self.logger.info("Getting heartbeat alerts from Statuscake")
    response = self.__get_paginated_data(paths=["heartbeat"])

    alert_dtos = []
    for alert in response:
        raw_status = str(alert.get("status") or "")
        if raw_status.lower() == "up":
            status = AlertStatus.RESOLVED
        else:
            status = AlertStatus.FIRING

        alert_dtos.append(
            AlertDto(
                id=alert["id"],
                name=alert["name"],
                status=status,
                # Optional in the payload: do not fail the whole batch on them.
                url=alert.get("website_url"),
                uptime=alert.get("uptime"),
                source=["statuscake"],
            )
        )

    self.logger.info(f"Got {len(alert_dtos)} heartbeat alerts")
    return alert_dtos
def __get_uptime_alerts_dto(self) -> list[AlertDto]:
    """
    Fetch all uptime checks from Statuscake and convert them to AlertDto.

    Checks without an id are logged and skipped. A status of "up"
    (case-insensitive) maps to RESOLVED; any other or missing status maps to
    FIRING. The check id is used as the alert fingerprint.

    Returns:
        list[AlertDto]: One DTO per uptime check that carries an id.

    Raises:
        Exception: Propagated from the paginated fetch.
    """
    self.logger.info("Getting uptime alerts from Statuscake")
    response = self.__get_paginated_data(paths=["uptime"])
    self.logger.info(f"Got {len(response)} uptime alerts")

    alert_dtos = []
    for alert in response:
        # Validate the id first so an unusable entry is skipped before any
        # other field is touched.
        alert_id = alert.get("id", None)
        if not alert_id:
            self.logger.error("Alert id is missing", extra={"alert": alert})
            continue

        # A missing status must not raise AttributeError on `.lower()`.
        raw_status = str(alert.get("status") or "")
        if raw_status.lower() == "up":
            status = AlertStatus.RESOLVED
        else:
            status = AlertStatus.FIRING

        alert_dto = AlertDto(
            id=alert_id,
            name=alert.get("name", ""),
            status=status,
            uptime=alert.get("uptime", 0),
            source=["statuscake"],
            paused=alert.get("paused", False),
            test_type=alert.get("test_type", ""),
            check_rate=alert.get("check_rate", 0),
            contact_groups=alert.get("contact_groups", []),
            tags=alert.get("tags", []),
        )

        url = alert.get("website_url", None)
        if url:
            alert_dto.url = url

        # use id as fingerprint
        alert_dto.fingerprint = alert_id
        alert_dtos.append(alert_dto)

    return alert_dtos
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> AlertDto:
    """
    Convert a Statuscake webhook payload into an AlertDto.

    The status is looked up in STATUS_MAP and falls back to FIRING. Severity
    is always HIGH because the payload carries no severity information. A
    fingerprint is computed only when the payload has a truthy "TestID".

    Args:
        event (dict): The webhook payload.
        provider_instance (BaseProvider): Not used by this method.

    Returns:
        AlertDto: The formatted alert.
    """
    # Payload reference:
    # https://www.statuscake.com/kb/knowledge-base/how-to-use-the-web-hook-url/
    mapped_status = StatuscakeProvider.STATUS_MAP.get(
        event.get("Status"), AlertStatus.FIRING
    )
    if mapped_status is None:
        mapped_status = AlertStatus.FIRING

    name_fallback = event.get("Name")
    fields = {
        "id": event.get("TestID", name_fallback),
        "name": event.get("Name"),
        "status": mapped_status,
        # Statuscake does not provide severity information
        "severity": AlertSeverity.HIGH,
        "url": event.get("URL", None),
        "ip": event.get("IP", None),
        "tags": event.get("Tags", None),
        "test_id": event.get("TestID", None),
        "method": event.get("Method", None),
        "checkrate": event.get("Checkrate", None),
        "status_code": event.get("StatusCode", None),
        "source": ["statuscake"],
    }
    alert = AlertDto(**fields)

    if event.get("TestID", None):
        alert.fingerprint = StatuscakeProvider.get_alert_fingerprint(
            alert,
            (StatuscakeProvider.FINGERPRINT_FIELDS),
        )
    else:
        alert.fingerprint = None
    return alert
class ResourceAlreadyExists(Exception):
    """
    Exception type named for an "already exists" condition.

    It adds nothing to `Exception` beyond the name: positional arguments are
    forwarded unchanged to the base class.
    """

    def __init__(self, *args):
        super(ResourceAlreadyExists, self).__init__(*args)
def validate_config(self):
    """
    Build the typed SumoLogic authentication config from the raw settings.

    The result is stored on `self.authentication_config`. Errors raised while
    constructing `SumologicProviderAuthConfig` propagate to the caller.
    """
    raw_authentication = self.config.authentication
    self.authentication_config = SumologicProviderAuthConfig(**raw_authentication)
def validate_scopes(self) -> dict[str, bool | str]:
    """
    Check that the configured credentials are valid and sufficiently privileged.

    Three lookups are performed in order: the account owner id
    (account/accountOwner), that user's role ids (users/<id>), and each
    role's capabilities (roles/<id>). The user is authorized when the union
    of the capabilities covers manageScheduledViews, manageConnections and
    manageUsersAndRoles.

    Returns:
        dict[str, bool | str]: The keys "authenticated" and "authorized".
            Each value is True on success or an error description. A value
            is always returned, including when capabilities are missing.
    """
    perms = {"manageScheduledViews", "manageConnections", "manageUsersAndRoles"}
    # Tracked outside the try so the generic handler can report it correctly.
    authenticated = False
    self.logger.info("Validating SumoLogic authentication.")
    try:
        account_owner_response = requests.get(
            url=self.__get_url(paths=["account", "accountOwner"]),
            auth=self.__get_auth(),
            headers=self.__get_headers(),
        )
        if account_owner_response.status_code != 200:
            account_owner_error = account_owner_response.json()
            self.logger.error(
                "Error while getting UserID",
                extra={"error": str(account_owner_error)},
            )
            return {
                "authenticated": str(account_owner_error),
                "authorized": "Unauthorized",
            }

        authenticated = True
        user_id = account_owner_response.json()
        self.logger.info(
            "Successfully retrieved user_id", extra={"user_id": user_id}
        )

        self.logger.info("Fetching account info...", extra={"user_id": user_id})
        account_info_response = requests.get(
            url=self.__get_url(paths=["users", user_id]),
            auth=self.__get_auth(),
            headers=self.__get_headers(),
        )
        if account_info_response.status_code != 200:
            account_info_error = account_info_response.json()
            self.logger.error(
                "Error while getting account info",
                extra={"error": str(account_info_error)},
            )
            return {
                "authenticated": authenticated,
                "authorized": str(account_info_error),
            }

        role_ids = account_info_response.json()["roleIds"]
        self.logger.info(
            "Successfully fetched account info", extra={"roles": role_ids}
        )

        # Strike off every required capability granted by any of the roles.
        for role_id in role_ids:
            role_info_response = requests.get(
                url=self.__get_url(paths=["roles", role_id]),
                auth=self.__get_auth(),
                headers=self.__get_headers(),
            )
            if role_info_response.status_code != 200:
                role_info_error = role_info_response.json()
                self.logger.error(
                    f"Error while getting role: {role_id}",
                    extra={"error": str(role_info_error)},
                )
                return {
                    "authenticated": True,
                    "authorized": str(role_info_error),
                }

            role_info = role_info_response.json()
            self.logger.info(f"Successfully fetched role: {role_id}")
            perms.difference_update(role_info.get("capabilities") or [])

        if not perms:
            self.logger.info("All required perms found, user is authorized :)")
            return {"authenticated": True, "authorized": True}

        # Previously this path fell off the end and returned None.
        missing = ", ".join(sorted(perms))
        self.logger.warning(
            "User is missing required capabilities", extra={"missing": missing}
        )
        return {
            "authenticated": True,
            "authorized": f"Missing required capabilities: {missing}",
        }

    except Exception as e:
        self.logger.error("Error while validating SumoLogic scopes: " + str(e))
        return {
            # Do not report a failed login when only a later step failed.
            "authenticated": True if authenticated else str(e),
            "authorized": str(e),
        }
def __create_connection(self, connection_payload, connection_name: str):
    """
    Create the Keep webhook connection in Sumo Logic, or update an existing one.

    When the API answers 400 with the error code
    "connection:name_already_exists", the existing connection is looked up by
    name and updated with the same payload.

    Args:
        connection_payload: JSON-serialisable connection definition.
        connection_name (str): Name used to find an already existing
            connection.

    Returns:
        The id of the created or updated connection.

    Raises:
        Exception: For any unsuccessful API answer, or when the connection is
            reported as existing but cannot be found by name. The error is
            logged before being re-raised.
    """
    self.logger.info("Creating a Webhook connection with Sumo Logic")
    try:
        response = requests.post(
            url=self.__get_url(paths=["connections"]),
            json=connection_payload,
            headers=self.__get_headers(),
            auth=self.__get_auth(),
        )
        if response.status_code == 200:
            self.logger.info("Successfully created Webhook connection")
            return response.json()["id"]

        # Error bodies are not guaranteed to be JSON; fall back to raw text so
        # the real API error is reported rather than a decode error.
        try:
            error_body = response.json()
        except ValueError:
            error_body = response.text

        if response.status_code == 400:
            errors = error_body.get("errors") if isinstance(error_body, dict) else None
            first_error = errors[0] if isinstance(errors, list) and errors else None
            error_code = (
                first_error.get("code") if isinstance(first_error, dict) else None
            )
            if error_code == "connection:name_already_exists":
                self.logger.info(
                    "Webhook connection already exists, attempting to update it"
                )
                connection_id = self.__get_connection_id(
                    connection_name=connection_name
                )
                if connection_id is None:
                    # Without this guard the update would target ".../None".
                    raise Exception(
                        f"Connection '{connection_name}' already exists but could not be found"
                    )
                return self.__update_existing_connection(
                    connection_payload=connection_payload,
                    connection_id=connection_id,
                )
            raise Exception(str(error_body))

        self.logger.error(
            "Error while creating webhook connection",
            extra={"error": str(error_body)},
        )
        raise Exception(error_body)
    except Exception as e:
        self.logger.error("Error while creating webhook connection " + str(e))
        raise e
def setup_webhook(
    self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True
):
    """
    Install the Keep webhook connection in Sumo Logic and attach it to monitors.

    A webhook connection named "KeepHQ-<provider_id>" is created (or updated
    if it already exists) using connection_template.json as both the default
    and the resolution payload. It is then added to every monitor that does
    not notify that connection yet.

    Args:
        tenant_id (str): Not used by this method.
        keep_api_url (str): Keep endpoint; its `provider_id` query parameter
            names the connection.
        api_key (str): Keep API key, sent as the X-API-KEY custom header.
        setup_alerts (bool): Accepted for interface compatibility; this
            method does not consult it.

    Raises:
        Exception: Propagated from the connection and monitor API calls, or
            OSError when the template file cannot be read.
    """
    # Local import: the module only imports urlencode, urljoin and urlparse.
    from urllib.parse import parse_qs

    query_string = urlparse(keep_api_url).query
    provider_ids = parse_qs(query_string).get("provider_id")
    if provider_ids:
        # Proper parsing: trailing parameters no longer leak into the id.
        provider_id = provider_ids[-1]
    else:
        # No parsable provider_id: keep the previous fallback value.
        provider_id = query_string.split("provider_id=")[-1]
    connection_name = f"KeepHQ-{provider_id}"

    # connection_template.json is the payload that will be sent to keep as an
    # event. Read it once; read_text() closes the file.
    template_path = Path(__file__).parent / "connection_template.json"
    payload_template = template_path.read_text()

    connection_payload = {
        "type": "WebhookDefinition",
        "name": connection_name,
        "description": "A webhook connection that pushes alerts to KeepHQ",
        "url": keep_api_url,
        "headers": [],
        "customHeaders": [{"name": "X-API-KEY", "value": api_key}],
        "defaultPayload": payload_template,
        "webhookType": "Webhook",
        "connectionSubtype": "Event",
        "resolutionPayload": payload_template,
    }

    # Creating a sumo logic connection
    connection_id = self.__create_connection(
        connection_payload=connection_payload, connection_name=connection_name
    )

    # Install connections in monitors that don't have keep
    monitors = self.__get_monitors_without_keep(connection_id=connection_id)
    for monitor in monitors:
        self.__install_connection_in_monitor(
            monitor=monitor, connection_id=connection_id
        )
def _notify(
    self,
    message: str = "",
    typeCard: str = "message",
    themeColor: Optional[str] = None,
    sections: str | list = [],
    schema: str = "http://adaptivecards.io/schemas/adaptive-card.json",
    attachments: str | list = [],
    mentions: str | list = [],
    **kwargs: dict[str, Any],
):
    """
    Notify alert message to Teams using the Teams Incoming Webhook API

    Args:
        message (str): The message to send
        typeCard (str): The card type. Can be "MessageCard" (legacy) or
            "message" (for Adaptive Cards). Default is "message"
        themeColor (str): Hexadecimal color (only used with MessageCard type)
        sections (str | list): For MessageCard: Array of custom information
            sections. For Adaptive Cards: Array of card elements following
            the Adaptive Card schema. Can be provided as a JSON string or
            array.
        attachments (str | list): Custom attachments array for Adaptive Cards
            (overrides default attachment structure). Can be provided as a
            JSON string or array.
        schema (str): Schema URL for Adaptive Cards.
        mentions (str | list): List of user mentions to include in the
            Adaptive Card. Each mention should be a dict with 'id' (user ID,
            Microsoft Entra Object ID, or UPN) and 'name' (display name)
            keys. A single dict is accepted as a one-element list. The card
            body must contain the matching "<at>name</at>" text for Teams to
            render the mention.

    Returns:
        dict: {"response_text": <body of the webhook response>}.

    Raises:
        ProviderException: If the webhook answers with a non-success status.
    """
    self.logger.debug("Notifying alert message to Teams")
    webhook_url = self.authentication_config.webhook_url

    def decode_if_json_string(value, label):
        # Non-empty strings are parsed as JSON. On failure the error is
        # logged and the original value is returned untouched.
        if value and isinstance(value, str):
            try:
                return json.loads(value)
            except Exception as e:
                self.logger.error(f"Failed to decode {label} string to JSON: {e}")
        return value

    sections = decode_if_json_string(sections, "sections")
    attachments = decode_if_json_string(attachments, "attachments")
    mentions = decode_if_json_string(mentions, "mentions")

    # Normalise mentions to a list. An undecodable string would otherwise be
    # iterated character by character, logging one warning per character.
    if isinstance(mentions, dict):
        mentions = [mentions]
    elif not isinstance(mentions, (list, tuple)):
        if mentions:
            self.logger.warning(
                f"Invalid mentions value: {mentions}. Ignoring mentions."
            )
        mentions = []

    if typeCard == "message":
        # Adaptive Card format
        payload = {"type": "message"}

        card_content = {
            "$schema": schema,
            "type": "AdaptiveCard",
            "version": "1.2",
            "body": (
                sections if sections else [{"type": "TextBlock", "text": message}]
            ),
        }

        entities = []
        for mention in mentions:
            if (
                not isinstance(mention, dict)
                or "id" not in mention
                or "name" not in mention
            ):
                self.logger.warning(
                    f"Invalid mention format: {mention}. Skipping."
                )
                continue
            entities.append(
                {
                    "type": "mention",
                    # Teams matches mentions on the "<at>…</at>" marker.
                    "text": f"<at>{mention['name']}</at>",
                    "mentioned": {"id": mention["id"], "name": mention["name"]},
                }
            )
        if entities:
            card_content["msteams"] = {"entities": entities}

        if attachments:
            # Caller-supplied attachments replace the generated card.
            payload["attachments"] = attachments
        else:
            payload["attachments"] = [
                {
                    "contentType": "application/vnd.microsoft.card.adaptive",
                    "contentUrl": None,
                    "content": card_content,
                }
            ]
    else:
        # Standard MessageCard format
        payload = {
            "@type": typeCard,
            "themeColor": themeColor,
            "text": message,
            "sections": sections,
        }

    response = requests.post(webhook_url, json=payload)
    if not response.ok:
        raise ProviderException(
            f"{self.__class__.__name__} failed to notify alert message to Teams: {response.text}"
        )

    self.logger.debug("Alert message notified to Teams")
    return {"response_text": response.text}
@pydantic.dataclasses.dataclass
class TelegramProviderAuthConfig:
    """Credentials the Telegram provider needs in order to authenticate."""

    # Bot token; required, and flagged as sensitive in the field metadata.
    bot_token: str = dataclasses.field(
        metadata=dict(
            required=True,
            description="Telegram Bot Token",
            sensitive=True,
        )
    )
def _notify(
    self,
    chat_id: str = "",
    topic_id: Optional[int] = None,
    message: str = "",
    reply_markup: Optional[dict[str, dict[str, any]]] = None,
    reply_markup_layout: Literal["horizontal", "vertical"] = "horizontal",
    parse_mode: str = None,
    image_url: Optional[str] = None,
    caption_on_image: bool = False,
    **kwargs: dict,
):
    """
    Notify alert message to Telegram using the Telegram Bot API
    https://core.telegram.org/bots/api

    Args:
        chat_id (str): Unique identifier for the target chat or username of the target channel
        topic_id (int): Unique identifier for the target message thread (topic)
        message (str): Message to be sent
        reply_markup (dict): Inline keyboard markup to be attached to the message
        reply_markup_layout (str): Direction of the reply markup, could be "horizontal" or "vertical"
        parse_mode (str): Mode for parsing entities in the message text, could be "markdown" or "html"
        image_url (str, optional): URL of the image to be attached to the message
        caption_on_image (bool, optional): Whether to use the message as a caption for the image

    Raises:
        ProviderException: If chat_id is empty, the layout is invalid, or
            sending fails.
    """
    self.logger.debug("Notifying alert message to Telegram")

    if not chat_id:
        raise ProviderException(
            f"{self.__class__.__name__} failed to notify alert message to Telegram: chat_id is required"
        )

    # Unknown or missing parse modes map to None.
    parse_mode_mapping = {"markdown": ParseMode.MARKDOWN_V2, "html": ParseMode.HTML}
    parse_mode = parse_mode_mapping.get(parse_mode, None)

    # A private loop is created per call and closed in `finally` below, so
    # repeated notifications do not leak event loops.
    loop = asyncio.new_event_loop()
    telegram_bot = telegram.Bot(token=self.authentication_config.bot_token)
    try:
        keyboard_markup = None
        if reply_markup is not None:
            buttons = [
                InlineKeyboardButton(text=text, **params)
                for text, params in reply_markup.items()
            ]
            if reply_markup_layout == "horizontal":
                buttons = [buttons]
            elif reply_markup_layout == "vertical":
                buttons = [[button] for button in buttons]
            else:
                raise ProviderException(
                    f"{self.__class__.__name__} failed to notify alert message to Telegram: reply_markup_layout should be either horizontal or vertical"
                )
            keyboard_markup = InlineKeyboardMarkup(
                inline_keyboard=buttons,
            )

        if image_url:
            if caption_on_image:
                # Send image with the message as its caption
                final_request = telegram_bot.send_photo(
                    chat_id=chat_id,
                    photo=image_url,
                    caption=message,
                    reply_markup=keyboard_markup,
                    parse_mode=parse_mode,
                    message_thread_id=topic_id,
                )
            else:
                # Send message first, then image
                if message:
                    loop.run_until_complete(
                        telegram_bot.send_message(
                            chat_id=chat_id,
                            text=message,
                            reply_markup=None,  # Attach markup to the image instead
                            parse_mode=parse_mode,
                            message_thread_id=topic_id,
                        )
                    )
                # Send image without caption
                final_request = telegram_bot.send_photo(
                    chat_id=chat_id,
                    photo=image_url,
                    reply_markup=keyboard_markup,
                    message_thread_id=topic_id,
                )
        else:
            # Send regular text message if no image URL is provided
            final_request = telegram_bot.send_message(
                chat_id=chat_id,
                text=message,
                reply_markup=keyboard_markup,
                parse_mode=parse_mode,
                message_thread_id=topic_id,
            )

        loop.run_until_complete(final_request)
    except ProviderException:
        # Already carries the full message; do not wrap it a second time.
        raise
    except Exception as e:
        raise ProviderException(
            f"{self.__class__.__name__} failed to notify alert message to Telegram: {e}"
        ) from e
    finally:
        loop.close()

    self.logger.debug("Alert message notified to Telegram")
if __name__ == "__main__":
    import threading

    def run_in_thread():
        """Run test_send_message() to completion on an event loop owned by this thread."""
        import asyncio

        # Create a dedicated loop and register it as this thread's current loop.
        worker_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(worker_loop)
        try:
            worker_loop.run_until_complete(test_send_message())
        finally:
            worker_loop.close()

    # Run the manual test on a separate thread and wait for it to finish.
    worker = threading.Thread(target=run_in_thread)
    worker.start()
    worker.join()
Notification", "alertId": 562949955000593, "ruleId": 562949953553310 } }, { "eventId": "9437a575-4b00-44a2-899a-41d1134eef08--5abda706-c065-40fa-aa8c-059c3ac1ea9d", "alert": { "severity": "Info", "dateStartZoned": "2025-03-17 19:43:00 UTC", "apiLinks": [ { "rel": "related", "href": "https://api.thousandeyes.com/v4/tests/562949953502258" }, { "rel": "data", "href": "https://api.thousandeyes.com/v4/web/http-server/562949953502258" } ], "testLabels": [ { "id": 562949953465712, "name": "Web Server" }, { "id": 562949953465711, "name": "https://pdf.ezhil.dev" }, { "id": 562949953465713, "name": "Health Overview Dashboard" } ], "active": 0, "ruleExpression": "Response Code is not OK (2xx)", "dateEnd": "2025-03-24 17:21:00", "type": "HTTP Server", "ruleAid": 562949953552543, "agents": [ { "dateStart": "2025-03-17 19:43:00", "dateEnd": "2025-03-24 17:21:00", "active": 0, "metricsAtStart": "Response Code: 502", "metricsAtEnd": "Response Code: 200", "permalink": "https://app.thousandeyes.com/alerts/list/?__a=562949953552543&alertId=5abda706-c065-40fa-aa8c-059c3ac1ea9d&agentId=4503", "agentId": 4503, "agentName": "Hong Kong (Trial)" } ], "testTargetsDescription": [ "https://pdf.ezhil.dev" ], "violationCount": 1, "dateStart": "2025-03-17 19:43:00", "dateEndZoned": "2025-03-24 17:21:00 UTC", "ruleName": "PDF Test", "testId": 562949953502258, "alertId": "5abda706-c065-40fa-aa8c-059c3ac1ea9d", "ruleId": 562949955720954, "permalink": "https://app.thousandeyes.com/alerts/list/?__a=562949953552543&alertId=5abda706-c065-40fa-aa8c-059c3ac1ea9d", "testName": "https://pdf.ezhil.dev - HTTP Server" }, "eventType": "ALERT_NOTIFICATION_CLEAR" }] ================================================ FILE: keep/providers/thousandeyes_provider/thousandeyes_provider.py ================================================ """ Thousandseyes provider is a class that allows you to retrieve alerts from Thousandeyes using API endpoints as well as webhooks. 
""" import dataclasses import pydantic import requests from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class ThousandeyesProviderAuthConfig: """ ThousandeyesProviderAuthConfig is a class that allows you to authenticate in Thousandeyes. """ oauth2_token: str = dataclasses.field( metadata={ "required": True, "description": "OAuth2 Bearer Token", "sensitive": True, }, ) class ThousandeyesProvider(BaseProvider): """ Get alerts from Thousandeyes into Keep. """ webhook_documentation_here_differs_from_general_documentation = True webhook_description = "" webhook_template = "" webhook_markdown = """ To send alerts from ThousandEyes to Keep, Use the following webhook url to configure ThousandEyes send alerts to Keep: 1. In ThousandEyes Dashboard, go to Network & App Synthetics > Agent Settings 2. Go to Notifications under Enterprise Agents and click on Notifications 3. Go to Notifications and create a new webhook notification 4. Give it a name and set the URL as {keep_webhook_api_url}&api_key={api_key} 5. Select Auth Type as None and Add New Webhook 6. Now, you have successfully configured ThousandEyes to send alerts to Keep """ PROVIDER_DISPLAY_NAME = "ThousandEyes" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring", "Incident Management", "Cloud Infrastructure"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="User is Authenticated", ) ] SEVERITY_MAP = { "info": AlertSeverity.INFO, "minor": AlertSeverity.WARNING, "major": AlertSeverity.HIGH, "critical": AlertSeverity.CRITICAL, } # Thousandeyes only supports severity. We map severity to status. 
STATUS_MAP = { "info": AlertStatus.PENDING, "minor": AlertStatus.ACKNOWLEDGED, "major": AlertStatus.FIRING, "critical": AlertStatus.FIRING, } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Thousandeyes provider. """ self.authentication_config = ThousandeyesProviderAuthConfig( **self.config.authentication ) def validate_scopes(self): """ Validates required scopes for Thousandeyes provider. """ self.logger.info("Validating scopes for Thousandeyes provider") try: response = requests.get( "https://api.thousandeyes.com/v7/alerts", headers=self._generate_auth_headers(), ) response.raise_for_status() if response.status_code == 200: self.logger.info( "Successfully validated scopes for Thousandeyes provider" ) return {"authenticated": True} except requests.exceptions.HTTPError as e: self.logger.exception( "Error while validating scopes", extra={"error": str(e)} ) return {"authenticated": str(e)} def _generate_auth_headers(self): """ Generate authentication headers for Thousandeyes. 
""" return {"Authorization": "Bearer " + self.authentication_config.oauth2_token} def _get_alerts(self) -> list[AlertDto]: """ Get alerts from Thousandeyes """ self.logger.info("Getting alerts from Thousandeyes") try: response = requests.get( "https://api.thousandeyes.com/v7/alerts", headers=self._generate_auth_headers(), ) response.raise_for_status() if response.status_code == 200: alerts = response.json().get("alerts", []) alertDtos = [] for alert in alerts: id = alert.get("id") alertId = alert.get("alertId") name = alert.get("id") description = alert.get("id") ruleId = alert.get("ruleId") alertRuleId = alert.get("alertRuleId") state = alert.get("state", "Unable to fetch state") alertState = alert.get("alertState", "Unable to fetch alert state") dateStart = alert.get("dateStart") startDate = alert.get("startDate") startedAt = alert.get("startDate") lastReceived = alert.get("startDate") alertType = alert.get("alertType", "Unable to fetch alert type") severity = ThousandeyesProvider.SEVERITY_MAP.get( alert.get("alertSeverity"), AlertSeverity.INFO ) status = ThousandeyesProvider.STATUS_MAP.get( alert.get("alertSeverity"), AlertStatus.PENDING ) violationCount = alert.get( "violationCount", "Unable to fetch violation count" ) duration = alert.get("duration", "Unable to fetch duration") apiLinks = alert.get("apiLinks", []) url = ( apiLinks[0].get("href", "http://unable-to-fetch-url") if apiLinks else "http://unable-to-fetch-url" ) url2 = ( apiLinks[1].get("href", "http://unable-to-fetch-url") if len(apiLinks) > 1 else "http://unable-to-fetch-url" ) permalink = alert.get("permalink", "Unable to fetch permalink") suppressed = alert.get("suppressed", "Unable to fetch suppressed") meta = alert.get("meta", {}) links = alert.get("_links", {}) alertDto = AlertDto( id=id, alertId=alertId, name=name, description=description, ruleId=ruleId, alertRuleId=alertRuleId, state=state, alertState=alertState, dateStart=dateStart, startDate=startDate, startedAt=startedAt, 
lastReceived=lastReceived, alertType=alertType, severity=severity, status=status, violationCount=violationCount, duration=duration, apiLinks=apiLinks, url=url, url2=url2, permalink=permalink, suppressed=suppressed, meta=meta, links=links, source=["thousandeyes"], ) alertDtos.append(alertDto) return alertDtos except Exception as e: self.logger.exception("Error while getting alerts") raise Exception("Error while getting alerts from Thousandeyes", str(e)) @staticmethod def _format_alert( event: dict, provider_instance: "BaseProvider" = None ) -> AlertDto | list[AlertDto]: """ Format alert from Thousandeyes. """ alertData = event.get("alert", {}) id = event.get("eventId") description = alertData.get("ruleExpression", "Unable to fetch description") severity_value = alertData.get("severity", "info").lower() severity = ThousandeyesProvider.SEVERITY_MAP.get( severity_value, AlertSeverity.INFO ) status = ThousandeyesProvider.STATUS_MAP.get( severity_value, AlertStatus.PENDING ) name = alertData.get("ruleName", "Unable to fetch test name") dateStartZoned = alertData.get( "dateStartZoned", "Unable to fetch date start zoned" ) agentId = alertData.get("agent", {}).get("agentId", "Unable to fetch agent id") ipAddress = alertData.get("ipAddress", "Unable to fetch ip address") agentName = alertData.get("agentName", "Unable to fetch agent name") ruleExpression = alertData.get( "ruleExpression", "Unable to fetch rule expression" ) alert_type = alertData.get("type", "Unable to fetch alert type") ruleAid = alertData.get("ruleAid", "Unable to fetch rule aid") hostname = alertData.get("hostname", "Unable to fetch hostname") dateStart = alertData.get("dateStart", "Unable to fetch date start") ruleName = alertData.get("ruleName", "Unable to fetch rule name") ruleId = alertData.get("ruleId", "Unable to fetch rule id") alertId = alertData.get("alertId", "Unable to fetch alert id") eventType = event.get("eventType", "Unable to fetch event type") apiLinks = alertData.get("apiLinks", []) url = 
( apiLinks[0].get("href", "http://unable-to-fetch-url") if apiLinks else "http://unable-to-fetch-url" ) url2 = ( apiLinks[1].get("href", "http://unable-to-fetch-url") if len(apiLinks) > 1 else "http://unable-to-fetch-url" ) testLabels = alertData.get("testLabels", []) active = alertData.get("active", "Unable to fetch active") dateEnd = alertData.get("dateEnd", "Unable to fetch date end") agents = alertData.get("agents", []) testTargetsDescription = alertData.get("testTargetsDescription", []) violationCount = alertData.get( "violationCount", "Unable to fetch violation count" ) dateEndZoned = alertData.get("dateEndZoned", "Unable to fetch date end zoned") testId = alertData.get("testId", "Unable to fetch test id") permalink = alertData.get("permalink", "Unable to fetch permalink") testName = alertData.get("testName", "Unable to fetch test name") alert = AlertDto( id=id, description=description, severity=severity, status=status, name=name, dateStartZoned=dateStartZoned, agentId=agentId, ipAddress=ipAddress, agentName=agentName, ruleExpression=ruleExpression, alert_type=alert_type, ruleAid=ruleAid, hostname=hostname, dateStart=dateStart, ruleName=ruleName, ruleId=ruleId, alertId=alertId, eventType=eventType, apiLinks=apiLinks, url=url, url2=url2, testLabels=testLabels, active=active, dateEnd=dateEnd, agents=agents, testTargetsDescription=testTargetsDescription, violationCount=violationCount, dateEndZoned=dateEndZoned, testId=testId, permalink=permalink, testName=testName, source=["thousandeyes"], ) return alert if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os oauth2_token = os.getenv("THOUSANDEYES_OAUTH2_TOKEN") config = ProviderConfig( description="Thousandeyes provider", authentication={"oauth2_token": oauth2_token}, ) provider = ThousandeyesProvider(context_manager, "thousandeyes", config) alerts = 
class TrelloProvider(BaseProvider):
    """Enrich alerts with data from Trello."""

    PROVIDER_DISPLAY_NAME = "Trello"
    PROVIDER_CATEGORY = ["Collaboration"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a TrelloProviderAuthConfig."""
        self.authentication_config = TrelloProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _query(self, board_id: str = "", filter: str = "createCard", **kwargs: dict):
        """
        Fetch the actions of a Trello board, restricted to one action type.

        Args:
            board_id (str): Trello board ID
            filter (str): Trello action filter

        Returns:
            dict: ``{"cards": <decoded JSON response>, "number_of_cards": <its length>}``

        Raises:
            ProviderException: if ``board_id`` is empty or Trello answers with
                an error status.
        """
        self.logger.debug("Fetching data from Trello")

        if not board_id:
            # An empty id would request ".../boards//actions"; fail early with
            # a clear message instead.
            raise ProviderException(
                f"{self.__class__.__name__} failed to fetch data from Trello: board_id is required"
            )

        # Pass key/token/filter as params so requests URL-encodes them rather
        # than splicing raw values (including secrets) into the URL string.
        response = requests.get(
            f"https://api.trello.com/1/boards/{board_id}/actions",
            params={
                "key": self.authentication_config.api_key,
                "token": self.authentication_config.api_token,
                "filter": filter,
            },
        )
        if not response.ok:
            raise ProviderException(
                f"{self.__class__.__name__} failed to fetch data from Trello: {response.text}"
            )
        self.logger.debug("Fetched data from Trello")

        cards = response.json()
        return {"cards": cards, "number_of_cards": len(cards)}
""" import dataclasses import pydantic from twilio.base.exceptions import TwilioRestException from twilio.rest import Client from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class TwilioProviderAuthConfig: """Twilio authentication configuration.""" account_sid: str = dataclasses.field( metadata={ "required": True, "description": "Twilio Account SID", "sensitive": False, "documentation_url": "https://support.twilio.com/hc/en-us/articles/223136027-Auth-Tokens-and-How-to-Change-Them", } ) api_token: str = dataclasses.field( metadata={ "required": True, "description": "Twilio API Token", "sensitive": True, "documentation_url": "https://support.twilio.com/hc/en-us/articles/223136027-Auth-Tokens-and-How-to-Change-Them", } ) from_phone_number: str = dataclasses.field( metadata={ "required": True, "description": "Twilio Phone Number", "sensitive": False, "documentation_url": "https://www.twilio.com/en-us/guidelines/regulatory", } ) class TwilioProvider(BaseProvider): """Send SMS via Twilio.""" PROVIDER_DISPLAY_NAME = "Twilio" PROVIDER_CATEGORY = ["Collaboration"] PROVIDER_SCOPES = [ ProviderScope( name="send_sms", description="The API token has permission to send the SMS", mandatory=True, alias="Send SMS", ) ] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_scopes(self) -> dict[str, bool | str]: validated_scopes = {} twilio_client = Client( self.authentication_config.account_sid, self.authentication_config.api_token, ) try: # from: 15005550006 is a magic number according to https://www.twilio.com/docs/messaging/tutorials/automate-testing twilio_client.messages.create( from_="+15005550006", to="+5571981265131", 
body="scope test", ) validated_scopes["send_sms"] = True except TwilioRestException as e: # unfortunately, there is no API to get the enabled region, so we just try US and if it fails on "enabled for the region" # we assume the creds are valid but the region is not enabled (and that's ok) if "SMS has not been enabled for the region" in str(e): self.logger.debug( "Twilio SMS is not enabled for the region, but that's ok" ) validated_scopes["send_sms"] = True else: self.logger.warning( "Failed to validate scope send_sms", extra={"reason": str(e)}, ) validated_scopes["send_sms"] = str(e) # other unknown exception except Exception as e: self.logger.warning( "Failed to validate scope send_sms", extra={"reason": str(e)}, ) validated_scopes["send_sms"] = str(e) return validated_scopes def validate_config(self): self.authentication_config = TwilioProviderAuthConfig( **self.config.authentication ) def dispose(self): """ No need to dispose of anything, so just do nothing. """ pass def _notify( self, message_body: str = "", to_phone_number: str = "", **kwargs: dict ): """ Send an SMS notification using Twilio API. Args: message_body (str, optional): The content of the SMS message to be sent. Defaults to "". to_phone_number (str, optional): The recipient's phone number. Defaults to "". 
""" # extract the required params self.logger.debug("Notifying alert SMS via Twilio") if not to_phone_number: raise ProviderException( f"{self.__class__.__name__} failed to notify alert SMS via Twilio: to_phone_number is required" ) twilio_client = Client( self.authentication_config.account_sid, self.authentication_config.api_token ) try: self.logger.debug("Sending SMS via Twilio") twilio_client.messages.create( from_=self.authentication_config.from_phone_number, to=to_phone_number, body=message_body, ) self.logger.debug("SMS sent via Twilio") except Exception as e: self.logger.warning( "Failed to send SMS via Twilio", extra={"reason": str(e)} ) raise ProviderException( f"{self.__class__.__name__} failed to notify alert SMS via Twilio: {e}" ) if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) # Load environment variables import os twilio_api_token = os.environ.get("TWILIO_API_TOKEN") twilio_account_sid = os.environ.get("TWILIO_ACCOUNT_SID") twilio_from_phone_number = os.environ.get("TWILIO_FROM_PHONE_NUMBER") twilio_to_phone_number = os.environ.get("TWILIO_TO_PHONE_NUMBER") # Initialize the provider and provider config config = ProviderConfig( description="Twilio Input Provider", authentication={ "api_token": twilio_api_token, "account_sid": twilio_account_sid, "from_phone_number": twilio_from_phone_number, }, ) provider = TwilioProvider(context_manager, provider_id="twilio", config=config) provider.validate_scopes() # Send SMS provider.notify( message_body="Keep Alert", to_phone_number=twilio_to_phone_number, ) ================================================ FILE: keep/providers/uptimekuma_provider/__init__.py ================================================ ================================================ FILE: keep/providers/uptimekuma_provider/uptimekuma_provider.py 
class UptimekumaProvider(BaseProvider):
    """
    Pull monitor heartbeats from UptimeKuma and format UptimeKuma webhook
    events as Keep alerts.
    """

    PROVIDER_DISPLAY_NAME = "UptimeKuma"
    PROVIDER_TAGS = ["alert"]
    PROVIDER_CATEGORY = ["Monitoring"]

    PROVIDER_SCOPES = [
        ProviderScope(
            name="alerts",
            description="Read alerts from UptimeKuma",
        )
    ]

    # Heartbeat status (string or int form) -> Keep alert status value.
    STATUS_MAP = {
        # Possible firing
        "down": AlertStatus.FIRING.value,
        "unavailable": AlertStatus.FIRING.value,
        "firing": AlertStatus.FIRING.value,
        "0": AlertStatus.FIRING.value,
        0: AlertStatus.FIRING.value,
        # RESOLVED
        "up": AlertStatus.RESOLVED.value,
        "available": AlertStatus.RESOLVED.value,
        "1": AlertStatus.RESOLVED.value,
        1: AlertStatus.RESOLVED.value,
        "resolved": AlertStatus.RESOLVED.value,
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    @staticmethod
    def _safe_disconnect(api):
        """Best-effort disconnect; a failure while closing must not mask the real error."""
        try:
            api.disconnect()
        except Exception:
            pass

    def _get_api(self):
        """
        Open a connection to UptimeKuma and log in.

        Returns the logged-in client; the caller is responsible for calling
        ``disconnect()`` on it. If login fails the connection is closed before
        the error is re-raised.
        """
        api = UptimeKumaApi(self.authentication_config.host_url)
        try:
            api.login(
                self.authentication_config.username,
                self.authentication_config.password,
            )
        except Exception:
            self._safe_disconnect(api)
            raise
        return api

    def dispose(self):
        pass

    def validate_scopes(self):
        """
        Validate that the scopes provided in the config are valid

        Returns:
            dict: ``{"alerts": True}`` when login returns a token,
            ``{"alerts": False}`` when it does not, and
            ``{"alerts": <error text>}`` when connecting or logging in raises.
        """
        api = None
        try:
            api = UptimeKumaApi(self.authentication_config.host_url)
            response = api.login(
                self.authentication_config.username,
                self.authentication_config.password,
            )
        except Exception as e:
            # Report the failure as the scope result instead of raising.
            self.logger.exception("Error validating scopes for UptimeKuma")
            return {"alerts": str(e)}
        finally:
            if api is not None:
                self._safe_disconnect(api)

        if "token" in response:
            return {"alerts": True}
        return {"alerts": False}

    def validate_config(self):
        """Parse the raw authentication dict into an UptimekumaProviderAuthConfig."""
        self.authentication_config = UptimekumaProviderAuthConfig(
            **self.config.authentication
        )

    def _get_heartbeats(self):
        """
        Build one AlertDto from the latest heartbeat of every monitor.

        Returns:
            list[AlertDto]: possibly empty.

        Raises:
            Exception: wrapping any failure while talking to UptimeKuma.
        """
        api = None
        try:
            api = self._get_api()
            response = api.get_heartbeats()

            heartbeats = []
            for key in response:
                monitor_heartbeats = response[key]
                if not monitor_heartbeats:
                    # Nothing recorded for this monitor, so there is no
                    # "latest" entry to read.
                    continue
                heartbeat = monitor_heartbeats[-1]
                # The id may arrive under either spelling; use the resolved
                # value everywhere below.
                monitor_id = heartbeat.get("monitor_id", heartbeat.get("monitorID"))
                try:
                    name = api.get_monitor(monitor_id)["name"]
                except BadNamespaceError:
                    # Most likely connection issues
                    self._safe_disconnect(api)
                    # Single retry
                    api = self._get_api()
                    name = api.get_monitor(monitor_id)["name"]

                heartbeats.append(
                    AlertDto(
                        id=heartbeat["id"],
                        name=name,
                        monitor_id=monitor_id,
                        description=heartbeat["msg"],
                        status=self.STATUS_MAP.get(heartbeat["status"], "firing"),
                        lastReceived=self._format_datetime(
                            heartbeat["localDateTime"], heartbeat["timezoneOffset"]
                        ),
                        ping=heartbeat["ping"],
                        source=["uptimekuma"],
                    )
                )
            return heartbeats

        except Exception as e:
            self.logger.error("Error getting heartbeats from UptimeKuma: %s", e)
            raise Exception(f"Error getting heartbeats from UptimeKuma: {e}") from e
        finally:
            # Close the connection on every path: success, empty result, error.
            if api is not None:
                self._safe_disconnect(api)

    def _get_alerts(self) -> list[AlertDto]:
        """Return the current heartbeats as alerts; failures are re-raised as Exception."""
        try:
            self.logger.info("Collecting alerts (heartbeats) from UptimeKuma")
            alerts = self._get_heartbeats()
            return alerts
        except Exception as e:
            self.logger.error("Error getting alerts from UptimeKuma: %s", e)
            raise Exception(f"Error getting alerts from UptimeKuma: {e}") from e

    @classmethod
    def _format_alert(
        cls, event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto:
        """
        Format an UptimeKuma webhook event into an AlertDto.

        Reads ``event["monitor"]`` (id, name, url), ``event["heartbeat"]``
        (status, localDateTime, timezoneOffset, msg) and ``event["msg"]``.
        """
        alert = AlertDto(
            id=event["monitor"]["id"],
            name=event["monitor"]["name"],
            monitor_url=event["monitor"]["url"],
            status=cls.STATUS_MAP.get(event["heartbeat"]["status"], "firing"),
            description=event["msg"],
            lastReceived=cls._format_datetime(
                event["heartbeat"]["localDateTime"],
                event["heartbeat"]["timezoneOffset"],
            ),
            msg=event["heartbeat"]["msg"],
            source=["uptimekuma"],
        )
        return alert

    @staticmethod
    def _format_datetime(dt, offset):
        """
        Join a timestamp with its offset by ``dt + offset``.

        NOTE(review): presumably both are strings (e.g. a local time plus
        "+02:00") - confirm against the payload. A missing/empty offset
        returns ``dt`` unchanged instead of raising.
        """
        if not offset:
            return dt
        return dt + offset
class VectordevProvider(BaseProvider):
    """
    Receive events forwarded by Vector and turn them into Keep alerts.

    Events tagged with a known ``keep_source_type`` are delegated to the
    matching Keep provider; everything else becomes a generic alert carrying
    the JSON-encoded payload.
    """

    PROVIDER_DISPLAY_NAME = "Vector"
    PROVIDER_CATEGORY = ["Monitoring", "Developer Tools"]
    PROVIDER_COMING_SOON = True

    # Mapping from vector sources to keep providers
    SOURCE_TO_PROVIDER_MAP = {
        "prometheus": "prometheus",
    }

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Parse the raw authentication dict into a VectordevProviderAuthConfig."""
        self.authentication_config = VectordevProviderAuthConfig(
            **self.config.authentication
        )

    @staticmethod
    def _format_alert(
        event: dict, provider_instance: "BaseProvider" = None
    ) -> AlertDto | list[AlertDto]:
        """
        Format one Vector event, or a list of them, into AlertDtos.

        Declared static because the signature takes no ``self``; without the
        decorator a call through an instance would bind ``event`` to it.

        Args:
            event: a single event dict or a list of event dicts. The payload
                is read from ``message`` and falls back to ``event``.
            provider_instance: forwarded to the delegated provider.

        Returns:
            list[AlertDto]: always a list, one or more entries per event.
        """
        events = event if isinstance(event, list) else [event]

        alert_dtos = []
        for e in events:
            payload = e.get("message", e.get("event"))
            provider_type = VectordevProvider.SOURCE_TO_PROVIDER_MAP.get(
                e.get("keep_source_type")
            )

            if provider_type is not None:
                provider_class = ProvidersFactory.get_provider_class(provider_type)
                formatted = provider_class._format_alert(payload, provider_instance)
                # Providers may return one AlertDto or a list; extend() on a
                # single object would iterate over it instead of adding it.
                if isinstance(formatted, list):
                    alert_dtos.extend(formatted)
                else:
                    alert_dtos.append(formatted)
            else:
                message_str = json.dumps(payload)
                alert_dtos.append(
                    AlertDto(
                        name="",
                        message=message_str,
                        description=message_str,
                        lastReceived=e.get("timestamp"),
                        source_type=e.get("source_type"),
                        source=["vectordev"],
                        # Same payload as `message`, so events delivered
                        # under the "event" key are not recorded as None.
                        original_event=payload,
                    )
                )
        return alert_dtos

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    @classmethod
    def simulate_alert(cls, **kwargs) -> dict:
        """Delegate alert simulation to a randomly chosen mapped provider."""
        provider = random.choice(
            list(VectordevProvider.SOURCE_TO_PROVIDER_MAP.values())
        )
        provider_class = ProvidersFactory.get_provider_class(provider)
        return provider_class.simulate_alert(to_wrap_with_provider_type=True)
@pydantic.dataclasses.dataclass
class VictorialogsProviderAuthConfig:
    """
    VictoriaLogsProviderAuthConfig is a class that allows you to authenticate in VictoriaLogs.

    Only ``host_url`` is required. ``authentication_type`` names one of three
    modes; the username/password fields are grouped under
    "basic_authentication" and the token / org-id fields under
    "bearer_token" via their ``config_sub_group`` metadata.
    """

    # Base URL of the VictoriaLogs instance; validated as an http(s) URL.
    host_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "VictoriaLogs Host URL",
            "hint": "e.g. https://victorialogs.example.com",
            "sensitive": False,
            "validation": "any_http_url",
        }
    )

    # One of "NoAuth", "Basic" or "Bearer"; defaults to "NoAuth".
    authentication_type: typing.Literal["NoAuth", "Basic", "Bearer"] = (
        dataclasses.field(
            default=typing.cast(typing.Literal["NoAuth", "Basic", "Bearer"], "NoAuth"),
            metadata={
                "required": True,
                "description": "Authentication Type",
                "type": "select",
                "options": ["NoAuth", "Basic", "Bearer"],
            },
        )
    )

    # Basic Authentication
    username: typing.Optional[str] = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "HTTP basic authentication - Username",
            "sensitive": False,
            "config_sub_group": "basic_authentication",
            "config_main_group": "authentication",
        },
    )

    password: typing.Optional[str] = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "HTTP basic authentication - Password",
            "sensitive": True,
            "config_sub_group": "basic_authentication",
            "config_main_group": "authentication",
        },
    )

    # Bearer Token
    bearer_token: typing.Optional[str] = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "Bearer Token",
            "sensitive": True,
            "config_sub_group": "bearer_token",
            "config_main_group": "authentication",
        },
    )

    # Optional value for the X-Scope-OrgID header; shares the bearer-token group.
    x_scope_orgid: typing.Optional[str] = dataclasses.field(
        default=None,
        metadata={
            "required": False,
            "description": "X-Scope-OrgID Header",
            "sensitive": False,
            "config_sub_group": "bearer_token",
            "config_main_group": "authentication",
        },
    )

    # When True, TLS certificate verification is skipped; defaults to False.
    insecure: bool = dataclasses.field(
        default=False,
        metadata={
            "name": "insecure",
            "description": "Skip TLS verification",
            "required": False,
            "sensitive": False,
            "type": "switch",
        },
    )
""" PROVIDER_DISPLAY_NAME = "VictoriaLogs" PROVIDER_TAGS = ["alert"] PROVIDER_SCOPES = [ ProviderScope( name="authenticated", description="The instance is valid and the user is authenticated", ), ] PROVIDER_CATEGORY = ["Monitoring"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): pass def validate_config(self): """ Validate the configuration of the provider. """ self.authentication_config = VictorialogsProviderAuthConfig( **self.config.authentication ) def validate_scopes(self): """ Validate the scopes of the provider. """ try: url = self._get_url("/") response = requests.get( url=url, headers=self.generate_auth_headers(), verify=not self.authentication_config.insecure, ) if response.status_code != 200: response.raise_for_status() self.logger.info("Successfully validate scopes") return {"authenticated": True} except Exception as e: self.logger.exception("Failed to validate scopes", extra={"error": str(e)}) return {"authenticated": str(e)} def _get_url(self, endpoint: str): return f"{self.authentication_config.host_url}{endpoint}" def generate_auth_headers(self): """ Generate the authentication headers. """ if self.authentication_config.authentication_type == "Basic": credentials = f"{self.authentication_config.username}:{self.authentication_config.password}".encode( "utf-8" ) encoded_credentials = base64.b64encode(credentials).decode("utf-8") return {"Authorization": f"Basic {encoded_credentials}"} if self.authentication_config.authentication_type == "Bearer": headers = {} if self.authentication_config.bearer_token: headers["Authorization"] = ( f"Bearer {self.authentication_config.bearer_token}" ) if self.authentication_config.x_scope_orgid: headers["X-Scope-OrgID"] = self.authentication_config.x_scope_orgid return headers def _convert_to_json(self, response: str) -> dict: """ Convert the response string to JSON. 
""" if "\n" in response: log_lines = response.split("\n") log_entries = [json.loads(line) for line in log_lines if line.strip()] else: log_entries = json.loads(response) return log_entries def _query( self, queryType="", query="", time="", start="", end="", step="", account_id="", project_id="", limit="", timeout="", **kwargs: dict, ) -> dict: """ Query logs from VictoriaLogs. """ if queryType == "query": url = self._get_url("/select/logsql/query") params = {"query": query, "limit": limit, "timeout": timeout} params = {k: v for k, v in params.items() if v} headers = self.generate_auth_headers() headers.update({"AccountID": account_id, "ProjectID": project_id}) headers = {k: v for k, v in headers.items() if v} response = requests.post( url=url, data=params, headers=headers, verify=not self.authentication_config.insecure, ) try: response.raise_for_status() return self._convert_to_json(response.text) except Exception as e: self.logger.exception("Failed to query logs") raise Exception( "Could not query logs from VictoriaLogs on /query endpoint: ", str(e), ) elif queryType == "hits": url = self._get_url("/select/logsql/hits") params = {"query": query, "start": start, "end": end, "step": step} params = {k: v for k, v in params.items() if v} headers = self.generate_auth_headers() headers.update({"AccountID": account_id, "ProjectID": project_id}) headers = {k: v for k, v in headers.items() if v} response = requests.post( url=url, data=params, headers=headers, verify=not self.authentication_config.insecure, ) try: response.raise_for_status() return self._convert_to_json(response.text) except Exception as e: self.logger.exception("Failed to query logs") raise Exception( "Could not query logs from VictoriaLogs on /hits endpoint: ", str(e) ) elif queryType == "stats_query": url = self._get_url("/select/logsql/stats_query") params = {"query": query, "time": time} params = {k: v for k, v in params.items() if v} response = requests.post( url=url, data=params, 
headers=self.generate_auth_headers(), verify=not self.authentication_config.insecure, ) try: response.raise_for_status() return self._convert_to_json(response.text) except Exception as e: self.logger.exception("Failed to query logs") raise Exception( "Could not query logs from VictoriaLogs on /stats_query endpoint: ", str(e), ) elif queryType == "stats_query_range": url = self._get_url("/select/logsql/stats_query_range") params = {"query": query, "start": start, "end": end, "step": step} params = {k: v for k, v in params.items() if v} response = requests.post( url=url, data=params, headers=self.generate_auth_headers(), verify=not self.authentication_config.insecure, ) try: response.raise_for_status() return self._convert_to_json(response.text) except Exception as e: self.logger.exception("Failed to query logs") raise Exception( "Could not query logs from VictoriaLogs on /stats_query_range endpoint: ", str(e), ) else: self.logger.exception("Invalid queryType") raise Exception("Invalid queryType") if __name__ == "__main__": import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) import os victorialogs_host_url = os.getenv("VICTORIALOGS_HOST_URL") config = ProviderConfig( description="VictoriaLogs Provider", authentication={ "host_url": victorialogs_host_url, }, ) provider = VictorialogsProvider(context_manager, "victorialogs", config) logs = provider._query(queryType="query", query="error") print(logs) ================================================ FILE: keep/providers/victoriametrics_provider/README.md ================================================ ## Guide to deploy VictoriaMetrics using docker ### 1. Clone the repository ```bash git clone https://github.com/VictoriaMetrics/VictoriaMetrics.git ``` ### 2. Change the directory to docker ```bash cd deployment/docker ``` ### 3. 
@pydantic.dataclasses.dataclass
class VictoriametricsProviderAuthConfig:
    """
    VictoriaMetrics authentication configuration.
    Both VMAlert and VM Backend are optional, but at least one must be configured.

    Each service can be addressed either by a full URL (``VM*URL``) or by a
    host plus port (``VM*Host`` / ``VM*Port``).
    """

    # VMAlert Configuration
    VMAlertHost: AnyHttpUrl | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "The hostname or IP address where VMAlert is running",
            "hint": "Example: 'http://localhost', 'http://192.168.1.100'",
            "validation": "any_http_url",
            "config_sub_group": "vmalert",
            "config_main_group": "address",
        },
        default=None,
    )

    VMAlertPort: UrlPort = dataclasses.field(
        metadata={
            "required": False,
            "description": "The port number on which VMAlert is listening",
            "hint": "Example: 8880",
            "validation": "port",
            "config_sub_group": "vmalert",
            "config_main_group": "address",
        },
        default=8880,
    )

    VMAlertURL: AnyHttpUrl | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "The full URL to the VMAlert instance. Alternative to Host/Port",
            "hint": "Example: 'http://vmalert.mydomain.com:8880'",
            "validation": "any_http_url",
            "config_sub_group": "vmalert",
            "config_main_group": "address",
        },
        default=None,
    )

    # VM Backend Configuration
    VMBackendHost: AnyHttpUrl | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "The hostname or IP address where VictoriaMetrics backend is running",
            "hint": "Example: 'http://localhost', 'http://192.168.1.100'",
            "validation": "any_http_url",
            "config_sub_group": "vmbackend",
            "config_main_group": "address",
        },
        default=None,
    )

    VMBackendPort: UrlPort = dataclasses.field(
        metadata={
            "required": False,
            "description": "The port number on which VictoriaMetrics backend is listening",
            "hint": "Example: 8428",
            "validation": "port",
            "config_sub_group": "vmbackend",
            "config_main_group": "address",
        },
        default=8428,
    )

    VMBackendURL: AnyHttpUrl | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "The full URL to the VictoriaMetrics backend. Alternative to Host/Port",
            "hint": "Example: 'http://vm.mydomain.com:8428'",
            "validation": "any_http_url",
            "config_sub_group": "vmbackend",
            "config_main_group": "address",
        },
        default=None,
    )

    # Auth Configuration (declared once; shared by VMAlert and VM Backend requests)
    BasicAuthUsername: str | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "Username for basic authentication",
            "config_sub_group": "auth",
            "config_main_group": "authentication",
        },
        default=None,
    )

    BasicAuthPassword: str | None = dataclasses.field(
        metadata={
            "required": False,
            "description": "Password for basic authentication",
            "config_sub_group": "auth",
            "config_main_group": "authentication",
            "sensitive": True,
        },
        default=None,
    )

    # When True, scope validation returns "connected" without contacting any service.
    SkipValidation: bool = dataclasses.field(
        metadata={
            "required": False,
            "description": "Enter 'true' to skip validation of authentication",
            "config_sub_group": "validation",
            "config_main_group": "validation",
        },
        default=False,
    )

    # When True, requests are sent with TLS certificate verification disabled.
    insecure: bool = dataclasses.field(
        default=False,
        metadata={
            "name": "insecure",
            "description": "Skip TLS verification",
            "required": False,
            "sensitive": False,
            "type": "switch",
        },
    )
Use the following template to configure AlertManager:" webhook_template = """route: receiver: "keep" group_by: ['alertname'] group_wait: 15s group_interval: 15s repeat_interval: 1m continue: true receivers: - name: "keep" webhook_configs: - url: '{keep_webhook_api_url}' send_resolved: true http_config: basic_auth: username: api_key password: {api_key} """ PROVIDER_SCOPES = [ ProviderScope( name="connected", description="The user can connect to the client", mandatory=True, alias="Connect to the client", ), ] PROVIDER_CATEGORY = ["Monitoring"] SEVERITIES_MAP = { "critical": AlertSeverity.CRITICAL, "high": AlertSeverity.HIGH, "warning": AlertSeverity.WARNING, "low": AlertSeverity.LOW, "test": AlertSeverity.INFO, "info": AlertSeverity.INFO, } STATUS_MAP = { "firing": AlertStatus.FIRING, "resolved": AlertStatus.RESOLVED, "acknowledged": AlertStatus.ACKNOWLEDGED, "suppressed": AlertStatus.SUPPRESSED, "pending": AlertStatus.PENDING, } def _get_auth(self): """Get basic auth tuple if credentials are configured.""" if ( self.authentication_config.BasicAuthUsername and self.authentication_config.BasicAuthPassword ): return ( self.authentication_config.BasicAuthUsername, self.authentication_config.BasicAuthPassword, ) return None def validate_scopes(self) -> dict[str, bool | str]: """Validate scopes by checking configured services.""" results = [] if self.authentication_config.SkipValidation == True: return {"connected": True} if self.vmalert_enabled: vmalert_response = requests.get( self.vmalert_host, auth=self._get_auth(), verify=not self.authentication_config.insecure, ) if vmalert_response.status_code == 200: self.logger.info("Connected to VMAlert successfully") else: results.append(f"VMAlert error: {vmalert_response.status_code}") self.logger.error( "Error connecting to VMAlert", extra={"status_code": vmalert_response.status_code}, ) if self.vmbackend_enabled: vmbackend_response = requests.get( self.vmbackend_host, auth=self._get_auth(), verify=not 
self.authentication_config.insecure, ) if vmbackend_response.status_code == 200: self.logger.info("Connected to VM Backend successfully") else: results.append(f"VM Backend error: {vmbackend_response.status_code}") self.logger.error( "Error connecting to VM Backend", extra={"status_code": vmbackend_response.status_code}, ) return { "connected": True if not results else ", ".join(results), } def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): self._vmalert_host = None self._vmbackend_host = None super().__init__(context_manager, provider_id, config) def dispose(self): """ Dispose the provider. """ pass def validate_config(self): """ Validates required configuration for Victoriametrics provider. At least one service (VMAlert or VM Backend) must be configured. """ self.authentication_config = VictoriametricsProviderAuthConfig( **self.config.authentication ) vmalert_configured = ( self.authentication_config.VMAlertURL is not None or self.authentication_config.VMAlertHost is not None ) vmbackend_configured = ( self.authentication_config.VMBackendURL is not None or self.authentication_config.VMBackendHost is not None ) if not vmalert_configured and not vmbackend_configured: raise Exception("At least one of VMAlert or VM Backend must be configured") @property def vmalert_enabled(self) -> bool: """Check if VMAlert is configured.""" return ( self.authentication_config.VMAlertURL is not None or self.authentication_config.VMAlertHost is not None ) @property def vmbackend_enabled(self) -> bool: """Check if VM Backend is configured.""" return ( self.authentication_config.VMBackendURL is not None or self.authentication_config.VMBackendHost is not None ) @property def vmalert_host(self): """Get the VMAlert host URL.""" # Return cached host if available if self._vmalert_host: return self._vmalert_host.rstrip("/") # Skip if VMAlert is not configured if not self.vmalert_enabled: return None host = None if 
@property
def vmbackend_host(self):
    """
    Get the VM Backend host URL.

    Returns:
        The base URL without a trailing slash, or ``None`` when VM Backend is
        not configured. ``VMBackendURL`` takes precedence over
        ``VMBackendHost``/``VMBackendPort``. When the configured value has no
        scheme, HTTPS is probed first and HTTP is used on an SSL error.
    """
    # Return cached host if available
    if self._vmbackend_host:
        return self._vmbackend_host.rstrip("/")

    # Skip if VM Backend is not configured
    if not self.vmbackend_enabled:
        return None

    config = self.authentication_config
    if config.VMBackendURL is not None:
        # str() so the string methods below work whatever URL type the config holds.
        host = str(config.VMBackendURL)
    else:
        # Drop a trailing slash before appending the port; otherwise
        # "http://host/" would become "http://host/:8428".
        host = f"{str(config.VMBackendHost).rstrip('/')}:{config.VMBackendPort}"

    # If HTTP/HTTPS is explicitly specified, use it
    if host.startswith(("http://", "https://")):
        self._vmbackend_host = host
        return host.rstrip("/")

    # Try HTTPS first, fall back to HTTP
    try:
        requests.get(
            f"https://{host}",
            auth=self._get_auth(),
            verify=not config.insecure,
        )
        self.logger.debug("Using HTTPS for VM Backend")
        self._vmbackend_host = f"https://{host}"
        return self._vmbackend_host.rstrip("/")
    except requests.exceptions.SSLError:
        self.logger.debug("Using HTTP for VM Backend")
        self._vmbackend_host = f"http://{host}"
        return self._vmbackend_host.rstrip("/")
    except Exception:
        # Probe failed for another reason: return the host as configured, uncached.
        return host.rstrip("/")
def _get_alerts(self) -> list[AlertDto]:
    """
    Get alerts from VMAlert.

    Fetches ``<vmalert_host>/api/v1/alerts`` and converts every entry of
    ``data.alerts`` into an ``AlertDto``.

    Returns:
        list[AlertDto]: One DTO per alert reported by VMAlert.

    Raises:
        Exception: If VMAlert is not configured, the request fails, or the
            response does not have the expected structure.
    """
    if not self.vmalert_enabled:
        raise Exception("VMAlert is not configured")

    response = requests.get(
        f"{self.vmalert_host}/api/v1/alerts",
        auth=self._get_auth(),
        verify=not self.authentication_config.insecure,
    )

    try:
        response.raise_for_status()
        payload = response.json()

        alerts = []
        for alert in payload["data"]["alerts"]:
            # Annotations and labels are rule-defined, so any of them may be
            # absent; read them defensively (as _format_alert does) so a single
            # sparse alert does not abort the whole pull with a KeyError.
            annotations = alert.get("annotations") or {}
            labels = alert.get("labels") or {}
            alerts.append(
                AlertDto(
                    name=alert["name"],
                    id=alert["id"],
                    description=annotations.get("description"),
                    message=annotations.get("summary"),
                    status=VictoriametricsProvider.STATUS_MAP.get(
                        alert["state"], AlertStatus.FIRING
                    ),
                    severity=VictoriametricsProvider.SEVERITIES_MAP.get(
                        labels.get("severity"), AlertSeverity.LOW
                    ),
                    startedAt=alert["activeAt"],
                    url=alert["source"],
                    source=["victoriametrics"],
                    event_id=alert["rule_id"],
                    labels=labels,
                )
            )
        return alerts
    except Exception:
        self.logger.exception("Failed to get alerts")
        raise
= self.vmbackend_host if queryType == "query": response = requests.get( f"{base_url}/api/v1/query", params={"query": query, "time": start}, auth=auth, verify=not self.authentication_config.insecure, ) try: response.raise_for_status() results = response.json() return results.get("data", {}).get("result", []) except Exception as e: self.logger.exception("Failed to perform instant query") raise e elif queryType == "query_range": response = requests.get( f"{base_url}/api/v1/query_range", params={"query": query, "start": start, "end": end, "step": step}, auth=auth, verify=not self.authentication_config.insecure, ) if response.status_code == 200: results = response.json() # return only the results return response.json() else: self.logger.error( "Failed to perform range query", extra=response.json() ) raise Exception("Could not range query") else: self.logger.error("Invalid query type") raise Exception("Invalid query type") if __name__ == "__main__": # Output debug messages import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) # Load environment variables import os from keep.providers.providers_factory import ProvidersFactory vmalerthost = os.environ.get("VMALERT_HOST") or "http://localhost:8880" user = os.environ.get("VMALERT_USER") or "admin" password = os.environ.get("VMALERT_PASSWORD") or "secret" context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = { "authentication": { "VMAlertURL": vmalerthost, "BasicAuthUsername": user, "BasicAuthPassword": password, }, } provider = ProvidersFactory.get_provider( context_manager, provider_id="vm-keephq", provider_type="victoriametrics", provider_config=config, ) alerts = provider.get_alerts() vmbackendhost = os.environ.get("VMBACKEND_HOST") or "http://localhost:8428" user = os.environ.get("VMBACKEND_USER") or "admin" password = os.environ.get("VMBACKEND_PASSWORD") or "secret" config = { "authentication": { "VMBackendURL": vmbackendhost, 
"BasicAuthUsername": user, "BasicAuthPassword": password, }, } provider = ProvidersFactory.get_provider( context_manager, provider_id="vm-keephq", provider_type="victoriametrics", provider_config=config, ) query = provider.query( query="avg(rate(process_cpu_seconds_total))", queryType="query" ) print(alerts) ================================================ FILE: keep/providers/vllm_provider/__init__.py ================================================ ================================================ FILE: keep/providers/vllm_provider/vllm_provider.py ================================================ import json import dataclasses import pydantic import requests from typing import Optional, Dict, Any from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig @pydantic.dataclasses.dataclass class VllmProviderAuthConfig: api_url: str = dataclasses.field( metadata={ "required": True, "description": "vLLM API endpoint URL", "sensitive": False, } ) api_key: str | None = dataclasses.field( metadata={ "required": False, "description": "Optional API key if your vLLM deployment requires authentication", "sensitive": True, }, default=None, ) class VllmProvider(BaseProvider): PROVIDER_DISPLAY_NAME = "vLLM" PROVIDER_CATEGORY = ["AI"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(self): self.authentication_config = VllmProviderAuthConfig( **self.config.authentication ) def dispose(self): pass def validate_scopes(self) -> dict[str, bool | str]: scopes = {} return scopes def _prepare_headers(self) -> Dict[str, str]: headers = {"Content-Type": "application/json"} if self.authentication_config.api_key: headers["Authorization"] = f"Bearer 
{self.authentication_config.api_key}" return headers def _format_messages(self, prompt: str) -> str: """Format the prompt in a chat-style format if needed.""" # You might want to customize this based on your model's requirements return prompt def _query( self, prompt: str, temperature: float = 0.7, model: str = "Qwen/Qwen1.5-1.8B-Chat", max_tokens: int = 1024, structured_output_format: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: headers = self._prepare_headers() formatted_prompt = self._format_messages(prompt) # Prepare the request payload payload = { "model": model, "prompt": formatted_prompt, "max_tokens": max_tokens, "temperature": temperature, } # Add structured output format if provided if structured_output_format: payload["guided_json"] = structured_output_format try: response = requests.post( self.authentication_config.api_url + "/v1/completions", headers=headers, json=payload, ) response.raise_for_status() # Parse the response result = response.json() # Extract the generated text from the response # Adjust this based on your vLLM API response structure try: generated_text = result["choices"][0]['text'] except KeyError: generated_text = "" # Try to parse as JSON if it's meant to be structured if structured_output_format: try: generated_text = json.loads(generated_text) except json.JSONDecodeError: raise ProviderException( f"Failed to parse generated text as JSON: {generated_text}. Model not following the structured output format. 
Response: {result}" ) return { "response": generated_text, } except requests.exceptions.RequestException as e: raise ProviderException(f"Error querying vLLM API: {str(e)}") if __name__ == "__main__": import os import logging logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()]) context_manager = ContextManager( tenant_id="singletenant", workflow_id="test", ) config = ProviderConfig( description="vLLM Provider", authentication={ "api_url": "http://localhost:8000/v1/completions", # Default vLLM API endpoint "api_key": os.environ.get("VLLM_API_KEY"), # Optional }, ) provider = VllmProvider( context_manager=context_manager, provider_id="vllm_provider", config=config, ) print( provider.query( prompt="Here is an alert, define environment for it: Clients are panicking, nothing works.", temperature=0, model="Qwen/Qwen1.5-1.8B-Chat", structured_output_format={ "type": "object", "properties": { "environment": { "type": "string", "enum": ["production", "debug", "pre-prod"], }, }, "required": ["environment"], }, max_tokens=100, ) ) ================================================ FILE: keep/providers/wazuh_provider/__init__.py ================================================ ================================================ FILE: keep/providers/wazuh_provider/alerts_mock.py ================================================ ALERTS = { "message": "New dpkg (Debian Package) installed.", "severity": "info", "description": "Rule ID 2902\nLevel 7\nAgent ID 001\nAgent Name test\nTitle New dpkg (Debian package) installed.\nFull Log 2025-02-03 21:41:42 status installed rsync:amd64 3.2.7-1+deb12u2\n", "created_at": "2025-02-03T21:41:42.853014+01.00", } ================================================ FILE: keep/providers/wazuh_provider/custom-keep ================================================ #!/bin/sh # This file is not intended to be executed on the Keep side. # It is stored in this repository to be served from GitHub for use within Wazuh. 
# Following: https://documentation.wazuh.com/current/user-manual/manager/integration-with-external-apis.html#creating-an-integration-script WPYTHON_BIN="framework/python/bin/python3" SCRIPT_PATH_NAME="$0" DIR_NAME="$(cd $(dirname ${SCRIPT_PATH_NAME}); pwd -P)" SCRIPT_NAME="$(basename ${SCRIPT_PATH_NAME})" case ${DIR_NAME} in */active-response/bin | */wodles*) if [ -z "${WAZUH_PATH}" ]; then WAZUH_PATH="$(cd ${DIR_NAME}/../..; pwd)" fi PYTHON_SCRIPT="${DIR_NAME}/${SCRIPT_NAME}.py" ;; */bin) if [ -z "${WAZUH_PATH}" ]; then WAZUH_PATH="$(cd ${DIR_NAME}/..; pwd)" fi PYTHON_SCRIPT="${WAZUH_PATH}/framework/scripts/$(echo ${SCRIPT_NAME} | sed 's/\-/_/g').py" ;; */integrations) if [ -z "${WAZUH_PATH}" ]; then WAZUH_PATH="$(cd ${DIR_NAME}/..; pwd)" fi PYTHON_SCRIPT="${DIR_NAME}/${SCRIPT_NAME}.py" ;; esac ${WAZUH_PATH}/${WPYTHON_BIN} ${PYTHON_SCRIPT} "$@" ================================================ FILE: keep/providers/wazuh_provider/custom-keep.py ================================================ # This file is not intended to be executed on the Keep side. # It is stored in this repository to be served from GitHub for use within Wazuh. # Following: https://documentation.wazuh.com/current/user-manual/manager/integration-with-external-apis.html#creating-an-integration-script import json import os import sys from datetime import datetime, timezone # Exit error codes ERR_NO_REQUEST_MODULE = 1 ERR_BAD_ARGUMENTS = 2 ERR_FILE_NOT_FOUND = 6 ERR_INVALID_JSON = 7 try: import requests except Exception: print("No module 'requests' found. 
def main(args):
    """
    Entry point of the integration script.

    Expected ``args`` layout (``sys.argv``): ``args[ALERT_INDEX]`` is the alert
    file, ``args[API_KEY_INDEX]`` the Keep API key, ``args[WEBHOOK_INDEX]`` the
    webhook URL, and an optional ``args[4] == "debug"`` enables debug output.

    The call is recorded in ``LOG_FILE`` with the API key masked, then the
    arguments are handed to ``process_args``. Exits with ``ERR_BAD_ARGUMENTS``
    when fewer than four arguments are given; any other exception is logged
    through ``debug`` and re-raised.
    """
    global debug_enabled
    try:
        # Copy of the arguments that is safe to log: the API key is a secret
        # and must never be written to the log file in clear text.
        loggable_args = [
            "***" if idx == API_KEY_INDEX else arg for idx, arg in enumerate(args)
        ]

        # Read arguments
        bad_arguments: bool = False
        if len(args) >= 4:
            msg = "{0} {1} {2} {3} {4}".format(
                loggable_args[1],
                loggable_args[2],
                loggable_args[3],
                loggable_args[4] if len(loggable_args) > 4 else "",
                loggable_args[5] if len(loggable_args) > 5 else "",
            )
            debug_enabled = len(args) > 4 and args[4] == "debug"
        else:
            msg = "# ERROR: Wrong arguments"
            bad_arguments = True

        # Logging the call
        with open(LOG_FILE, "a") as f:
            f.write(msg + "\n")

        if bad_arguments:
            debug("# ERROR: Exiting, bad arguments. Inputted: %s" % loggable_args)
            sys.exit(ERR_BAD_ARGUMENTS)

        # Core function (receives the real, unmasked arguments)
        process_args(args)

    except Exception as e:
        debug(str(e))
        raise
def send_msg(msg: str, url: str, api_key: str) -> None:
    """
    POST the alert payload to the Keep webhook.

    Args:
        msg: JSON-serialisable payload, sent as the request's JSON body.
        url: Keep webhook URL.
        api_key: Keep API key, sent in the ``X-API-KEY`` header.

    The request uses a 10 second timeout. The response status code and body
    are written through ``debug``.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "X-API-KEY": api_key,
    }
    res = requests.post(url, json=msg, headers=headers, timeout=10)
    # Log the actual response. ``res.json`` without a call is just the bound
    # method, and the raw text also works when the body is not JSON.
    debug("# Response received: %s %s" % (res.status_code, res.text))
def get_json_options(file_location: str) -> any:
    """
    Load the optional integration options file.

    Args:
        file_location: Path of the options file; may be empty when no options
            file was passed to the script.

    Returns:
        The parsed JSON content, or an empty dict when no path was given or
        the file does not exist.

    Exits with ``ERR_INVALID_JSON`` when the file exists but cannot be read or
    parsed.
    """
    # No options file was supplied: nothing to load (and nothing to complain about).
    if not file_location:
        return {}

    try:
        with open(file_location) as options_file:
            return json.load(options_file)
    except FileNotFoundError:
        debug("# JSON file for options %s doesn't exist" % file_location)
        return {}
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so KeyboardInterrupt and
        # SystemExit are not rewritten into an "invalid JSON" exit code.
        debug("Failed getting JSON options. Error: %s" % e)
        sys.exit(ERR_INVALID_JSON)
""" PROVIDER_DISPLAY_NAME = "Wazuh" PROVIDER_TAGS = ["alert"] PROVIDER_CATEGORY = ["Monitoring"] FINGERPRINT_FIELDS = ["id"] def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def validate_config(): """ No validation required for Wazuh provider. """ pass @staticmethod def _format_alert( event: dict, provider_instance: BaseProvider = None ) -> AlertDto | list[AlertDto]: alert = AlertDto( name=event["message"], description=event["description"], severity=event["severity"], # @TODO: handle alert resolve status=AlertStatus.FIRING, source=["wazuh"], lastReceived=event["created_at"], ) alert.fingerprint = WazuhProvider.get_alert_fingerprint( alert, fingerprint_fields=WazuhProvider.FINGERPRINT_FIELDS ) return alert if __name__ == "__main__": pass ================================================ FILE: keep/providers/webhook_provider/__init__.py ================================================ ================================================ FILE: keep/providers/webhook_provider/webhook_provider.py ================================================ """ WebhookProvider is a class that provides a way to notify a 3rd party service using a webhook. """ import base64 import copy import dataclasses import json import typing import pydantic import requests from requests.exceptions import JSONDecodeError from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider from keep.providers.models.provider_config import ProviderConfig, ProviderScope @pydantic.dataclasses.dataclass class WebhookProviderAuthConfig: """ Webhook authentication configuration. 
""" url: pydantic.AnyHttpUrl = dataclasses.field( metadata={ "required": True, "description": "Webhook URL", "validation": "any_http_url", } ) verify: bool = dataclasses.field( metadata={ "description": "Enable SSL verification", "hint": "Whether to verify the SSL certificate of the webhook URL or not", "type": "switch", }, default=True, ) method: typing.Literal["GET", "POST", "PUT", "DELETE"] = dataclasses.field( default="POST", metadata={ "required": True, "description": "HTTP method", "type": "select", "options": ["POST", "GET", "PUT", "DELETE"], }, ) http_basic_authentication_username: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "HTTP basic authentication - Username", "config_sub_group": "basic_authentication", "config_main_group": "authentication", }, ) http_basic_authentication_password: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "HTTP basic authentication - Password", "sensitive": True, "config_sub_group": "basic_authentication", "config_main_group": "authentication", }, ) api_key: typing.Optional[str] = dataclasses.field( default=None, metadata={ "description": "API key", "sensitive": True, "config_sub_group": "api_key", "config_main_group": "authentication", }, ) headers: typing.Optional[list[dict[str, str]]] = dataclasses.field( default=None, metadata={ "description": "Headers", "type": "form", }, ) class WebhookProvider(BaseProvider): """Enrich alerts with data from Webhook.""" BLACKLISTED_ENDPOINTS = [ "metadata.google.internal", "metadata.internal", "169.254.169.254", "localhost", "googleapis.com", ] PROVIDER_CATEGORY = ["Developer Tools"] PROVIDER_SCOPES = [ ProviderScope( name="send_webhook", mandatory=True, alias="Send Webhook", ) ] PROVIDER_TAGS = ["messaging"] PROVIDER_DISPLAY_NAME = "Webhook" def __init__( self, context_manager: ContextManager, provider_id: str, config: ProviderConfig ): super().__init__(context_manager, provider_id, config) def dispose(self): """ 
Nothing to do here. """ pass def validate_scopes(self) -> dict[str, bool | str]: validated_scopes = {} try: self.__validate_url(str(self.authentication_config.url)) validated_scopes["send_webhook"] = True except Exception as e: self.logger.exception("Error validating webhook URL") validated_scopes["send_webhook"] = str(e) return validated_scopes def validate_config(self): self.authentication_config = WebhookProviderAuthConfig( **self.config.authentication ) def __validate_url(self, url: str): """ Validate that the url is not blacklisted. """ for endpoint in WebhookProvider.BLACKLISTED_ENDPOINTS: if endpoint in url: raise Exception(f"URL {url} is blacklisted") def _notify( self, body: dict = None, params: dict = None, **kwargs, ): """ Send a HTTP request to the given url. """ self.query( url=self.authentication_config.url, method=self.authentication_config.method, http_basic_authentication_username=self.authentication_config.http_basic_authentication_username, http_basic_authentication_password=self.authentication_config.http_basic_authentication_password, api_key=self.authentication_config.api_key, headers=self.authentication_config.headers, body=body, params=params, **kwargs, ) def _query( self, url: str, method: typing.Literal["GET", "POST", "PUT", "DELETE"] = "POST", http_basic_authentication_username: str = None, http_basic_authentication_password: str = None, api_key: str = None, headers: str = None, body: dict = None, params: dict = None, fail_on_error: bool = True, **kwargs: dict, ) -> dict: """ Trigger a webhook with the given method, headers, body and params. """ self.__validate_url(url) if headers is None: headers = {} if isinstance(headers, str): headers = json.loads(headers) if isinstance(headers, list): try: headers = {header["key"]: header["value"] for header in headers} except Exception: raise Exception( "Headers must be a list of dictionaries with 'key' and 'value' fields, e.g. 
[{'key': 'Content-Type', 'value': 'application/json'}]" ) if body is None: body = {} if params is None: params = {} extra_args = copy.deepcopy(kwargs) verify = extra_args.pop("verify", self.authentication_config.verify) if http_basic_authentication_username and http_basic_authentication_password: credentials = f"{http_basic_authentication_username}:{http_basic_authentication_password}" encoded_credentials = base64.b64encode(credentials.encode("utf-8")).decode( "utf-8" ) headers["Authorization"] = f"Basic {encoded_credentials}" if api_key: headers["Authorization"] = f"Bearer {api_key}" self.logger.debug( f"Sending {method} request to {url}", extra={ "body": body, "headers": headers, "params": params, }, ) if method == "GET": response = requests.get( url, headers=headers, params=params, timeout=10, verify=verify, **extra_args, ) elif method == "POST": response = requests.post( url, headers=headers, json=body, timeout=10, verify=verify, **extra_args ) elif method == "PUT": response = requests.put( url, headers=headers, json=body, timeout=10, verify=verify, **extra_args ) elif method == "DELETE": response = requests.delete( url, headers=headers, json=body, timeout=10, verify=verify, **extra_args ) self.logger.debug( f"Trigger a webhook with {method} on {url}", extra={ "body": body, "headers": headers, "params": params, "status_code": response.status_code, }, ) result = {"status": response.ok, "status_code": response.status_code} try: body = response.json() except JSONDecodeError: body = response.text if fail_on_error: self.logger.info( f"Webhook response: {response.status_code} {response.reason}", extra={"body": body}, ) response.raise_for_status() result["body"] = body return result ================================================ FILE: keep/providers/websocket_provider/__init__.py ================================================ ================================================ FILE: keep/providers/websocket_provider/websocket_provider.py 
class WebsocketProvider(BaseProvider):
    """Enrich alerts with data from a websocket."""

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        # Connection opened by `_query`; None until a query has connected.
        self.ws = None

    def validate_config(self):
        """Build the (empty) authentication config from the provider config."""
        self.authentication_config = WebsocketProviderAuthConfig(
            **self.config.authentication
        )

    def _query(
        self, socket_url: str, timeout: int | None = None, data: str | None = None, **kwargs: dict
    ) -> dict:
        """
        Query a websocket endpoint.

        Args:
            socket_url (str): The websocket URL to query.
            timeout (int | None, optional): Connection Timeout. Defaults to None.
            data (str | None, optional): Data to send through the websocket. Defaults to None.

        Returns:
            dict: ``{"connection": bool, "data": <first received frame or None>,
            "error": <WebSocketException or None>}``.
        """
        # Close a connection left over from an earlier query so repeated
        # queries on the same provider do not leak sockets.
        self.dispose()
        try:
            self.ws = websocket.create_connection(socket_url, timeout=timeout)
            # NOTE(review): the first frame is read *before* `data` is sent,
            # which only works for servers that speak first. Order kept as-is.
            received = self.ws.recv()
            if data:
                self.ws.send(data)
            return {"connection": True, "data": received, "error": None}
        except websocket._exceptions.WebSocketException as e:
            self.logger.exception("Failed to connect to websocket")
            return {"connection": False, "data": None, "error": e}

    def dispose(self):
        """
        Dispose of the websocket connection.

        Does nothing when no connection was ever opened. Previously this
        called ``None.close()`` and logged a misleading warning.
        """
        connection = getattr(self, "ws", None)
        if connection is None:
            return
        try:
            connection.close()
        except Exception:
            self.logger.warning("Failed to close websocket connection")
        finally:
            self.ws = None
class YoutrackProvider(BaseProvider):
    """Create issues in YouTrack through its REST API."""

    PROVIDER_DISPLAY_NAME = "YouTrack"
    PROVIDER_TAGS = ["ticketing"]
    PROVIDER_CATEGORY = ["Ticketing"]
    PROVIDER_SCOPES = [
        ProviderScope(
            name="create_issue",
            mandatory=True,
            alias="Create Issue",
        )
    ]

    # Seconds to wait for YouTrack. `requests` has no default timeout, so a
    # stalled server would otherwise block the caller indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 10

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def dispose(self):
        """Nothing to release."""
        pass

    def validate_config(self):
        """
        Validates required configuration for Youtrack provider.
        """
        self.authentication_config = YoutrackProviderAuthConfig(
            **self.config.authentication
        )

    def validate_scopes(self):
        """
        Validate scopes for the provider

        Returns:
            dict: ``{"create_issue": True}`` when listing issues succeeds,
            otherwise ``{"create_issue": "<error text>"}``.
        """
        self.logger.info("Validating Youtrack provider scopes")
        try:
            response = requests.get(
                self._get_url("issues"),
                headers=self._get_auth_headers(),
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
        except Exception as e:
            self.logger.exception("Failed to validate scopes")
            return {"create_issue": str(e)}
        return {"create_issue": True}

    def _create_issue(self, summary="", description=""):
        """
        Create an issue in Youtrack.

        Args:
            summary: issue summary.
            description: issue description.

        Returns:
            The decoded JSON body of YouTrack's response.

        Raises:
            Exception: wrapping (and chained to) any request, HTTP-status or
                JSON decoding failure.
        """
        self.logger.info("Creating issue in Youtrack")
        try:
            data = {
                "summary": summary,
                "description": description,
                "project": {"id": self.authentication_config.project_id},
            }
            response = requests.post(
                self._get_url("issues"),
                headers=self._get_auth_headers(),
                json=data,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            # Decode once and reuse it for both the log entry and the result.
            issue = response.json()
            self.logger.info(
                "Successfully created issue in Youtrack", extra={"response": issue}
            )
        except Exception as e:
            self.logger.exception("Error creating issue in Youtrack")
            raise Exception(f"Error creating issue in Youtrack: {e}") from e
        return issue

    def _get_url(self, endpoint: str):
        """Return the REST API URL for ``endpoint`` under the configured host."""
        # Drop a trailing slash so the result never contains "//api/".
        base_url = str(self.authentication_config.host_url).rstrip("/")
        return f"{base_url}/api/{endpoint}"

    def _get_auth_headers(self):
        """
        Get authentication headers for Youtrack.
        """
        return {
            "Authorization": f"Bearer {self.authentication_config.permanent_token}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    def _notify(self, summary="", description=""):
        """Create a YouTrack issue and return the API response body."""
        self.logger.info("Creating issue in Youtrack")
        return self._create_issue(summary=summary, description=description)
@pydantic.dataclasses.dataclass
class ZabbixProviderAuthConfig:
    """
    Zabbix authentication configuration.

    Holds the values ZabbixProvider reads when it calls the Zabbix JSON-RPC
    API: the frontend base URL, the API token and the TLS verification flag.
    """

    # Base URL of the Zabbix frontend; the provider appends
    # "/api_jsonrpc.php" to it when sending requests.
    zabbix_frontend_url: pydantic.AnyHttpUrl = dataclasses.field(
        metadata={
            "required": True,
            "description": "Zabbix Frontend URL",
            "hint": "https://zabbix.example.com",
            "sensitive": False,
            "validation": "any_http_url",
        }
    )
    # API token; sent as a Bearer header and, for older Zabbix versions,
    # also in the request's "auth" field.
    auth_token: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Zabbix Auth Token",
            "hint": "Users -> Api tokens",
            "sensitive": True,
        }
    )
    # Passed to requests as `verify`; set to False for self-signed certificates.
    verify: bool = dataclasses.field(
        metadata={
            "description": "Verify SSL certificates",
            "hint": "Set to false to allow self-signed certificates",
            "sensitive": False,
        },
        default=True,
    )
documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/get", ), ProviderScope( name="mediatype.update", description="This method allows to update media types.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/mediatype/update", ), ProviderScope( name="problem.get", description="The method allows to retrieve problems.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/problem/get", ), ProviderScope( name="script.create", description="This method allows to create new scripts.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/script/create", ), ProviderScope( name="script.get", description="The method allows to retrieve scripts.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/script/get", ), ProviderScope( name="script.update", description="This method allows to update scripts.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/script/update", ), ProviderScope( name="user.get", description="This method allows to retrieve users.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/user/get", ), ProviderScope( name="user.update", description="This method allows to update users.", mandatory=True, mandatory_for_webhook=True, documentation_url="https://www.zabbix.com/documentation/current/en/manual/api/reference/user/update", ), ] PROVIDER_METHODS = [ ProviderMethod( name="Close Problem", func_name="close_problem", scopes=["event.acknowledge"], type="action", ), ProviderMethod( name="Change Severity", func_name="change_severity", 
def close_problem(self, id: str):
    """
    Close a Zabbix problem through ``event.acknowledge`` (action 1).

    Reference:
    https://www.zabbix.com/documentation/current/en/manual/api/reference/event/acknowledge

    Args:
        id (str): The problem id.
    """
    log = self.logger
    log.info(f"Closing problem {id}")
    acknowledge_params = {"eventids": id, "action": 1}
    self.__send_request("event.acknowledge", acknowledge_params)
    log.info(f"Closed problem {id}")
""" self.logger.info(f"Unsuppressing problem {id}") self.__send_request("event.acknowledge", {"eventids": id, "action": 64}) self.logger.info(f"Unsuppressed problem {id}") def surrpress_problem( self, id: str, suppress_until: datetime.datetime = datetime.datetime.now() + datetime.timedelta(days=1), ): """ Suppress a problem. Args: id (str): The problem id. suppress_until (datetime.datetime): The datetime to suppress the problem until. """ self.logger.info(f"Suppressing problem {id} until {suppress_until}") if isinstance(suppress_until, str): suppress_until = datetime.datetime.fromisoformat(suppress_until) self.__send_request( "event.acknowledge", { "eventids": id, "action": 32, "suppress_until": int(suppress_until.timestamp()), }, ) self.logger.info(f"Suppressed problem {id} until {suppress_until}") def acknowledge_problem(self, id: str): """ Acknowledge a problem. Args: id (str): The problem id. """ self.logger.info(f"Acknowledging problem {id}") self.__send_request("event.acknowledge", {"eventids": id, "action": 2}) self.logger.info(f"Acknowledged problem {id}") def unacknowledge_problem(self, id: str): """ Unacknowledge a problem. Args: id (str): The problem id. """ self.logger.info(f"Unacknowledging problem {id}") self.__send_request("event.acknowledge", {"eventids": id, "action": 16}) self.logger.info(f"Unacknowledged problem {id}") def add_message_to_problem(self, id: str, message_text: str): """ Add a message to a problem. Args: id (str): The problem id. message_text (str): The message text. """ self.logger.info( f"Adding message to problem {id}", extra={"zabbix_message": message_text} ) self.__send_request( "event.acknowledge", {"eventids": id, "message": message_text, "action": 4}, ) self.logger.info( f"Added message to problem {id}", extra={"zabbix_message": message_text} ) def get_problem_messages(self, id: str): """ Get the messages from a problem. Args: id (str): The problem id. 
""" problem = self.__send_request( "problem.get", {"eventids": id, "selectAcknowledges": "extend"} ) messages = [] problems = problem.get("result", []) if not problems: return messages for acknowledge in problem.get("result", [])[0].get("acknowledges", []): if acknowledge.get("action") == "4": time = datetime.datetime.fromtimestamp(int(acknowledge.get("clock"))) messages.append(f'{time}: {acknowledge.get("message")}') return messages def change_severity( self, id: str, new_severity: str, ): """ Change the severity of a problem. Args: id (str): The problem id. new_severity (str): The new severity. Can be an integer string (0-5) or severity name: - "0" or "Not classified" - "1" or "Information" - "2" or "Warning" - "3" or "Average" - "4" or "High" - "5" or "Disaster" """ # Validate and convert input severity = 0 # Handle numeric string input if new_severity.isdigit(): severity_int = int(new_severity) if 0 <= severity_int <= 5: severity = severity_int else: raise ValueError(f"Invalid severity number: {new_severity}. Must be between 0-5.") else: # Handle string input severity_lower = new_severity.lower().strip() if severity_lower in ZabbixProvider.SEVERITY_NAME_TO_ID_MAP: severity = ZabbixProvider.SEVERITY_NAME_TO_ID_MAP[severity_lower] else: valid_severities = list(ZabbixProvider.SEVERITY_NAME_TO_ID_MAP.keys()) + ["0", "1", "2", "3", "4", "5"] raise ValueError(f"Invalid severity: {new_severity}. Valid values are: {valid_severities}") self.__send_request( "event.acknowledge", {"eventids": id, "severity": severity, "action": 8} ) def validate_config(self): """ Validates required configuration for Zabbix provider. 
""" self.authentication_config = ZabbixProviderAuthConfig( **self.config.authentication ) def validate_scopes(self) -> dict[str, bool | str]: validated_scopes = {} for scope in self.PROVIDER_SCOPES: try: self.__send_request(scope.name) except Exception as e: # This is a hack to check if the error is related to permissions error = getattr(e, "message", e.args[0]) # If we got here, it means it's an exception from Zabbix if "permission" in str(error) or "not authorized" in str(error).lower(): validated_scopes[scope.name] = "Permission denied" continue else: if error and any(phrase in error.lower() for phrase in [ "invalid parameter", "incorrect arguments" ]): # This is OK, it means the request is broken but we have access to the endpoint. pass else: validated_scopes[scope.name] = error continue validated_scopes[scope.name] = True return validated_scopes def __send_request( self, method: str, params: dict = None, include_auth: bool = True ): """ Send a request to Zabbix API. Args: method (str): The method to call. params (dict): The parameters to send. Returns: dict: The response from Zabbix API. """ url = f"{self.authentication_config.zabbix_frontend_url}/api_jsonrpc.php" headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.authentication_config.auth_token}", } data = { "jsonrpc": "2.0", "method": method, "params": params or {}, "id": random.randint(1000, 2000), } # in zabbix >=7.2 it makes requests fail. 
if include_auth: # zabbix < 6.4 compatibility data["auth"] = f"{self.authentication_config.auth_token}" response = requests.post( url, json=data, headers=headers, verify=self.authentication_config.verify ) try: response.raise_for_status() except requests.HTTPError: self.logger.exception( "Error while sending request to Zabbix API", extra={ "response": response.text, "tenant_id": self.context_manager.tenant_id, }, ) raise response_json = response.json() if "error" in response_json: self.logger.error( "Error while querying zabbix", extra={ "tenant_id": self.context_manager.tenant_id, "response_json": response_json, }, ) error_data = response_json.get("error", {}).get("data") # Try to send the request without auth, probably zabbix >=7.2 if 'unexpected parameter "auth".' in error_data and include_auth: return self.__send_request(method, params, include_auth=False) raise ProviderMethodException(error_data) return response_json @staticmethod def _convert_severity(severity: Union[int, str]) -> AlertSeverity: """ Convert Zabbix severity to Keep AlertSeverity. Args: severity (Union[int, str]): The severity value. 
Can be: - Integer (0-5): 0=Not classified, 1=Information, 2=Warning, 3=Average, 4=High, 5=Disaster - String: "not classified", "information", "warning", "average", "high", "disaster" Returns: AlertSeverity: The corresponding Keep AlertSeverity """ if isinstance(severity, int): return ZabbixProvider.SEVERITIES_MAP.get(severity, AlertSeverity.INFO) # Handle string input if isinstance(severity, str): severity_stripped = severity.strip() # First, check if it's a numeric string if severity_stripped.isdigit(): severity_int = int(severity_stripped) if 0 <= severity_int <= 5: return ZabbixProvider.SEVERITIES_MAP.get(severity_int, AlertSeverity.INFO) # If not a valid integer string, handle as text severity_lower = severity_stripped.lower() severity_int = ZabbixProvider.SEVERITY_NAME_TO_ID_MAP.get(severity_lower, 1) # Default to Information return ZabbixProvider.SEVERITIES_MAP.get(severity_int, AlertSeverity.INFO) # Fallback for any other type return AlertSeverity.INFO def _get_alerts(self) -> list[AlertDto]: # https://www.zabbix.com/documentation/current/en/manual/api/reference/problem/get time_from = int( (datetime.datetime.now() - datetime.timedelta(days=7)).timestamp() ) problems = self.__send_request( "problem.get", { "recent": False, "selectSuppressionData": "extend", "time_from": time_from, }, ) formatted_alerts = [] for problem in problems.get("result", []): name = problem.pop("name") problem.pop("source") environment = problem.pop("environment", None) if environment is None: environment = "unknown" severity = self._convert_severity(problem.pop("severity", 1)) status = ZabbixProvider.STATUS_MAP.get( problem.pop("status", "").lower(), AlertStatus.FIRING ) formatted_alerts.append( AlertDto( id=problem.pop("eventid"), name=name, status=status, lastReceived=datetime.datetime.fromtimestamp( int(problem.get("clock")) + 10 # to override pushed problems, 10 is just random, could probably be 1 ).isoformat(), source=["zabbix"], message=name, severity=severity, 
environment=environment, problem=problem, ) ) return formatted_alerts def setup_webhook( self, tenant_id: str, keep_api_url: str, api_key: str, setup_alerts: bool = True ): # Copied from https://git.zabbix.com/projects/ZBX/repos/zabbix/browse/templates/media/ilert/media_ilert.yaml?at=release%2F6.4 # Based on @SomeAverageDev hints and suggestions ;) Thanks! # TODO: this can be done once when loading the provider file self.logger.info("Reading webhook JS script file") __location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__)) ) with open( os.path.join( __location__, ZabbixProvider.KEEP_ZABBIX_WEBHOOK_SCRIPT_FILENAME ) ) as f: script = f.read() self.logger.info("Creating or updating webhook") script_name = ( f"{ZabbixProvider.KEEP_ZABBIX_WEBHOOK_INTEGRATION_NAME}-{self.provider_id}" ) self.logger.info("Getting existing scripts") existing_scripts = self.__send_request( "script.get", {"output": ["scriptid", "name"]}, ) self.logger.info("Got existing scripts") scripts = [ mt for mt in existing_scripts.get("result", []) if mt["name"] == script_name ] parameters = [ {"name": "keepApiKey", "value": api_key}, {"name": "keepApiUrl", "value": keep_api_url}, {"name": "id", "value": "{EVENT.ID}"}, {"name": "triggerId", "value": "{TRIGGER.ID}"}, {"name": "lastReceived", "value": "{DATE} {TIME}"}, {"name": "message", "value": "{ALERT.MESSAGE}"}, {"name": "name", "value": "{EVENT.NAME}"}, {"name": "service", "value": "{HOST.HOST}"}, {"name": "severity", "value": "{EVENT.SEVERITY}"}, {"name": "status", "value": "{EVENT.STATUS}"}, {"name": "tags", "value": "{EVENT.TAGSJSON}"}, {"name": "description", "value": "{TRIGGER.DESCRIPTION}"}, {"name": "time", "value": "{EVENT.TIME}"}, {"name": "value", "value": "{EVENT.VALUE}"}, {"name": "host_ip", "value": "{HOST.IP}"}, {"name": "host_name", "value": "{HOST.NAME}"}, {"name": "url", "value": "{$ZABBIX.URL}"}, {"name": "update_action", "value": "{EVENT.UPDATE.ACTION}"}, {"name": "event_ack", "value": 
"{EVENT.ACK.STATUS}"}, ] if scripts: existing_script = scripts[0] self.logger.info("Updating existing script") script_id = str(existing_script["scriptid"]) self.__send_request( "script.update", { "scriptid": script_id, "command": script, "type": "5", "timeout": "30s", "parameters": parameters, "scope": "1", "description": "Keep Zabbix Webhook", }, ) self.logger.info("Updated script") else: self.logger.info("Creating script") params = { "name": script_name, "parameters": parameters, "command": script, "type": "5", "timeout": "30s", "scope": "1", "description": "Keep Zabbix Webhook", } response_json = self.__send_request("script.create", params) script_id = str(response_json.get("result", {}).get("scriptids", [])[0]) self.logger.info("Created script") action_name = f"keep-{self.provider_id}" existing_actions = self.__send_request( "action.get", {"output": ["name"]}, ) action_exists = any( [ action for action in existing_actions.get("result", []) if action["name"] == action_name ] ) if not action_exists: self.logger.info("Creating action") payload = { "eventsource": "0", "name": action_name, "status": "0", "esc_period": "1h", "operations": { "0": { "operationtype": "1", "opcommand_hst": {"0": {"hostid": "0"}}, "opcommand": {"scriptid": script_id}, } }, "recovery_operations": { "0": { "operationtype": "1", "opcommand_hst": {"0": {"hostid": "0"}}, "opcommand": {"scriptid": script_id}, } }, "update_operations": { "0": { "operationtype": "1", "opcommand_hst": {"0": {"hostid": "0"}}, "opcommand": {"scriptid": script_id}, } }, "pause_symptoms": "1", "pause_suppressed": "1", "notify_if_canceled": "1", } try: action_response = self.__send_request( "action.create", payload, ) except Exception: payload.pop("pause_symptoms", None) action_response = self.__send_request( "action.create", payload, ) self.logger.info( "Created action", extra={"action_response": action_response} ) else: self.logger.info("Action already exists") self.logger.info("Finished installing webhook") 
@staticmethod
def _format_alert(
    event: dict, provider_instance: "BaseProvider" = None
) -> AlertDto:
    """
    Convert a Zabbix webhook payload into an AlertDto.

    The payload is mutated: the keys that are mapped explicitly ("tags",
    "url", "service", "severity", "status", "lastReceived") are popped and
    everything that is left is forwarded to AlertDto as-is.

    Args:
        event (dict): Payload sent by the Keep webhook script.
        provider_instance: Not used by this implementation.

    Returns:
        AlertDto: The formatted alert.

    Raises:
        ValueError: If "lastReceived" is present but is not in the
            "%Y.%m.%d %H:%M:%S" format.
    """
    tags_raw = event.pop("tags", "[]")
    try:
        tags = {tag.get("tag"): tag.get("value") for tag in json.loads(tags_raw)}
    except (json.JSONDecodeError, TypeError, AttributeError):
        # Invalid JSON, a non-string payload, or JSON that is not a list of
        # objects: keep the alert and drop the tags instead of failing.
        logger.error("Failed to extract Zabbix tags", extra={"tags_raw": tags_raw})
        tags = {}

    # "environment" may be absent from the tags or present with a None value.
    environment = tags.pop("environment", "unknown")
    if environment is None:
        environment = "unknown"

    event_id = event.get("id")
    trigger_id = event.get("triggerId")
    zabbix_url = event.pop("url", None)
    # The webhook installed by setup_webhook sends "host_name"/"host_ip";
    # the camelCase keys are still honoured for payloads that use them.
    hostname = (
        event.pop("service", None)
        or event.get("hostName")
        or event.get("host_name")
    )
    ip_address = event.get("hostIp") or event.get("host_ip")

    if zabbix_url == "{$ZABBIX.URL}":
        # This means user did not configure $ZABBIX.URL in Zabbix probably
        zabbix_url = None

    url = None
    if event_id and trigger_id and zabbix_url:
        url = (
            f"{zabbix_url}/tr_events.php?triggerid={trigger_id}&eventid={event_id}"
        )

    severity = ZabbixProvider._convert_severity(event.pop("severity", 1))

    status = (event.pop("status", "") or "").lower()
    status = ZabbixProvider.STATUS_MAP.get(status, AlertStatus.FIRING)

    last_received = event.pop("lastReceived", None)
    if not last_received or last_received == "{DATE} {TIME}":
        # Either the field is missing, or the macros were not expanded
        # (test message): use the current time. A missing field used to be
        # defaulted to an ISO string and then parsed with the Zabbix format
        # below, which always raised ValueError.
        last_received = datetime.datetime.now(tz=datetime.timezone.utc).isoformat()
    else:
        last_received = datetime.datetime.strptime(
            last_received, "%Y.%m.%d %H:%M:%S"
        ).isoformat()

    # The update action overrides the status derived from STATUS_MAP.
    update_action = event.get("update_action", "") or ""
    if update_action == "acknowledged":
        status = AlertStatus.ACKNOWLEDGED
    elif "suppressed" in update_action:
        status = AlertStatus.SUPPRESSED

    return AlertDto(
        **event,
        environment=environment,
        pushed=True,
        source=["zabbix"],
        severity=severity,
        status=status,
        url=url,
        lastReceived=last_received,
        tags=tags,
        hostname=hostname,
        service=hostname,
        ip_address=ip_address,
    )
@pydantic.dataclasses.dataclass
class ZendeskProviderAuthConfig:
    """Authentication settings for the Zendesk provider."""

    # Zendesk API key; required and flagged as sensitive in the metadata.
    api_key: str = dataclasses.field(
        metadata={"required": True, "description": "Zendesk API key", "sensitive": True}
    )
    # Zendesk domain; required, the metadata hint is "yourcompany.zendesk.com".
    zendesk_domain: str = dataclasses.field(
        metadata={
            "required": True,
            "description": "Zendesk domain",
            "sensitive": False,
            "hint": "yourcompany.zendesk.com",
        }
    )
    # URL for creating new tickets; optional, defaults to an empty string.
    ticket_creation_url: str = dataclasses.field(
        metadata={
            "required": False,
            "description": "URL for creating new tickets",
            "sensitive": False,
            "hint": "https://yourcompany.zendesk.com/agent/filters/new",
        },
        default="",
    )
class ZendutyProvider(BaseProvider):
    """Create incident in Zenduty."""

    PROVIDER_DISPLAY_NAME = "Zenduty"
    PROVIDER_CATEGORY = ["Incident Management"]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)

    def validate_config(self):
        """Build the typed authentication config from the raw provider config."""
        self.authentication_config = ZendutyProviderAuthConfig(
            **self.config.authentication
        )

    def dispose(self):
        """
        No need to dispose of anything, so just do nothing.
        """
        pass

    def _notify(
        self,
        title: str = "",
        summary: str = "",
        service: str = "",
        user: str = "",
        policy: str = "",
        **kwargs: dict
    ):
        """
        Create incident Zenduty using the Zenduty API

        https://github.com/Zenduty/zenduty-python-sdk

        Args:
            title (str): Title of the incident
            summary (str): Summary of the incident
            service (str): Service ID in Zenduty
            user (str): User ID in Zenduty
            policy (str): Policy ID in Zenduty

        Raises:
            ProviderException: If a required argument is missing or Zenduty
                does not answer with HTTP 201.
        """
        self.logger.debug("Notifying incident to Zenduty")

        if not service:
            raise ProviderException("Service is required")
        if not title or not summary:
            raise ProviderException("Title and summary are required")

        body = {
            "service": service,
            "policy": policy,
            "user": user,
            "title": title,
            "summary": summary,
        }
        # https://github.com/Zenduty/zenduty-python-sdk/blob/master/zenduty/api_client.py#L11
        headers = {
            "Authorization": "Token " + self.authentication_config.api_key,
        }

        resp = requests.post(
            url="https://www.zenduty.com/api/incidents/", json=body, headers=headers
        )
        # requests exposes the HTTP status as `status_code`; the previous
        # `assert resp.status == 201` raised AttributeError on every call and
        # would have been stripped entirely under `python -O`.
        if resp.status_code != 201:
            raise ProviderException(
                f"Failed to create Zenduty incident: {resp.status_code} - {resp.text}"
            )

        self.logger.debug("Alert message notified to Zenduty")
class ZoomChatProvider(BaseProvider):
    """Send alert message to Zoom Chat using the Incoming Webhook application."""

    PROVIDER_DISPLAY_NAME = "Zoom Chat"
    PROVIDER_TAGS = ["messaging"]
    PROVIDER_CATEGORY = ["Communication"]
    BASE_URL = "https://api.zoom.us/v2"

    PROVIDER_SCOPES = [
        ProviderScope(
            name="user:read:user:admin",
            description="View a Zoom user's details",
            mandatory=False,
            alias="View a Zoom user",
        ),
        ProviderScope(
            name="user:read:list_users:admin",
            description="List Zoom users",
            mandatory=False,
            alias="List Zoom users",
        ),
    ]

    # Sidebar color per severity; any other severity uses the default color.
    _SIDEBAR_COLORS = {
        "critical": "#EF4444",
        "high": "#F97316",
        "warning": "#EAB308",
        "low": "#10B981",
    }
    _DEFAULT_SIDEBAR_COLOR = "#3B82F6"

    def __init__(
        self,
        context_manager: ContextManager,
        provider_id: str,
        config: ProviderConfig,
    ):
        super().__init__(context_manager, provider_id, config)
        # OAuth token, fetched lazily by _get_headers and cleared by dispose.
        self.access_token = None

    def validate_config(self):
        """
        Validates required configuration for Zoom Chat provider.

        Raises:
            Exception: If the webhook URL or the authorization token is empty.
        """
        self.authentication_config = ZoomChatProviderAuthConfig(
            **self.config.authentication
        )
        # Both values are needed to post a message, so reject the config when
        # EITHER is missing (the previous `and` only rejected it when both were).
        if (
            not self.authentication_config.webhook_url
            or not self.authentication_config.authorization_token
        ):
            raise Exception(
                "Zoom Incoming Webhook URL and authorization token are required."
            )

    def _get_access_token(self) -> str:
        """
        Get OAuth access token from Zoom.

        Returns:
            str: Access token

        Raises:
            ProviderException: If the token request fails or is rejected.
        """
        try:
            token_url = "https://zoom.us/oauth/token"
            auth = HTTPBasicAuth(
                self.authentication_config.client_id,
                self.authentication_config.client_secret,
            )
            data = {
                "grant_type": "account_credentials",
                "account_id": self.authentication_config.account_id,
            }

            response = requests.post(token_url, auth=auth, data=data)
            if response.status_code != 200:
                raise ProviderException(
                    f"Failed to get access token: {response.json()}"
                )

            return response.json()["access_token"]
        except ProviderException:
            # Already carries the final message; do not prefix it a second time.
            raise
        except Exception as e:
            raise ProviderException(f"Failed to get access token: {str(e)}")

    def _get_headers(self) -> dict:
        """
        Get headers for API requests.

        The access token is requested on first use and then reused until
        dispose() clears it.

        Returns:
            dict: Headers including authorization
        """
        if not self.access_token:
            self.access_token = self._get_access_token()

        return {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
        }

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Validate scopes for the provider.

        Returns:
            dict: For each scope name, True when listing users succeeds,
                otherwise a string describing the problem.
        """
        # NOTE(review): this check only triggers when one of the OAuth values
        # is empty — confirm it behaves as intended with the auth config's
        # default values.
        if not all(
            [
                self.authentication_config.account_id,
                self.authentication_config.client_id,
                self.authentication_config.client_secret,
            ]
        ):
            return {
                "user:read:user:admin": "OAuth credentials not configured",
                "user:read:list_users:admin": "OAuth credentials not configured",
            }

        try:
            # Test API access by listing users
            response = requests.get(
                f"{self.BASE_URL}/users", headers=self._get_headers()
            )

            if response.status_code != 200:
                raise Exception(f"Failed to validate scopes: {response.json()}")

            return {
                "user:read:user:admin": True,
                "user:read:list_users:admin": True,
            }
        except Exception as e:
            self.logger.exception("Failed to validate scopes")
            return {
                "user:read:user:admin": str(e),
                "user:read:list_users:admin": str(e),
            }

    def dispose(self):
        """Clean up resources."""
        self.access_token = None

    def _get_zoom_userinfo(self, email: str) -> dict:
        """
        Get a user's information from Zoom API using email address.

        Raises:
            ProviderException: If the request fails or the answer is not HTTP 200.
        """
        try:
            response = requests.get(
                f"{self.BASE_URL}/users/{email}",
                headers=self._get_headers(),
            )

            if response.status_code == 200:
                self.logger.info("User details retrieved successfully")
                return response.json()
            else:
                raise ProviderException(
                    f"Failed to retrieve user info for {email}: {response.status_code} - {response.text}"
                )
        except requests.exceptions.RequestException as e:
            raise ProviderException(f"Failed to retrieve user info: {str(e)}")

    def _notify(
        self,
        severity: str = "info",
        title: Optional[str] = "",
        message: str = "",
        tagged_users: Optional[str] = "",
        details_url: Optional[str] = "",
        **kwargs: dict,
    ) -> str:
        """
        Send a message to Zoom Chat using a Incoming Webhook URL.

        Args:
            title (str): The title to use for the message. (optional)
            message (str): The text message to send. Supports Markdown formatting.
            tagged_users (str): Comma-separated Zoom user email addresses to tag. (optional)
            severity (str): The severity of the alert.
            details_url (str): A URL linking to more information. (optional)

        Returns:
            str: A confirmation message.

        Raises:
            ProviderException: If no message is given.
            requests.exceptions.RequestException: If the message could not be
                sent successfully after all retries.
        """
        self.logger.debug("Sending message to Zoom Chat Incoming Webhook")

        webhook_url = self.authentication_config.webhook_url
        authorization_token = self.authentication_config.authorization_token

        if not message:
            raise ProviderException("Message is required")

        def send_with_retries(url, body, headers, retries=3):
            # Returns only an HTTP 200 response; anything else is retried
            # (with a one second pause) and finally raised.
            for attempt in range(retries):
                try:
                    resp = requests.post(url, json=body, headers=headers)
                    if resp.status_code == http.HTTPStatus.OK:
                        return resp
                    self.logger.warning(
                        f"Attempt {attempt + 1} failed with status code {resp.status_code}"
                    )
                except requests.exceptions.RequestException as e:
                    self.logger.error(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(1)
            raise requests.exceptions.RequestException(
                f"Failed to notify message after {retries} attempts"
            )

        payload = {
            "content": {
                "settings": {
                    "default_sidebar_color": self._SIDEBAR_COLORS.get(
                        severity, self._DEFAULT_SIDEBAR_COLOR
                    )
                },
                "body": [
                    {
                        "type": "message",
                        "is_markdown_support": "true",
                        "text": message,
                    }
                ],
            }
        }

        # Conditionally add a title entry
        if title:
            payload["content"]["head"] = {
                "text": title,
                "style": {"bold": "true"},
            }

        # Conditionally add the "View More Details" entry
        if details_url:
            payload["content"]["body"].append(
                {"type": "message", "text": "View More Details", "link": details_url}
            )

        # Conditionally add tagged users
        if tagged_users:
            # Ignore empty entries such as the one left by a trailing comma.
            tagged_users_list = [
                user.strip() for user in tagged_users.split(",") if user.strip()
            ]
            tagged_user_jid_list = []

            for user in tagged_users_list:
                try:
                    user_data = self._get_zoom_userinfo(user)
                    jid = user_data.get("jid")
                    display_name = user_data.get("display_name")
                    if jid and display_name:
                        # Zoom markdown mention: <!jid|display name>. An empty
                        # string here would post a blank line and tag nobody.
                        tagged_user_jid_list.append(f"<!{jid}|{display_name}>")
                except ProviderException as e:
                    self.logger.warning(f"Failed to get info for user {user}: {e}")
                    continue

            if tagged_user_jid_list:
                tagged_user_string = " ".join(tagged_user_jid_list)
                # Mentions go first so they appear above the message text.
                payload["content"]["body"].insert(
                    0,
                    {
                        "type": "message",
                        "is_markdown_support": True,
                        "text": tagged_user_string,
                    },
                )

        request_headers = {
            "Authorization": authorization_token,
            "Content-Type": "application/json",
        }

        # send_with_retries raises unless Zoom answered HTTP 200, so no
        # further status check is needed here.
        send_with_retries(webhook_url, body=payload, headers=request_headers)

        self.logger.debug("Alert message sent to Zoom Chat successfully")
        return "Alert message sent to Zoom Chat successfully"
class ZoomProvider(BaseProvider):
    """Create and manage Zoom meetings using REST API."""

    PROVIDER_DISPLAY_NAME = "Zoom"
    PROVIDER_CATEGORY = ["Communication", "Video Conferencing"]
    BASE_URL = "https://api.zoom.us/v2"

    PROVIDER_SCOPES = [
        ProviderScope(
            name="create_meeting",
            description="Create a new Zoom meeting",
            mandatory=True,
            alias="Create Meeting",
        )
    ]

    def __init__(
        self, context_manager: ContextManager, provider_id: str, config: ProviderConfig
    ):
        super().__init__(context_manager, provider_id, config)
        # OAuth token, fetched lazily by _get_headers and cleared by dispose.
        self.access_token = None

    def validate_config(self):
        """Validates required configuration for Zoom provider."""
        self.authentication_config = ZoomProviderAuthConfig(
            **self.config.authentication
        )

    def _get_access_token(self) -> str:
        """
        Get OAuth access token from Zoom.

        Returns:
            str: Access token

        Raises:
            ProviderException: If the token request fails or is rejected.
        """
        try:
            token_url = "https://zoom.us/oauth/token"
            auth = HTTPBasicAuth(
                self.authentication_config.client_id,
                self.authentication_config.client_secret,
            )
            data = {
                "grant_type": "account_credentials",
                "account_id": self.authentication_config.account_id,
            }

            response = requests.post(token_url, auth=auth, data=data)
            if response.status_code != 200:
                raise ProviderException(
                    f"Failed to get access token: {response.json()}"
                )

            return response.json()["access_token"]
        except ProviderException:
            # Already carries the final message; do not prefix it a second time.
            raise
        except Exception as e:
            raise ProviderException(f"Failed to get access token: {str(e)}")

    def _get_headers(self) -> dict:
        """
        Get headers for API requests.

        The access token is requested on first use and then reused until
        dispose() clears it.

        Returns:
            dict: Headers including authorization
        """
        if not self.access_token:
            self.access_token = self._get_access_token()

        return {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
        }

    def validate_scopes(self) -> dict[str, bool | str]:
        """
        Validate scopes for the provider.

        Returns:
            dict: {"create_meeting": True} when listing users succeeds,
                otherwise the error text as the value.
        """
        try:
            # Test API access by listing users
            response = requests.get(
                f"{self.BASE_URL}/users", headers=self._get_headers()
            )

            if response.status_code != 200:
                raise Exception(f"Failed to validate scopes: {response.json()}")

            return {"create_meeting": True}
        except Exception as e:
            self.logger.exception("Failed to validate scopes")
            return {"create_meeting": str(e)}

    def dispose(self):
        """Clean up resources."""
        self.access_token = None

    def _create_meeting(
        self,
        topic: str,
        start_time: datetime,
        duration: int = 60,
        timezone: str = "UTC",
        record_meeting: bool = False,
        host_email: Optional[str] = None,
    ) -> dict:
        """
        Create a new Zoom meeting.

        Args:
            topic: Meeting topic/name
            start_time: Meeting start time. Timezone-aware values are
                converted to UTC; naive values are sent as given and are
                therefore interpreted as UTC.
            duration: Meeting duration in minutes
            timezone: Meeting timezone
            record_meeting: Whether to automatically record the meeting
            host_email: Email of the meeting host (optional)

        Returns:
            dict: Meeting details including join URL

        Raises:
            ProviderException: If the host lookup or the meeting creation fails.
        """
        # Local import: the `timezone` parameter shadows any module-level name.
        from datetime import timezone as _dt_timezone

        try:
            # The trailing "Z" tells Zoom the value is UTC, so an aware
            # datetime in another zone must be converted first.
            if start_time.tzinfo is not None:
                start_time = start_time.astimezone(_dt_timezone.utc)
            start_time_str = start_time.strftime("%Y-%m-%dT%H:%M:%SZ")

            meeting_settings = {
                "auto_recording": "cloud" if record_meeting else "none",
            }

            meeting_data = {
                "topic": topic,
                "type": 2,  # Scheduled meeting
                "start_time": start_time_str,
                "duration": duration,
                "timezone": timezone,
                "settings": meeting_settings,
            }

            # If host email provided, get their user ID first
            if host_email:
                users_response = requests.get(
                    f"{self.BASE_URL}/users/{host_email}",
                    headers=self._get_headers(),
                )
                if users_response.status_code != 200:
                    raise ProviderException(
                        f"Failed to find host: {users_response.json()}"
                    )

                user = users_response.json()
                user_id = user.get("id")
                if not user_id:
                    raise ProviderException(f"Host not found: {host_email}")

                create_url = f"{self.BASE_URL}/users/{user_id}/meetings"
            else:
                # Create meeting under authenticated user
                create_url = f"{self.BASE_URL}/users/me/meetings"

            response = requests.post(
                create_url, headers=self._get_headers(), data=json.dumps(meeting_data)
            )

            if response.status_code != 201:
                raise ProviderException(f"Failed to create meeting: {response.json()}")

            meeting = response.json()
            auto_recording = meeting.get("settings", {}).get("auto_recording")
            if record_meeting and auto_recording != "cloud":
                # Zoom API failed to set auto recording
                self.logger.warning(
                    "Failed to set auto recording - do you have basic plan?",
                    extra={"auto_recording": auto_recording},
                )

            self.logger.info(
                "Meeting created successfully",
                extra={"meeting_id": meeting.get("id"), "recording": auto_recording},
            )
            return meeting

        except ProviderException:
            # Raised above with a complete message; re-wrapping it produced
            # "Failed to create meeting: Failed to create meeting: ...".
            raise
        except Exception as e:
            raise ProviderException(f"Failed to create meeting: {str(e)}")

    def _notify(
        self,
        topic: str,
        start_time: datetime = None,
        duration: int = 60,
        timezone: str = "UTC",
        record_meeting: bool = False,
        host_email: Optional[str] = None,
    ) -> dict:
        """
        Create a new Zoom meeting (notification endpoint).

        Args:
            topic: Meeting topic/name
            start_time: Meeting start time; defaults to the current time.
            duration: Meeting duration in minutes
            timezone: Meeting timezone
            record_meeting: Whether to automatically record the meeting
            host_email: Email of the meeting host (optional)

        Returns:
            dict: Meeting details including join URL

        Raises:
            ProviderException: If the meeting could not be created.
        """
        # Local import: the `timezone` parameter shadows any module-level name.
        from datetime import timezone as _dt_timezone

        try:
            self.logger.info(f"Creating new Zoom meeting: {topic}")

            if not start_time:
                # Aware "now": a naive local time would be sent to Zoom
                # labelled as UTC and be off by the host's UTC offset.
                start_time = datetime.now(_dt_timezone.utc)

            meeting = self._create_meeting(
                topic=topic,
                start_time=start_time,
                duration=duration,
                timezone=timezone,
                record_meeting=record_meeting,
                host_email=host_email,
            )

            self.logger.info(
                "Meeting created successfully", extra={"meeting_id": meeting.get("id")}
            )
            return meeting

        except ProviderException:
            # _create_meeting already produced the final message.
            raise
        except Exception as e:
            raise ProviderException(f"Failed to create meeting: {str(e)}")
# Shahar: this is a performance enhancement, see
# https://github.com/cloud-custodian/cel-python/issues/68
# The assignments below replace __repr__ (and, for Activation, __str__) on
# several celpy classes with functions that return a constant empty string.
# This is a process-wide change applied when this module is imported, so these
# objects will print as "" everywhere, including in logs and debuggers.
celpy.evaluation.Referent.__repr__ = lambda self: ""
celpy.evaluation.NameContainer.__repr__ = lambda self: ""
celpy.Activation.__repr__ = lambda self: ""
celpy.Activation.__str__ = lambda self: ""
celpy.celtypes.MapType.__repr__ = lambda self: ""
celpy.celtypes.DoubleType.__repr__ = lambda self: ""
celpy.celtypes.BytesType.__repr__ = lambda self: ""
celpy.celtypes.IntType.__repr__ = lambda self: ""
celpy.celtypes.UintType.__repr__ = lambda self: ""
celpy.celtypes.ListType.__repr__ = lambda self: ""
celpy.celtypes.StringType.__repr__ = lambda self: ""
celpy.celtypes.TimestampType.__repr__ = lambda self: ""
celpy.c7nlib.C7NContext.__repr__ = lambda self: ""
celpy.celparser.Tree.__repr__ = lambda self: ""
This might mean there's a blank space in the field name", extra={"alert_id": event.id, "payload": event.dict()}, ) continue except Exception: self.logger.exception( f"Failed to evaluate rule {rule.name} on event {event.id}", extra={ "rule": rule.dict(), "event": event.dict(), }, ) continue if matched_rules: self.logger.info( f"Rule {rule.name} on event {event.id} is relevant" ) rule_fingerprints = self._calc_rule_fingerprint(event, rule) for rule_fingerprint in rule_fingerprints: # #If the alert recover its previous status, we need to check if there are any alerts with the same fingerprint that were resolved creation_allowed = True if hasattr(event, "previous_status") and (event.previous_status == AlertStatus.MAINTENANCE.value): alerts_solved = get_alerts_by_fingerprint(self.tenant_id, event.fingerprint, status=AlertStatus.RESOLVED.value) if alerts_solved and any(event.lastReceived < solved_alert.event["lastReceived"] for solved_alert in alerts_solved): creation_allowed = False incident, send_created_event = self._get_or_create_incident( rule=rule, rule_fingerprint=",".join(rule_fingerprint), session=session, event=event, creation_allowed=creation_allowed ) if incident: incident = assign_alert_to_incident( fingerprint=event.fingerprint, incident=incident, tenant_id=self.tenant_id, session=session, ) if not incident.is_visible: self.logger.info( f"No existing incidents for rule {rule.name}. 
Checking incident creation conditions" ) rule_groups = self._extract_subrules( rule.definition_cel ) firing_count = sum( [ alert.event.get("unresolvedCounter", 1) for alert in incident.alerts ] ) alerts_count = max(incident.alerts_count, firing_count) if alerts_count >= rule.threshold: if not rule.require_approve: if rule.create_on == "any" or ( rule.create_on == "all" and len(rule_groups) == len(matched_rules) ): self.logger.info( "Single event is enough, so creating incident" ) incident.is_visible = True elif rule.create_on == "all": incident = self._process_event_for_history_based_rule( incident, rule, session ) send_created_event = incident.is_visible # If we try to access incident.id inside except block, it will try to refresh # instance and raises PendingRollback error incident_id = incident.id # Incident instance might change till this moment (set visible for example), # so we need to commit changes # Otherwise sqlalchemy might try to do this in unpredictable moment for attempt in range(3): try: # Explicitly add incident, but it most likely already there, since it was loaded in # same session session.add(incident) session.commit() break except StaleDataError as ex: if "expected to update" in ex.args[0]: self.logger.warning( f"Race condition met while updating incident `{incident_id}`, retry #{attempt}" ) session.rollback() continue else: raise incident = IncidentBl( self.tenant_id, session ).resolve_incident_if_require(incident, handle_workflow_event=False) incident_dto = IncidentDto.from_db_incident(incident) if send_created_event: RulesEngine.send_workflow_event( self.tenant_id, session, incident_dto, "created" ) elif incident.is_visible: RulesEngine.send_workflow_event( self.tenant_id, session, incident_dto, "updated" ) incidents_dto[incident.id] = incident_dto else: self.logger.info( f"Rule {rule.name} on event {event.id} is not relevant" ) self.logger.info("Rules ran successfully") # if we don't have any updated groups, we don't need to create any 
def get_value_from_event(self, event: "AlertDto", var: str) -> str:
    """
    Resolve a template variable against an alert and return it as a string.

    e.g., alert.labels.host -> event['labels']['host']
          alert.service     -> event['service']

    Args:
        event: alert whose ``dict()`` representation is walked.
        var: dotted path, optionally prefixed with ``alert.``; surrounding
            whitespace on the path and on each segment is ignored.

    Returns:
        ``str(value)`` for the resolved value, or ``"N/A"`` when the path
        cannot be followed or resolves to ``None``.
    """
    # Strip only the *leading* 'alert.' prefix. A blanket replace() would also
    # eat 'alert.' inside later segments (e.g. 'labels.myalert.id').
    path = var.strip().removeprefix("alert.")

    current = event.dict()  # Convert to dict for easier access
    try:
        for part in path.split("."):
            current = current.get(part.strip())
        return str(current) if current is not None else "N/A"
    except (KeyError, AttributeError):
        # A segment was looked up on something that is not a mapping
        # (missing parent, scalar, list, ...).
        return "N/A"
var_to_replace = "" alerts_dtos = convert_db_alerts_to_dto_alerts(alerts) for alert in alerts_dtos: value = self.get_value_from_event(alert, var) # don't add twice the same value if value not in values: var_to_replace += value + "," values.add(value) this_event_val = self.get_value_from_event(event, var) if this_event_val not in values: var_to_replace += this_event_val pattern = r"\{\{\s*" + re.escape(var) + r"\s*\}\}" # it happens when the last value is already in the incident name so its skipped if var_to_replace.endswith(","): var_to_replace = var_to_replace[:-1] # update the incident name template # note that it will be commited later, when the incident is commited incident_name = re.sub(pattern, var_to_replace, incident_name) # Re-apply the incident prefix after template regeneration. # The template generates a plain name without the prefix, which # would otherwise overwrite the prefixed name set during creation # or the earlier prefix check. # See: https://github.com/keephq/keep/issues/5450 if rule.incident_prefix and rule.incident_prefix not in incident_name: incident_name = f"{rule.incident_prefix}-{existed_incident.running_number} - {incident_name}" # we are done if existed_incident.user_generated_name != incident_name: existed_incident.user_generated_name = incident_name self.logger.info( "Incident name updated", extra={ "incident_id": existed_incident.id, "old_incident_name": current_name, "new_incident_name": existed_incident.user_generated_name, }, ) return existed_incident, False # else, this is the first time # Starting new incident ONLY if alert is firing # https://github.com/keephq/keep/issues/3418 if creation_allowed and (event.status == AlertStatus.FIRING.value): if rule.incident_name_template: incident_name = copy.copy(rule.incident_name_template) variables = self.get_vaiables(rule.incident_name_template) if not variables: self.logger.warning( f"Failed to fetch the appropriate labels from the event {event.id} and rule {rule.name}" ) 
def _process_event_for_history_based_rule(
    self, incident: Incident, rule: Rule, session: Session
) -> Incident:
    """
    Decide whether a candidate incident of a ``create_on == "all"`` rule may
    become visible, based on the alerts already attached to it.

    The incident is made visible (and committed) only when the attached
    alerts together match every sub-rule of ``rule.definition_cel`` and
    ``is_all_alerts_in_status`` reports them all as FIRING.

    Returns:
        The same incident instance, possibly with ``is_visible`` set.
    """
    self.logger.info("Multiple events required for the incident to start")

    enrich_incidents_with_alerts(
        tenant_id=self.tenant_id,
        incidents=[incident],
        session=session,
    )
    alert_fingerprints = [db_alert.fingerprint for db_alert in incident.alerts]

    required_sub_rules = set(self._extract_subrules(rule.definition_cel))
    seen_sub_rules = set()
    for db_alert in incident.alerts:
        seen_sub_rules |= set(
            self._check_if_rule_apply(rule, AlertDto(**db_alert.event))
        )
        if seen_sub_rules == required_sub_rules:
            break
    else:
        # Ran out of alerts before every sub-rule was matched.
        return incident

    if is_all_alerts_in_status(
        fingerprints=alert_fingerprints, status=AlertStatus.FIRING, session=session
    ):
        incident.is_visible = True
        session.add(incident)
        session.commit()

    return incident
@staticmethod
def sanitize_cel_payload(payload):
    """
    Return a copy of ``payload`` without the keys whose first character is
    listed in ``forbidden_starts``.

    Dicts are sanitized recursively, including dicts nested inside lists
    (at any depth). Every removed key is reported through a warning log.
    Empty keys and non-string keys are kept as they are instead of raising.

    Args:
        payload: the (usually dict) payload to sanitize.

    Returns:
        The sanitized payload. Only the payload is returned; warnings are
        logged, not returned.
    """
    forbidden_starts = (
        "@",
        "-",
        "$",
        "#",
        " ",
        ":",
        ".",
        "/",
        "\\",
        "*",
        "&",
        "^",
        "%",
        "!",
    )
    logger = logging.getLogger(__name__)

    def _sanitize(value):
        if isinstance(value, dict):
            result = {}
            for k, v in value.items():
                # Only the first character is checked; startswith() is safe
                # for the empty string, unlike k[0].
                if isinstance(k, str) and k.startswith(forbidden_starts):
                    logger.warning(
                        f"Removed key '{k}' starting with forbidden character '{k[0]}'"
                    )
                    continue
                result[k] = _sanitize(v)
            return result
        if isinstance(value, list):
            return [_sanitize(item) for item in value]
        return value

    return _sanitize(payload)
""" sub_rules = self._extract_subrules(rule.definition_cel) payload = event.dict() # workaround since source is a list # todo: fix this in the future payload["source"] = payload["source"][0] payload = RulesEngine.sanitize_cel_payload(payload) # what we do here is to compile the CEL rule and evaluate it # https://github.com/cloud-custodian/cel-python # https://github.com/google/cel-spec sub_rules_matched = [] for sub_rule in sub_rules: # Shahar: rules such as "(source != null)" causing an exception: # celpy.evaluation.CELEvalError: ("found no matching overload for 'relation_ne' applied to # '(, )'", , # ("no such overload: != None ",)) # So we need to replace "null" with "" # # TODO: it works for strings now, but we need to add support on list/dict when needed if "null" in sub_rule: sub_rule = sub_rule.replace("null", '""') ast = self.env.compile(sub_rule) prgm = self.env.program(ast) activation = celpy.json_to_cel(json.loads(json.dumps(payload, default=str))) try: r = prgm.evaluate(activation) except celpy.evaluation.CELEvalError as e: # this is ok, it means that the subrule is not relevant for this event if "no such member" in str(e): continue # unknown # --- Fix for https://github.com/keephq/keep/issues/5107 --- if "no such overload" in str(e) or "found no matching overload" in str( e ): try: coerced = self._coerce_eq_type_error( sub_rule, prgm, activation, event ) if coerced: sub_rules_matched.append(sub_rule) continue except Exception: pass raise if r: sub_rules_matched.append(sub_rule) # no subrules matched return sub_rules_matched def _coerce_eq_type_error(self, cel, prgm, activation, alert): """ Helper for type coercion fallback for ==/!= between int and str in CEL. 
Fixes https://github.com/keephq/keep/issues/5107 """ import re m = re.match(r"([a-zA-Z0-9_\.]+)\s*([!=]=)\s*(.+)", cel) if not m: return False left, op, right = m.groups() left = left.strip() right = ( right.strip().strip('"') if right.strip().startswith('"') and right.strip().endswith('"') else right.strip() ) try: def get_nested(d, path): for part in path.split("."): if isinstance(d, dict): d = d.get(part) else: return None return d left_val = get_nested(activation, left) try: right_val = int(right) except Exception: try: right_val = float(right) except Exception: right_val = right # If one is str and the other is int/float, compare as str if (isinstance(left_val, (int, float)) and isinstance(right_val, str)) or ( isinstance(left_val, str) and isinstance(right_val, (int, float)) ): if op == "==": return str(left_val) == str(right_val) else: return str(left_val) != str(right_val) # Also handle both as str for robustness if op == "==": return str(left_val) == str(right_val) else: return str(left_val) != str(right_val) except Exception: pass return False def _calc_rule_fingerprint(self, event: AlertDto, rule: Rule) -> list[list[str]]: # extract all the grouping criteria from the event # e.g. if the grouping criteria is ["event.labels.queue", "event.labels.cluster"] # and the event is: # { # "labels": { # "queue": "queue1", # "cluster": "cluster1", # "foo": "bar" # } # } # than the rule_fingerprint will be "[queue1,cluster1]" # if the rule is multi_level, the rule_fingerprint will be "[queue1,cluster1]" and "[queue2,cluster2]" and more than 1 incident will be created # note: rule_fingerprint is not a unique id, since different rules can lead to the same rule_fingerprint # hence, the actual fingerprint is composed of the rule_fingerprint and the incident id event_payload = event.dict() grouping_criteria = rule.grouping_criteria or [] if not rule.multi_level: rule_fingerprints = [] for criteria in grouping_criteria: # we need to extract the value from the event # e.g. 
if the criteria is "event.labels.queue" # than we need to extract the value of event["labels"]["queue"] criteria_parts = criteria.split(".") value = event_payload for part in criteria_parts: value = value.get(part) if isinstance(value, list): value = ",".join(value) rule_fingerprints.append(value) # if, for example, the event should have labels.X but it doesn't, # than we will have None in the rule_fingerprint if not rule_fingerprints: self.logger.warning( f"Failed to calculate rule fingerprint for event {event.id} and rule {rule.name}", extra={ "rule_id": rule.id, "rule_name": rule.name, "tenant_id": self.tenant_id, }, ) return [["none"]] # if any of the values is None, we will return "none" if any([fingerprint is None for fingerprint in rule_fingerprints]): self.logger.warning( f"Failed to fetch the appropriate labels from the event {event.id} and rule {rule.name}", extra={ "rule_id": rule.id, "rule_name": rule.name, "tenant_id": self.tenant_id, }, ) return [["none"]] return [rule_fingerprints] else: fingerprints = set() # the idea is pretty simple but implementation is a bit hacky for now # we expect the grouping criteria to be a dict with the key being the property name # for example: {"customers": {"1": {"name": "John", "age": 30}, "2": {"name": "Jane", "age": 25}}} # and we want to group by the "name" property # so we will get ["John", "Jane"] and 2 incidents will be created: one for "John" and one for "Jane" with same alerts. if not grouping_criteria: self.logger.warning( "wtf? 
no grouping criteria for multi_level rule", extra={ "rule_id": rule.id, "rule_name": rule.name, "tenant_id": self.tenant_id, }, ) return [["none"]] # @tb: this is a known limitation for now, we only accept 1 grouping criteria for multi_level rule criteria = grouping_criteria[0] criteria_parts = criteria.split(".") for part in criteria_parts: value = event_payload for part in criteria_parts: value = value.get(part) if not isinstance(value, dict): self.logger.warning( "multi level rule grouping criteria is not a dict", extra={ "rule_id": rule.id, "rule_name": rule.name, "tenant_id": self.tenant_id, }, ) return [["none"]] for key in value.keys(): fingerprints.add(value[key].get(rule.multi_level_property_name)) return [[key] for key in fingerprints] return [["none"]] @staticmethod def get_alerts_activation(alerts: list[AlertDto]): activations = [] for alert in alerts: payload = alert.dict() # TODO: workaround since source is a list # should be fixed in the future payload["source"] = ",".join(payload["source"]) # payload severity could be the severity itself or the order of the severity, cast it to the order if isinstance(payload["severity"], str): payload["severity"] = AlertSeverity(payload["severity"].lower()).order # sanitize the payload payload = RulesEngine.sanitize_cel_payload(payload) activation = celpy.json_to_cel(json.loads(json.dumps(payload, default=str))) activations.append(activation) return activations def filter_alerts( self, alerts: list[AlertDto], cel: str, alerts_activation: list = None ): """This function filters alerts according to a CEL Args: alerts (list[AlertDto]): list of alerts cel (str): CEL expression Returns: list[AlertDto]: list of alerts that are related to the cel """ logger = logging.getLogger(__name__) # if the cel is empty, return all the alerts if cel == "": return alerts # if the cel is empty, return all the alerts if not cel: logger.debug("No CEL expression provided") return alerts # preprocess the cel expression cel = 
@staticmethod
def send_workflow_event(
    tenant_id: str, session: Session, incident_dto: IncidentDto, action: str
):
    """
    Forward an incident ``action`` to the incident business layer.

    Builds an ``IncidentBl`` bound to the pusher client, asks it to send the
    workflow event and then to update clients about the incident change.
    """
    log = logging.getLogger(__name__)
    incident_id = incident_dto.id

    log.info(f"Sending workflow event {action} for incident {incident_id}")

    bl = IncidentBl(tenant_id, session, get_pusher_client())
    bl.send_workflow_event(incident_dto, action)
    bl.update_client_on_incident_change(incident_id)

    log.info(f"Workflow event {action} for incident {incident_id} sent")
================================================ import enum import logging from keep.api.core.alerts import query_last_alerts from keep.api.core.db import get_last_alerts from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.api.core.elastic import ElasticClient from keep.api.core.tenant_configuration import TenantConfiguration from keep.api.models.alert import AlertDto, AlertStatus from keep.api.models.db.preset import PresetDto, PresetSearchQuery from keep.api.models.query import QueryDto from keep.api.models.time_stamp import TimeStampFilter from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts from keep.rulesengine.rulesengine import RulesEngine from datetime import datetime, timedelta, timezone class SearchMode(enum.Enum): """The search mode for the search engine""" # use elastic to search alerts (for large tenants) ELASTIC = "elastic" # use internal search to search alerts (for small-medium tenants) INTERNAL = "internal" class SearchEngine: def __init__(self, tenant_id): self.tenant_id = tenant_id self.logger = logging.getLogger(__name__) self.rule_engine = RulesEngine(tenant_id=self.tenant_id) self.elastic_client = ElasticClient(tenant_id) self.tenant_configuration = TenantConfiguration() # this is backward compatibility for single/noauth tenants if tenant_id == SINGLE_TENANT_UUID: self.search_mode = ( SearchMode.ELASTIC if self.elastic_client.enabled else SearchMode.INTERNAL ) # elif elastic is disabled: elif not self.elastic_client.enabled: self.search_mode = SearchMode.INTERNAL # for multi-tenant deployment with elastic enabled, get the per-tenant search configuration: else: search_mode_config = self.tenant_configuration.get_configuration( tenant_id, "search_mode" ) if search_mode_config: self.search_mode = SearchMode(search_mode_config) else: self.search_mode = SearchMode.INTERNAL self.logger.info( "Initialized search engine", extra={"tenant_id": self.tenant_id, "search_mode": self.search_mode}, ) def 
_get_last_alerts( self, limit=1000, timeframe: int = 0, time_stamp: TimeStampFilter = None ) -> list[AlertDto]: """Get the last alerts Returns: list[AlertDto]: The list of alerts """ self.logger.info("Getting last alerts") lower_timestamp = time_stamp.lower_timestamp if time_stamp else None upper_timestamp = time_stamp.upper_timestamp if time_stamp else None alerts = get_last_alerts( tenant_id=self.tenant_id, limit=limit, timeframe=timeframe, lower_timestamp=lower_timestamp, upper_timestamp=upper_timestamp, with_incidents=True, ) # convert the alerts to DTO alerts_dto = convert_db_alerts_to_dto_alerts(alerts) self.logger.info( f"Finished getting last alerts {lower_timestamp} {upper_timestamp} {time_stamp}" ) return alerts_dto def search_alerts_by_cel( self, cel_query: str, limit: int = 1000, timeframe: float = 0, ) -> list[AlertDto]: """Search for alerts based on a CEL query Args: cel_query (str): The CEL query to search for alerts (list[AlertDto]): The list of alerts to search in Returns: list[AlertDto]: The list of alerts that match the query """ cel_query = (cel_query or "").strip() if timeframe: timeframe_in_seconds = timeframe * 24 * 60 * 60 current_utc_date = datetime.now(timezone.utc) time_ago = current_utc_date - timedelta(seconds=timeframe_in_seconds) iso_utc_date = ( time_ago.astimezone(timezone.utc).replace(microsecond=0).isoformat() ) cel_list = [ f"timestamp >= '{iso_utc_date}'", cel_query, ] cel_query = " && ".join(f"({cel})" for cel in cel_list if cel) self.logger.info("Searching alerts by CEL") db_alerts, _ = query_last_alerts( tenant_id=self.tenant_id, query=QueryDto( cel=cel_query, limit=limit, ), ) filtered_alerts = convert_db_alerts_to_dto_alerts(db_alerts) self.logger.info("Finished searching alerts by CEL") return filtered_alerts def _search_alerts_by_sql( self, sql_query: dict, limit=1000, timeframe: int = 0 ) -> list[AlertDto]: """Search for alerts based on a SQL query Args: sql_query (dict): The SQL query to search for Returns: 
list[AlertDto]: The list of alerts that match the query """ self.logger.info("Searching alerts by SQL") query = self._create_raw_sql(sql_query.get("sql"), sql_query.get("params")) # get the alerts from elastic elastic_sql_query = ( f"""select * from "{self.elastic_client.alerts_index}" """ + (f"where {query}" if query else "") ) if timeframe: elastic_sql_query += f" and lastReceived > now() - {timeframe}s" elastic_sql_query += f" order by lastReceived desc limit {limit}" from opentelemetry import trace tracer = trace.get_tracer(__name__) with tracer.start_as_current_span("elastic_run_query"): filtered_alerts = self.elastic_client.search_alerts( elastic_sql_query, limit ) self.logger.info("Finished searching alerts by SQL") return filtered_alerts def search_alerts(self, query: PresetSearchQuery) -> list[AlertDto]: """Search for alerts based on a query Args: query (dict | str): CEL (str) / SQL (dict) query Returns: list[AlertDto]: The list of alerts that match the query """ self.logger.info("Searching alerts") # if internal if self.search_mode == SearchMode.INTERNAL: filtered_alerts = self.search_alerts_by_cel( query.cel_query, limit=query.limit, timeframe=query.timeframe ) # if elastic elif self.search_mode == SearchMode.ELASTIC: filtered_alerts = self._search_alerts_by_sql( query.sql_query, limit=query.limit, timeframe=query.timeframe ) else: self.logger.error("Invalid search mode") return [] self.logger.info("Finished searching alerts") return filtered_alerts def search_preset_alerts( self, presets: list[PresetDto], time_stamp: TimeStampFilter = None ) -> dict[str, list[AlertDto]]: """Search for alerts based on a list of queries Args: presets (list[Preset]): The list of presets to search for Returns: dict[str, list[AlertDto]]: The list of alerts that match each query """ self.logger.info( "Searching alerts for presets", extra={"tenant_id": self.tenant_id, "search_mode": self.search_mode}, ) # if internal if self.search_mode == SearchMode.INTERNAL: # get the alerts 
alerts_dto = self._get_last_alerts(time_stamp=time_stamp) # performance optimization: get the alerts activation once alerts_activation = self.rule_engine.get_alerts_activation(alerts_dto) for preset in presets: filtered_alerts = self.rule_engine.filter_alerts( alerts_dto, preset.cel_query, alerts_activation ) preset.alerts_count = len(filtered_alerts) # update noisy if preset.is_noisy: firing_filtered_alerts = list( filter( lambda alert: alert.status == AlertStatus.FIRING.value and not alert.deleted and not alert.dismissed, filtered_alerts, ) ) # if there are firing alerts, then do noise if firing_filtered_alerts: self.logger.info("Noisy preset is noisy") preset.should_do_noise_now = True else: self.logger.info("Noisy preset is not noisy") preset.should_do_noise_now = False # else if one of the alerts are isNoisy elif not preset.static and any( alert.isNoisy and alert.status == AlertStatus.FIRING.value and not alert.deleted and not alert.dismissed for alert in filtered_alerts ): self.logger.info("Preset is noisy") preset.should_do_noise_now = True # if elastic elif self.search_mode == SearchMode.ELASTIC: # get the alerts from elastic for preset in presets: try: query = self._create_raw_sql( preset.sql_query.get("sql"), preset.sql_query.get("params") ) # get number of alerts and number of noisy alerts elastic_sql_query = ( f"""select count(*), MAX(CASE WHEN isNoisy = true AND dismissed = false AND deleted = false THEN 1 ELSE 0 END) from "{self.elastic_client.alerts_index}" """ + (f" where {query}" if query else "") ) results = self.elastic_client.run_query(elastic_sql_query) if results: preset.alerts_count = results["rows"][0][0] preset.should_do_noise_now = results["rows"][0][1] == 1 else: self.logger.warning( "No results found for preset", extra={"preset_id": preset.id, "preset_name": preset.name}, ) preset.alerts_count = 0 preset.should_do_noise_now = False except Exception: self.logger.exception( "Failed to search alerts for preset", extra={"preset_id": 
preset.id, "preset_name": preset.name}, ) pass self.logger.info( "Finished searching alerts for presets", extra={"tenant_id": self.tenant_id, "search_mode": self.search_mode}, ) return presets def _create_raw_sql(self, sql_template, params): """ Replace placeholders in the SQL template with actual values from the params dictionary. """ params = list(params.items()) # param_{double_digit} bug params.reverse() if params: for key, value in params: placeholder = f":{key}" if isinstance(value, str): value = f"'{value}'" # Add quotes around string values sql_template = sql_template.replace(placeholder, str(value)) return sql_template ================================================ FILE: keep/secretmanager/__init__.py ================================================ ================================================ FILE: keep/secretmanager/awssecretmanager.py ================================================ import json import os import boto3 import opentelemetry.trace as trace from botocore.exceptions import ClientError from keep.api.core.config import config from keep.secretmanager.secretmanager import BaseSecretManager tracer = trace.get_tracer(__name__) SECRET_MANAGER_TAGS = config("AWS_SECRET_MANAGER_TAGS", default=None) ROTATION_ENABLED = config("AWS_SECRET_ROTATION_ENABLED", default=False, cast=bool) ROTATION_DAYS = config("AWS_SECRET_ROTATION_DAYS", default=30, cast=int) ROTATION_LAMBDA_ARN = config("AWS_SECRET_ROTATION_LAMBDA_ARN", default=None) class AwsSecretManager(BaseSecretManager): def __init__(self, context_manager, **kwargs): super().__init__(context_manager) try: session = boto3.session.Session() self.client = session.client( service_name="secretsmanager", region_name=os.environ.get("AWS_REGION") ) except Exception as e: self.logger.error( "Failed to initialize AWS Secrets Manager client", extra={"error": str(e)}, ) raise self.tags = [] if SECRET_MANAGER_TAGS: # we expect this format: key=value,key2=value2 try: for tag in SECRET_MANAGER_TAGS.split(","): 
def write_secret(self, secret_name: str, secret_value: str) -> None:
    """
    Writes a secret to AWS Secrets Manager.

    An existing secret gets a new value; if AWS reports
    ``ResourceNotFoundException`` the secret is created instead. After a
    creation, a rotation policy is configured when both
    ``ROTATION_ENABLED`` and ``ROTATION_LAMBDA_ARN`` are set; a failure to
    configure rotation is logged but does not fail the write.

    Args:
        secret_name (str): The name of the secret.
        secret_value (str): The value of the secret.

    Raises:
        ClientError: If an AWS-specific error occurs while writing the secret.
        Exception: If any other unexpected error occurs.
    """
    with tracer.start_as_current_span("write_secret"):
        self.logger.info("Writing secret", extra={"secret_name": secret_name})
        try:
            # Check if secret exists by trying to describe it
            self.client.describe_secret(SecretId=secret_name)
            # If secret exists, update it with new value
            self.client.put_secret_value(
                SecretId=secret_name, SecretString=secret_value
            )
            self.logger.info(
                "Secret updated successfully", extra={"secret_name": secret_name}
            )
        except ClientError as e:
            if e.response["Error"]["Code"] == "ResourceNotFoundException":
                try:
                    create_kwargs = {
                        "Name": secret_name,
                        "SecretString": secret_value,
                    }
                    # botocore validates parameter types, so optional
                    # arguments must be omitted rather than passed as None.
                    kms_key_id = os.environ.get("AWS_KMS_KEY_ID")
                    if kms_key_id:
                        create_kwargs["KmsKeyId"] = kms_key_id
                    if self.tags:
                        create_kwargs["Tags"] = self.tags

                    self.client.create_secret(**create_kwargs)
                    self.logger.info(
                        "Secret created successfully",
                        extra={"secret_name": secret_name},
                    )

                    # Apply rotation policy if enabled
                    if ROTATION_ENABLED and ROTATION_LAMBDA_ARN:
                        try:
                            self.client.rotate_secret(
                                SecretId=secret_name,
                                RotationLambdaARN=ROTATION_LAMBDA_ARN,
                                RotationRules={
                                    "AutomaticallyAfterDays": ROTATION_DAYS
                                },
                                RotateImmediately=False,
                            )
                            self.logger.info(
                                "Rotation policy configured successfully",
                                extra={
                                    "secret_name": secret_name,
                                    "rotation_days": ROTATION_DAYS,
                                },
                            )
                        except ClientError as rot_error:
                            # Best effort: the secret itself was created.
                            self.logger.error(
                                "Failed to configure rotation policy",
                                extra={
                                    "secret_name": secret_name,
                                    "error": str(rot_error),
                                    "error_code": rot_error.response["Error"][
                                        "Code"
                                    ],
                                },
                            )
                except Exception as create_error:
                    self.logger.error(
                        "Unexpected error while creating secret",
                        extra={
                            "secret_name": secret_name,
                            "error": str(create_error),
                            "error_type": type(create_error).__name__,
                        },
                    )
                    raise
            else:
                self.logger.error(
                    "AWS error while writing secret",
                    extra={
                        "secret_name": secret_name,
                        "error": str(e),
                        "error_code": e.response["Error"]["Code"],
                    },
                )
                raise
        except Exception as e:
            self.logger.error(
                "Unexpected error while writing secret",
                extra={
                    "secret_name": secret_name,
                    "error": str(e),
                    "error_type": type(e).__name__,
                },
            )
            raise
def read_secret(self, secret_name: str, is_json: bool = False) -> str | dict:
    """
    Fetch a secret value from the ``Secret`` table by its key.

    Args:
        secret_name: key of the secret row.
        is_json: when True the stored value is parsed with ``json.loads``.

    Returns:
        The stored value, or the parsed JSON when ``is_json`` is set.

    Raises:
        KeyError: no row exists for ``secret_name``.
        Exception: any error from the query or JSON parsing is logged and
            re-raised.
    """
    self.logger.info("Getting secret", extra={"secret_name": secret_name})
    with Session(engine) as session:
        try:
            statement = select(Secret).where(Secret.key == secret_name)
            record = session.exec(statement).one_or_none()
            if record:
                return json.loads(record.value) if is_json else record.value
        except Exception as e:
            self.logger.error(
                "Failed to read secret",
                extra={"error": str(e)},
            )
            raise
    # Only reached when the query returned no row.
    raise KeyError(f"Secret {secret_name} not found")
session.exec( select(Secret).where( Secret.key == secret_name ) ).one_or_none() try: if secret_model: secret_model.value = secret_value secret_model.last_updated = datetime.utcnow() session.commit() return secret_model = Secret( key=secret_name, value=secret_value, ) session.add(secret_model) session.commit() except Exception as e: self.logger.error( "Failed to write secret", extra={"error": str(e)}, ) raise def delete_secret(self, secret_name: str) -> None: self.logger.info("Deleting secret", extra={"secret_name": secret_name}) with Session(engine) as session: secret_model = session.exec( select(Secret).where( Secret.key == secret_name ) ).one_or_none() try: if secret_model: session.delete(secret_model) session.commit() except Exception as e: self.logger.error( "Failed to delete secret", extra={"error": str(e)}, ) raise ================================================ FILE: keep/secretmanager/filesecretmanager.py ================================================ import json import os from keep.secretmanager.secretmanager import BaseSecretManager class FileSecretManager(BaseSecretManager): def __init__(self, context_manager, **kwargs): super().__init__(context_manager) self.directory = os.environ.get("SECRET_MANAGER_DIRECTORY", "./") def read_secret(self, secret_name: str, is_json: bool = False) -> str | dict: secret_name = os.path.join(self.directory, secret_name) self.logger.debug(f"Reading {secret_name}", extra={"is_json": is_json}) with open(secret_name, "r") as f: file_data = f.read() if is_json: return json.loads(file_data) self.logger.debug(f"Read {secret_name}", extra={"is_json": is_json}) return file_data def write_secret(self, secret_name: str, secret_value: str) -> None: path = os.path.join(self.directory, secret_name) # Create directory if not exist os.makedirs(self.directory, exist_ok=True) with open(path, "w") as f: self.logger.debug(f"Writing {secret_name}") try: f.write(secret_value) except Exception as e: self.logger.error(f"Error writing 
class GcpSecretManager(BaseSecretManager):
    """Secret manager that stores secrets in Google Cloud Secret Manager."""

    def __init__(self, context_manager, **kwargs):
        super().__init__(context_manager)
        # The project is mandatory: a missing variable raises KeyError here.
        self.project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
        self.client = secretmanager.SecretManagerServiceClient()

    def _create_secret_if_missing(self, secret_name: str) -> None:
        """Create the secret container; an already existing one is left as is."""
        project_path = f"projects/{self.project_id}"
        creation_request = {
            "parent": project_path,
            "secret_id": secret_name,
            "secret": {"replication": {"automatic": {}}},
        }
        try:
            self.client.create_secret(request=creation_request)
        except AlreadyExists:
            # Nothing to create - the caller adds a new version next.
            return
        self.logger.info(
            "Secret created successfully", extra={"secret_name": secret_name}
        )

    def _add_secret_version(self, secret_name: str, secret_value: str) -> None:
        """Store ``secret_value`` as a new version of the secret."""
        try:
            version_request = {
                "parent": self.client.secret_path(self.project_id, secret_name),
                "payload": {
                    "data": secret_value.encode("UTF-8"),
                },
            }
            self.client.add_secret_version(request=version_request)
            self.logger.info(
                "Secret updated successfully", extra={"secret_name": secret_name}
            )
        except Exception as version_error:
            self.logger.error(
                "Error writing secret",
                extra={"secret_name": secret_name, "error": str(version_error)},
            )
            raise

    def write_secret(self, secret_name: str, secret_value: str) -> None:
        """
        Write a secret: create it when missing, then add the value as a version.

        Args:
            secret_name (str): The name of the secret.
            secret_value (str): The value of the secret.

        Raises:
            Exception: If creating the secret fails for a reason other than
                it already existing, or if adding the version fails.
        """
        with tracer.start_as_current_span("write_secret"):
            self.logger.info("Writing secret", extra={"secret_name": secret_name})
            self._create_secret_if_missing(secret_name)
            self._add_secret_version(secret_name, secret_value)

    def read_secret(self, secret_name: str, is_json: bool = False) -> str | dict:
        """
        Read the latest version of a secret.

        Args:
            secret_name (str): The name of the secret.
            is_json (bool): When True the payload is decoded with ``json.loads``.

        Returns:
            str | dict: The UTF-8 decoded payload, JSON-decoded on request.
        """
        with tracer.start_as_current_span("read_secret"):
            self.logger.debug("Getting secret", extra={"secret_name": secret_name})
            latest_version = (
                f"projects/{self.project_id}/secrets/{secret_name}/versions/latest"
            )
            accessed = self.client.access_secret_version(name=latest_version)
            decoded = accessed.payload.data.decode("UTF-8")
            result = json.loads(decoded) if is_json else decoded
            self.logger.debug(
                "Got secret successfully", extra={"secret_name": secret_name}
            )
            return result

    def delete_secret(self, secret_name: str) -> None:
        """
        Delete a secret by name.

        Args:
            secret_name (str): The name of the secret.
        """
        with tracer.start_as_current_span("delete_secret"):
            secret_path = f"projects/{self.project_id}/secrets/{secret_name}"
            self.client.delete_secret(request={"name": secret_path})
def write_secret(self, secret_name: str, secret_value: str) -> None:
    """
    Store a value in a Kubernetes Secret, creating or patching it as needed.

    The value is base64-encoded and kept under the ``value`` data key.

    Args:
        secret_name (str): The name of the secret; underscores become dashes
            and the name is lower-cased before use.
        secret_value (str): The value of the secret.

    Raises:
        ApiException: If creating the secret fails for a reason other than a
            conflict, or if patching the existing secret fails.
    """
    # k8s requirements: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
    secret_name = secret_name.replace("_", "-").lower()
    self.logger.info("Writing secret", extra={"secret_name": secret_name})

    encoded_value = base64.b64encode(secret_value.encode()).decode()
    secret_body = kubernetes.client.V1Secret(
        metadata=kubernetes.client.V1ObjectMeta(name=secret_name),
        data={"value": encoded_value},
    )

    try:
        self.api.create_namespaced_secret(namespace=self.namespace, body=secret_body)
        self.logger.info(
            "Secret created/updated successfully",
            extra={"secret_name": secret_name},
        )
    except ApiException as create_error:
        if create_error.status != 409:
            self.logger.error(
                "Error writing secret",
                extra={"secret_name": secret_name, "error": str(create_error)},
            )
            raise

        # 409 means the secret already exists, so patch it instead.
        try:
            self.api.patch_namespaced_secret(
                name=secret_name, namespace=self.namespace, body=secret_body
            )
            self.logger.info(
                "Secret updated successfully",
                extra={"secret_name": secret_name},
            )
        except kubernetes.client.exceptions.ApiException as patch_error:
            self.logger.error(
                "Error updating secret",
                extra={"secret_name": secret_name, "error": str(patch_error)},
            )
            raise patch_error
secret", extra={"secret_name": secret_name}) try: self.api.delete_namespaced_secret( name=secret_name, namespace=self.namespace, body={} ) self.logger.info( "Deleted secret successfully", extra={"secret_name": secret_name} ) except ApiException as e: self.logger.error( "Error deleting secret", extra={"secret_name": secret_name, "error": str(e)}, ) raise ================================================ FILE: keep/secretmanager/secretmanager.py ================================================ import abc import logging from keep.contextmanager.contextmanager import ContextManager class BaseSecretManager(metaclass=abc.ABCMeta): def __init__(self, context_manager: ContextManager, **kwargs): self.logger = logging.getLogger(__name__) self.context_manager = context_manager @abc.abstractmethod def read_secret(self, secret_name: str, is_json: bool = False) -> str | dict: """ Read a secret from the secret manager. Args: secret_name (str): The name of the secret to read. is_json (bool): Whether to try and convert to python dictionary or not (json.loads) Returns: str: The secret value. """ raise NotImplementedError( "read_secret() method not implemented" " for {}".format(self.__class__.__name__) ) @abc.abstractmethod def write_secret(self, secret_name: str, secret_value: str) -> None: """ Write a secret to the secret manager. Args: secret_name (str): The name of the secret to write. secret_value (str): The value of the secret to write. """ @abc.abstractmethod def delete_secret(self, secret_name: str) -> None: """ Delete a secret from the secret manager. Args: secret_name (str): The name of the secret to delete. 
""" raise NotImplementedError("delete_secret() method not implemented") ================================================ FILE: keep/secretmanager/secretmanagerfactory.py ================================================ import enum from keep.api.core.config import config from keep.contextmanager.contextmanager import ContextManager from keep.secretmanager.secretmanager import BaseSecretManager class SecretManagerTypes(enum.Enum): FILE = "file" GCP = "gcp" K8S = "k8s" VAULT = "vault" AWS = "aws" DB = "db" class SecretManagerFactory: @staticmethod def get_secret_manager( context_manager: ContextManager, secret_manager_type: SecretManagerTypes = None, **kwargs, ) -> BaseSecretManager: if not secret_manager_type: secret_manager_type = SecretManagerTypes[ config("SECRET_MANAGER_TYPE", default="FILE").upper() ] if secret_manager_type == SecretManagerTypes.FILE: from keep.secretmanager.filesecretmanager import FileSecretManager return FileSecretManager(context_manager, **kwargs) elif secret_manager_type == SecretManagerTypes.GCP: from keep.secretmanager.gcpsecretmanager import GcpSecretManager return GcpSecretManager(context_manager, **kwargs) elif secret_manager_type == SecretManagerTypes.K8S: from keep.secretmanager.kubernetessecretmanager import ( KubernetesSecretManager, ) return KubernetesSecretManager(context_manager, **kwargs) elif secret_manager_type == SecretManagerTypes.VAULT: from keep.secretmanager.vaultsecretmanager import VaultSecretManager return VaultSecretManager(context_manager, **kwargs) elif secret_manager_type == SecretManagerTypes.AWS: from keep.secretmanager.awssecretmanager import AwsSecretManager return AwsSecretManager(context_manager, **kwargs) elif secret_manager_type == SecretManagerTypes.DB: from keep.secretmanager.dbsecretmanager import DbSecretManager return DbSecretManager(context_manager, **kwargs) raise NotImplementedError( f"Secret manager type {str(secret_manager_type)} not implemented" ) ================================================ 
class VaultSecretManager(BaseSecretManager):
    """
    Secret manager backed by the HashiCorp Vault KV v2 engine.

    Authentication uses ``HASHICORP_VAULT_TOKEN`` when it is set; otherwise,
    when ``HASHICORP_VAULT_USE_K8S`` is set, it logs in with the pod's
    service-account token and the role in ``HASHICORP_VAULT_K8S_ROLE``.
    Values are stored under the ``value`` key of the KV entry.
    """

    # Both settings are read once, when the class body is executed.
    HASHICORP_VAULT_ADDR = os.environ.get(
        "HASHICORP_VAULT_ADDR", "http://localhost:8200"
    )
    # NOTE: "VALUT" is misspelled, but it is the environment variable name this
    # class reads, so renaming it would break existing deployments.
    HASHICORP_VALUT_NAMESPACE = os.environ.get("HASHICORP_VALUT_NAMESPACE", "default")

    def __init__(self, context_manager, **kwargs):
        """
        Build an authenticated ``hvac`` client.

        Raises:
            Exception: If k8s auth is requested without a role, or if neither
                a token nor k8s auth is configured.
        """
        super().__init__(context_manager)
        vault_token = os.environ.get("HASHICORP_VAULT_TOKEN")
        # NOTE(review): any non-empty string (even "false") enables k8s auth,
        # because the raw environment value is used for truthiness.
        vault_use_k8s = os.environ.get("HASHICORP_VAULT_USE_K8S", False)
        if vault_token:
            self.client = hvac.Client(
                url=self.HASHICORP_VAULT_ADDR,
                namespace=self.HASHICORP_VALUT_NAMESPACE,
                token=vault_token,
            )
        elif vault_use_k8s:
            k8s_role = os.environ.get("HASHICORP_VAULT_K8S_ROLE")
            if not k8s_role:
                raise Exception(
                    "HASHICORP_VAULT_K8S_ROLE is required when using k8s auth method"
                )
            from hvac.api.auth_methods import Kubernetes

            self.client = hvac.Client(
                url=self.HASHICORP_VAULT_ADDR, namespace=self.HASHICORP_VALUT_NAMESPACE
            )
            # Use a context manager so the token file handle is always closed
            # (it used to be opened with a bare open() and never closed).
            with open("/var/run/secrets/kubernetes.io/serviceaccount/token") as f:
                jwt = f.read()
            Kubernetes(self.client.adapter).login(role=k8s_role, jwt=jwt)
        else:
            raise Exception("Unsupported vault login method")
        self.logger.info("Using Vault Secret Manager")

    def write_secret(self, secret_name: str, secret_value: str) -> None:
        """
        Create or update a KV v2 secret holding ``secret_value``.

        Args:
            secret_name (str): Path of the secret.
            secret_value (str): Value stored under the ``value`` key.
        """
        self.logger.info("Writing secret", extra={"secret_name": secret_name})
        self.client.secrets.kv.v2.create_or_update_secret(
            path=secret_name, secret={"value": secret_value}
        )
        self.logger.info(
            "Secret created/updated successfully", extra={"secret_name": secret_name}
        )

    def read_secret(self, secret_name: str, is_json: bool = False) -> str | dict:
        """
        Read the ``value`` key of a KV v2 secret.

        Args:
            secret_name (str): Path of the secret.
            is_json (bool): When True the value is decoded with ``json.loads``.

        Returns:
            str | dict: The stored value, JSON-decoded on request.

        Raises:
            json.JSONDecodeError: If ``is_json`` is True and the value is not
                valid JSON.
        """
        self.logger.info("Getting secret", extra={"secret_name": secret_name})
        secret = self.client.secrets.kv.v2.read_secret_version(path=secret_name)
        self.logger.info(
            "Secret retrieved successfully", extra={"secret_name": secret_name}
        )
        secret_value = secret["data"]["data"].get("value")
        if is_json:
            try:
                secret_value = json.loads(secret_value)
            except json.JSONDecodeError as e:
                self.logger.error(
                    "Failed to parse secret as JSON",
                    extra={"secret_name": secret_name, "error": str(e)},
                )
                raise
        return secret_value

    def delete_secret(self, secret_name: str) -> None:
        """
        Delete a secret together with its metadata and all of its versions.

        Args:
            secret_name (str): Path of the secret.
        """
        self.logger.info("Deleting secret", extra={"secret_name": secret_name})
        self.client.secrets.kv.delete_metadata_and_all_versions(secret_name)
        self.logger.info(
            "Secret deleted successfully", extra={"secret_name": secret_name}
        )
def __init__(
    self,
    context_manager,
    step_id: str,
    config: dict,
    step_type: StepType,
    provider: BaseProvider,
    provider_parameters: dict,
):
    """
    Set up a step (or action) from its workflow configuration.

    Args:
        context_manager: Shared workflow context; also handed to the IOHandler.
        step_id (str): Identifier of the step, exposed through ``name``.
        config (dict): Raw step configuration; ``condition``, ``vars``,
            ``on-failure`` and ``continue`` are read from it here.
        step_type (StepType): Whether this is a step or an action.
        provider (BaseProvider): Provider instance the step runs against.
        provider_parameters (dict): Parameters later passed to the provider.
    """
    self.config = config
    self.step_id = step_id
    self.step_type = step_type
    self.provider = provider
    self.provider_parameters: dict[str, str | StepProviderParameter] = (
        provider_parameters
    )

    # backward compatibility: "on-failure" used to live under "provider"
    provider_level_on_failure = self.config.get("provider", {}).get("on-failure", {})
    step_level_on_failure = self.config.get("on-failure", {})
    if step_level_on_failure:
        self.on_failure = step_level_on_failure
    else:
        self.on_failure = provider_level_on_failure

    self.context_manager: ContextManager = context_manager
    self.io_handler = IOHandler(context_manager)
    self.conditions = self.config.get("condition", [])
    self.vars = self.config.get("vars", {})
    self.conditions_results = {}
    self.logger = logging.getLogger(__name__)

    # Retry settings come from the resolved on-failure section.
    retry_settings = self.on_failure.get("retry", {})
    self.__retry = retry_settings
    self.__retry_count = retry_settings.get("count", 0)
    self.__retry_interval = retry_settings.get("interval", 0)
    self.__continue_to_next_step = self.config.get("continue", True)
def run(self):
    """
    Execute the step and report whether its action actually ran.

    Dispatches to ``_run_foreach`` when a ``foreach`` is configured, otherwise
    to ``_run_single`` (without rendering when ``_dont_render()`` says so).

    Returns:
        Whatever the selected runner returns.

    Raises:
        ActionError: Wrapping any exception raised while running.
    """
    try:
        if self.config.get("foreach"):
            return self._run_foreach()
        # special case for Keep provider on _notify with "if" - it should render the parameters itself
        if self._dont_render():
            return self._run_single(dont_render=True)
        return self._run_single()
    except Exception as failure:
        log_context = {
            "step_id": self.step_id,
        }
        self.logger.warning(
            "Failed to run step %s with error %s",
            self.step_id,
            failure,
            extra=log_context,
            exc_info=True,
        )
        raise ActionError(failure)
def _run_single(self, dont_render=False):
    """
    Run the step once: apply conditions, evaluate ``if``, throttle, execute.

    Args:
        dont_render (bool): When True the provider parameters are passed to
            the provider as configured instead of being rendered first.

    Returns:
        True when the provider was invoked; None when the ``if`` expression
        evaluated to false or the action was throttled.

    Raises:
        Exception: If a condition has no name, a condition fails to apply, or
            the ``if`` expression cannot be evaluated.
        StepError: If the provider call (after all retries) or the
            bookkeeping around it fails.
    """
    # Initialize all conditions
    conditions = []
    # Render aliases into a NEW dict. Writing the rendered values back into
    # self.config["alias"] made every later run (e.g. the next foreach item)
    # reuse the first rendering instead of rendering the template again.
    aliases = {
        alias_key: self.io_handler.render(alias_val)
        for alias_key, alias_val in self.config.get("alias", {}).items()
    }

    self.context_manager.set_step_vars(
        self.step_id, _vars=self.vars, _aliases=aliases
    )

    for condition in self.conditions:
        condition_name = condition.get("name", None)

        if not condition_name:
            raise Exception("Condition must have a name")

        conditions.append(
            ConditionFactory.get_condition(
                self.context_manager,
                condition.get("type"),
                condition_name,
                condition,
            )
        )

    # First, apply every condition and record its result in the context
    for condition in conditions:
        condition_compare_to = condition.get_compare_to()
        condition_compare_value = condition.get_compare_value()
        try:
            condition_result = condition.apply(
                condition_compare_to, condition_compare_value
            )
        except Exception as e:
            self.logger.error(
                "Failed to apply condition %s with error %s",
                condition.condition_name,
                e,
                extra={
                    "step_id": self.step_id,
                },
            )
            raise
        self.context_manager.set_condition_results(
            self.step_id,
            condition.condition_name,
            condition.condition_type,
            condition_compare_to,
            condition_compare_value,
            condition_result,
            condition_alias=condition.condition_alias,
            **condition.condition_context,
        )

    # Second, decide if need to run
    # after all conditions are applied, check if we need to run
    # there are 2 cases:
    # 1. a "if" block is supplied, then use it
    # 2. no "if" block is supplied, then use the AND between all conditions
    if self.config.get("if"):
        if_conf = self.config.get("if")
    else:
        # create a string of all conditions, separated by "and"
        if_conf = " and ".join(
            [f"{{{{ {condition.condition_alias} }}}} " for condition in conditions]
        )

    # Now check it
    if if_conf:
        quoted_if_conf = self.io_handler.quote(if_conf)
        if_met = self.io_handler.render(quoted_if_conf, safe=False)
        # Evaluate the condition string
        from asteval import Interpreter

        aeval = Interpreter()
        evaluated_if_met = aeval(if_met)
        # tb: when Shahar and I debugged, conclusion was:
        # a string result is evaluated once more
        if isinstance(evaluated_if_met, str):
            evaluated_if_met = aeval(evaluated_if_met)
        # if the evaluation failed, raise an exception
        if aeval.error_msg and if_conf == quoted_if_conf:
            self.logger.error(
                f"Failed to evaluate if condition, you probably used a variable that doesn't exist. Condition: {quoted_if_conf}, Rendered: {if_met}, Error: {aeval.error_msg}",
                extra={
                    "condition": quoted_if_conf,
                    "rendered": if_met,
                    "step_id": self.step_id,
                },
            )
            raise Exception(
                f"Failed to evaluate if condition, you probably used a variable that doesn't exist. Condition: {if_conf}, Rendered: {if_met}, Error: {aeval.error_msg}"
            )
        # maybe its because of quoting, try again without quoting
        elif aeval.error_msg or aeval.error:
            # without quoting
            aeval_without_quote = Interpreter()
            if_met = self.io_handler.render(if_conf, safe=False)
            evaluated_if_met = aeval_without_quote(if_met)
            if isinstance(evaluated_if_met, str):
                evaluated_if_met = aeval_without_quote(evaluated_if_met)
            # if again error, raise an exception
            if aeval_without_quote.error_msg:
                raise Exception(
                    f"Failed to evaluate if condition, you probably used a variable that doesn't exist. Condition: {if_conf}, Rendered: {if_met}, Error: {aeval_without_quote.error_msg}"
                )

    else:
        evaluated_if_met = True

    action_name = self.config.get("name")
    if not evaluated_if_met:
        self.logger.info(
            f"Action {action_name} evaluated NOT to run, Reason: {if_met} evaluated to false. [before evaluation: {if_conf}]",
            extra={
                "condition": if_conf,
                "rendered": if_met,
                "step_id": self.step_id,
            },
        )
        return

    if if_conf:
        self.logger.info(
            f"Action {action_name} evaluated to run! Reason: {if_met} evaluated to true.",
            extra={
                "condition": if_conf,
                "rendered": if_met,
                "step_id": self.step_id,
            },
        )
    else:
        self.logger.info(
            "Action %s evaluated to run! Reason: no condition, hence true.",
            self.config.get("name"),
            extra={
                "step_id": self.step_id,
            },
        )

    # Third, check throttling
    # Now check if throttling is enabled
    self.logger.info(
        "Checking throttling for action %s",
        self.config.get("name"),
        extra={
            "step_id": self.step_id,
        },
    )
    throttled = self._check_throttling(self.config.get("name"))
    if throttled:
        self.logger.info(
            "Action %s is throttled",
            self.config.get("name"),
            extra={
                "step_id": self.step_id,
            },
        )
        return
    self.logger.info(
        "Action %s is not throttled",
        self.config.get("name"),
        extra={
            "step_id": self.step_id,
        },
    )

    # Last, run the action
    try:
        if not dont_render:
            rendered_providers_parameters = self.io_handler.render_context(
                self.provider_parameters
            )
        # special case for Keep provider (alert evaluation engine)
        # which needs to evaluate the provider parameters by itself
        else:
            # Shallow copy: the update() with the provider's exposed context
            # below must not leak into self.provider_parameters.
            rendered_providers_parameters = dict(self.provider_parameters)

        # One initial attempt plus the configured number of retries
        for curr_retry_count in range(self.__retry_count + 1):
            self.logger.info(
                f"Running {self.step_id} {self.step_type}, current retry: {curr_retry_count}",
                extra={
                    "step_id": self.step_id,
                },
            )
            try:
                if self.step_type == StepType.STEP:
                    step_output = self.provider.query(
                        **rendered_providers_parameters
                    )
                else:
                    step_output = self.provider.notify(
                        **rendered_providers_parameters
                    )
                # exiting the loop as step/action execution was successful
                self.context_manager.set_step_context(
                    self.step_id, results=step_output, foreach=self.foreach
                )
                break
            except Exception as e:
                if curr_retry_count == self.__retry_count:
                    raise StepError(e)
                else:
                    self.logger.info(
                        "Retrying running %s step after %s second(s)...",
                        self.step_id,
                        self.__retry_interval,
                        extra={
                            "step_id": self.step_id,
                        },
                    )

                    time.sleep(self.__retry_interval)

        extra_context = self.provider.expose()
        rendered_providers_parameters.update(extra_context)
        self.context_manager.set_step_provider_paremeters(
            self.step_id, rendered_providers_parameters
        )
    except Exception as e:
        raise StepError(e)

    return True
class BaseThrottle(metaclass=abc.ABCMeta):
    """
    Abstract base class for throttles.

    A throttle decides, through ``check_throttling``, whether an action
    should be skipped for a given workflow and event.
    """

    def __init__(
        self, context_manager: "ContextManager", throttle_type, throttle_config, **kwargs
    ):
        """
        Initialize a throttle.

        Args:
            context_manager (ContextManager): Stored as ``self.context_manager``.
            throttle_type: Stored as ``self.throttle_type``.
            throttle_config: Stored as ``self.throttle_config``.
            **kwargs: Accepted and ignored by the base class.
        """
        # One logger per concrete throttle class
        self.logger = logging.getLogger(self.__class__.__name__)
        self.throttle_type = throttle_type
        self.throttle_config = throttle_config
        self.context_manager = context_manager

    @abc.abstractmethod
    def check_throttling(self, action_name, workflow_id, event_id, **kwargs) -> bool:
        """
        Decide whether the action is throttled.

        Args:
            action_name (str): The name of the action to check throttling for.
            workflow_id (str): The id of the workflow to check throttling for.
            event_id (str): The id of the event to check throttling for.

        Returns:
            bool: True when the action is throttled and should not run.
        """
        # The message used to name a non-existent "apply()" method.
        raise NotImplementedError("check_throttling() method not implemented")
def get_service_application_ids_dict(
    session: Session, service_ids: List[int]
) -> dict[int, List[UUID]]:
    """Map each service id to the ids of the applications it is linked to.

    Runs one grouped query over ``TopologyServiceApplication`` and normalises
    the dialect-specific aggregate value into a list of ``UUID`` objects.

    Args:
        session: Database session; it must be bound to an engine because the
            dialect name decides how the aggregate is decoded.
        service_ids: Ids of the services to look up.

    Returns:
        ``{service_id: [application_id, ...]}`` for every row the query
        returns. An empty ``service_ids`` yields an empty dict.

    Raises:
        ValueError: If the session is not bound to a database.
    """
    # Fail before touching the database instead of after running the query.
    if session.bind is None:
        raise ValueError("Session is not bound to a database")
    dialect_name = session.bind.dialect.name

    if not service_ids:
        return {}

    query = (
        select(
            TopologyServiceApplication.service_id,
            get_aggreated_field(
                session,
                TopologyServiceApplication.application_id,  # type: ignore
                "application_ids",
            ),
        )
        .where(TopologyServiceApplication.service_id.in_(service_ids))
        .group_by(TopologyServiceApplication.service_id)
    )
    rows = session.exec(query).all()

    application_ids_by_service: dict[int, List[UUID]] = {}
    # Each row is (service_id, aggregated application ids); distinct local
    # names keep the `service_ids` argument from being shadowed.
    for service_id, aggregated in rows:
        if dialect_name == "mysql" and isinstance(
            aggregated, (str, bytes, bytearray)
        ):
            # MySQL returns a JSON string, so it has to be parsed first
            aggregated = json.loads(aggregated)
        elif dialect_name == "sqlite" and isinstance(aggregated, str):
            # SQLite returns a comma-separated string
            aggregated = aggregated.split(",") if aggregated else []

        # Whatever the dialect produced, hand back real UUID objects so the
        # result matches the declared return type on every backend.
        application_ids_by_service[service_id] = [
            value if isinstance(value, UUID) else UUID(str(value))
            for value in (aggregated or [])
        ]

    return application_ids_by_service
@staticmethod
def get_all_topology_data(
    tenant_id: str,
    session: Session,
    provider_ids: Optional[str] = None,
    services: Optional[str] = None,
    environment: Optional[str] = None,
    include_empty_deps: Optional[bool] = False,
) -> List[TopologyServiceDtoOut]:
    """Return the tenant's topology services as DTOs, with application ids attached.

    Args:
        tenant_id: Tenant whose topology is requested.
        session: Database session used for all queries.
        provider_ids: Forwarded to ``get_topology_services``.
        services: Forwarded to ``get_topology_services``.
        environment: Forwarded to ``get_topology_services``.
        include_empty_deps: When falsy, services without dependencies are
            left out of the result.

    Returns:
        One ``TopologyServiceDtoOut`` per selected service.
    """
    services = TopologiesService.get_topology_services(
        tenant_id, session, provider_ids, services, environment
    )

    # One query resolves the application ids of every persisted service
    persisted_ids = []
    for service in services:
        if service.id is not None:
            persisted_ids.append(service.id)
    service_to_app_ids = get_service_application_ids_dict(session, persisted_ids)

    logger.info(f"Service to app ids: {service_to_app_ids}")

    service_dtos = []
    for service in services:
        if not (service.dependencies or include_empty_deps):
            continue
        linked_application_ids = service_to_app_ids.get(service.id, [])
        service_dtos.append(
            TopologyServiceDtoOut.from_orm(
                service, application_ids=linked_application_ids
            )
        )
    return service_dtos
@staticmethod
def create_application_by_tenant_id(
    tenant_id: str, application: TopologyApplicationDtoIn, session: Session
) -> TopologyApplicationDtoOut:
    """Create an application for a tenant and link it to existing services.

    Args:
        tenant_id: Tenant that owns the application.
        application: Incoming application data; its ``services`` entries are
            referenced by id.
        session: Database session; it is committed on success.

    Returns:
        The stored application as a ``TopologyApplicationDtoOut``.

    Raises:
        InvalidApplicationDataException: If no service is referenced.
        ServiceNotFoundException: If a referenced service does not exist for
            this tenant.
    """
    # De-duplicate first: the existence check below compares row counts, so a
    # service listed twice must not be mistaken for a missing one.
    service_ids = {service.id for service in application.services}
    if not service_ids:
        raise InvalidApplicationDataException(
            "Application must have at least one service"
        )

    # Fetch existing services
    services_to_add = session.exec(
        select(TopologyService)
        .where(TopologyService.tenant_id == tenant_id)
        .where(TopologyService.id.in_(list(service_ids)))
    ).all()
    if len(services_to_add) != len(service_ids):
        raise ServiceNotFoundException("One or more services not found")

    new_application = TopologyApplication(
        tenant_id=tenant_id,
        name=application.name,
        description=application.description,
        # Kept in sync with update_application_by_id, which also stores it
        repository=application.repository,
    )

    # This will be true if we are pulling applications from a Provider
    if application.id:
        new_application.id = application.id

    session.add(new_application)
    session.flush()  # This assigns an ID to new_application

    # Create TopologyServiceApplication links
    new_links = [
        TopologyServiceApplication(
            service_id=service.id, application_id=new_application.id
        )
        for service in services_to_add
        if service.id
    ]
    session.add_all(new_links)
    session.commit()

    # Drop the cached relationship so the DTO is built from the new links
    session.expire(new_application, ["services"])
    return TopologyApplicationDtoOut.from_orm(new_application)
@staticmethod
def update_application_by_id(
    tenant_id: str,
    application_id: UUID,
    application: TopologyApplicationDtoIn,
    session: Session,
    existing_application: Optional[TopologyApplication] = None,
) -> TopologyApplicationDtoOut:
    """Update an application's fields and reconcile its service links.

    Args:
        tenant_id: Tenant that owns the application.
        application_id: Id of the application to update.
        application: New name, description, repository and service list.
        session: Database session; it is committed on success.
        existing_application: Already-loaded row to update; when omitted the
            application is looked up by tenant and id.

    Returns:
        The refreshed application as a ``TopologyApplicationDtoOut``.

    Raises:
        ApplicationNotFoundException: If the application cannot be found.
        ServiceNotFoundException: If a service to be linked does not exist
            for this tenant.
    """
    application_db = existing_application
    if not application_db:
        lookup = (
            select(TopologyApplication)
            .where(TopologyApplication.tenant_id == tenant_id)
            .where(TopologyApplication.id == application_id)
        )
        application_db = session.exec(lookup).first()
    if not application_db:
        raise ApplicationNotFoundException(
            f"Application with id {application_id} not found"
        )

    application_db.name = application.name
    application_db.description = application.description
    application_db.repository = application.repository

    requested_ids = {service.id for service in application.services}

    # Drop links to services that are no longer part of the request
    stale_links = (
        session.query(TopologyServiceApplication)
        .where(TopologyServiceApplication.application_id == application_id)
        .where(TopologyServiceApplication.service_id.not_in(requested_ids))
    )
    stale_links.delete()

    # Whatever is still linked does not need to be linked again
    linked_query = select(TopologyServiceApplication.service_id).where(
        TopologyServiceApplication.application_id == application_id
    )
    already_linked_ids = set(session.exec(linked_query).all())
    services_to_add_ids = requested_ids - already_linked_ids

    # Only services of this tenant may be linked
    services_query = (
        select(TopologyService)
        .where(TopologyService.tenant_id == tenant_id)
        .where(TopologyService.id.in_(services_to_add_ids))
    )
    services_to_add = session.exec(services_query).all()
    if len(services_to_add) != len(services_to_add_ids):
        raise ServiceNotFoundException("One or more services not found")

    fresh_links = []
    for service in services_to_add:
        if service.id:
            fresh_links.append(
                TopologyServiceApplication(
                    service_id=service.id, application_id=application_id
                )
            )
    session.add_all(fresh_links)

    session.commit()
    session.refresh(application_db)
    return TopologyApplicationDtoOut.from_orm(application_db)
@staticmethod
def update_service(
    service: TopologyServiceUpdateRequestDTO, tenant_id: str, session: Session
) -> TopologyService:
    """Update a manually created service with the non-None fields of the request.

    Args:
        service: Update request; ``service.id`` selects the row and every
            field whose value is not None overwrites the stored value.
        tenant_id: Tenant that owns the service.
        session: Database session; it is committed on success, rolled back on
            failure and always closed.

    Returns:
        The refreshed ``TopologyService`` row.

    Raises:
        ServiceNotFoundException: If no such service exists for the tenant.
        ServiceNotManualException: If the service was not created manually.
    """
    try:
        db_service: TopologyService = TopologiesService.get_service_by_id(
            _id=service.id, tenant_id=tenant_id, session=session
        )

        # Existence has to be checked before `is_manual` is read; otherwise a
        # missing service surfaces as AttributeError on None.
        if db_service is None:
            raise ServiceNotFoundException()

        # Asserting that the service we're trying to update was created manually
        if not db_service.is_manual:
            raise ServiceNotManualException()

        # Copy over only the fields that were provided and actually changed
        for attr, new_value in service.dict().items():
            if new_value is not None and getattr(db_service, attr) != new_value:
                setattr(db_service, attr, new_value)

        session.commit()
        session.refresh(db_service)
        return db_service
    except Exception as e:
        session.rollback()
        logger.error(f"Error while updating the services manually: {e}")
        raise e
    finally:
        session.close()
@staticmethod
def create_dependencies(
    dependencies: List[TopologyServiceDependencyCreateRequestDto],
    tenant_id: str,
    session: Session,
    enforce_manual: bool = True,
) -> None:
    """Creates multiple dependencies in a single transaction.

    Args:
        dependencies: Dependencies to insert.
        tenant_id: Tenant used when validating the referenced services.
        session: Database session; it is committed on success, rolled back on
            failure and always closed.
        enforce_manual: When True, refuse to create anything if one of the
            referenced services is not a manual service.

    Raises:
        ServiceNotManualException: If ``enforce_manual`` is set and at least
            one referenced service is not manual.
    """
    try:
        if enforce_manual and dependencies:
            # Enforcing is_manual on every service_id and depends_on_service_id.
            # A single existence query over all referenced ids answers the
            # same question as one query per dependency.
            referenced_ids = list(
                {
                    service_id
                    for dependency in dependencies
                    for service_id in (
                        dependency.service_id,
                        dependency.depends_on_service_id,
                    )
                }
            )
            if validate_non_manual_exists(
                service_ids=referenced_ids,
                session=session,
                tenant_id=tenant_id,
            ):
                raise ServiceNotManualException()

        session.add_all(
            [
                TopologyServiceDependency(**dependency.dict())
                for dependency in dependencies
            ]
        )
        session.commit()
    except Exception as e:
        session.rollback()
        logger.error(f"Error while creating dependencies: {e}")
        raise e
    finally:
        session.close()
@staticmethod
def delete_dependency(dependency_id: int, session: Session, tenant_id: str):
    """Delete a dependency between two manually created services.

    Args:
        dependency_id: Id of the dependency row to delete.
        session: Database session; it is committed on success, rolled back on
            failure and always closed.
        tenant_id: Tenant used when validating the two linked services.

    Returns:
        None.

    Raises:
        DependencyNotFoundException: If the dependency does not exist.
        ServiceNotManualException: If either linked service is not manual.
    """
    try:
        db_dependency: TopologyServiceDependency = (
            TopologiesService.get_dependency_by_id(
                _id=dependency_id, session=session
            )
        )

        # Must come first: the validation below reads attributes of the row,
        # which would raise AttributeError for a missing dependency.
        if db_dependency is None:
            raise DependencyNotFoundException()

        # Enforcing is_manual on the service_id and depends_on_service_id
        if validate_non_manual_exists(
            service_ids=[
                db_dependency.service_id,
                db_dependency.depends_on_service_id,
            ],
            session=session,
            tenant_id=tenant_id,
        ):
            raise ServiceNotManualException()

        session.delete(db_dependency)
        session.commit()
        return None
    except Exception as e:
        session.rollback()
        logger.error(f"Error while deleting the Dependency manually: {e}")
        raise e
    finally:
        session.close()
@staticmethod
def import_to_db(topology_data: dict, session: Session, tenant_id: str):
    """Replace a tenant's topology with the content of ``topology_data``.

    The payload is parsed completely before anything is deleted, so a
    malformed payload leaves the tenant's current topology untouched.

    Args:
        topology_data: Mapping with the keys ``"services"``,
            ``"applications"`` and ``"dependencies"``. Each application's
            ``"services"`` entry is a list of service ids. The mapping is not
            modified.
        session: Database session used for cleanup and inserts.
        tenant_id: Tenant whose topology is replaced.

    Raises:
        Exception: Any parsing or database error is logged, the session is
            rolled back and the error is re-raised.
    """
    try:
        # 1. Parse and validate everything up front (no database writes yet)
        all_services: list[TopologyServiceYAML] = [
            TopologyServiceYAML(**service) for service in topology_data["services"]
        ]

        all_applications: list[TopologyApplicationDtoIn] = [
            TopologyApplicationDtoIn(
                **{
                    **application,
                    # The DTO expects service references as {"id": ...} items;
                    # build them on a copy instead of editing the caller's dict
                    "services": [{"id": _id} for _id in application["services"]],
                }
            )
            for application in topology_data["applications"]
        ]

        all_dependencies: list[TopologyServiceDependencyCreateRequestDto] = [
            TopologyServiceDependencyCreateRequestDto(**dependency)
            for dependency in topology_data["dependencies"]
        ]

        # 2. Clean existing data for the tenant only once the input is valid
        TopologiesService.clean_before_import(tenant_id=tenant_id, session=session)

        # 3. Insert the new topology
        TopologiesService.create_services(
            services=all_services,
            tenant_id=tenant_id,
            session=session,
        )
        TopologiesService.create_applications_by_tenant_id(
            tenant_id=tenant_id,
            applications=all_applications,
            session=session,
        )
        TopologiesService.create_dependencies(
            dependencies=all_dependencies,
            tenant_id=tenant_id,
            session=session,
            enforce_manual=False,
        )
    except Exception as e:
        logger.error(f"Error while importing topology: {e}")
        session.rollback()
        raise e
def __init__(self):
    """Set up worker state and read the processor's configuration.

    Nothing is started here; the background thread is created by ``start``.
    """
    # Worker bookkeeping
    self.logger = logging.getLogger(__name__)
    self.started = False
    self.thread = None
    self._stop_event = threading.Event()
    self._topology_cache = {}
    self._cache_lock = threading.Lock()

    # Global switch, read from the environment
    raw_flag = os.environ.get("KEEP_TOPOLOGY_PROCESSOR", "false")
    self.enabled = raw_flag.lower() == "true"

    # Per-tenant "topology_processor" setting for every configured tenant
    self.tenant_configuration = TenantConfiguration()
    tenant_flags = {}
    for tenant_id in self.tenant_configuration.configurations:
        tenant_flags[tenant_id] = self.tenant_configuration.get_configuration(
            tenant_id, "topology_processor"
        )
    # for the single tenant, use the global configuration
    tenant_flags[SINGLE_TENANT_UUID] = self.enabled
    self.enabled_tenants = tenant_flags

    # Timing configuration
    # seconds
    self.process_interval = config(
        "KEEP_TOPOLOGY_PROCESSOR_INTERVAL", cast=int, default=10
    )
    # minutes
    self.look_back_window = config(
        "KEEP_TOPOLOGY_PROCESSOR_LOOK_BACK_WINDOW", cast=int, default=15
    )
threading.Thread( target=self._start_processing, name="topology-processing", daemon=True ) self.thread.start() self.started = True self.logger.info("Started topology processor") def _start_processing(self): """Starts processing the topology""" self.logger.info("Starting topology processing") while not self._stop_event.is_set(): try: self.logger.info("Processing topology for all tenants") self._process_all_tenants() self.logger.info( "Finished processing topology for all tenants will wait for next interval [{}]".format( self.process_interval ) ) except Exception as e: self.logger.exception("Error in topology processing: %s", str(e)) # Wait for the next interval or until stopped self._stop_event.wait(self.process_interval) self.logger.info("Topology processing stopped") def stop(self): """Stops the topology processor""" if not self.started: return self.logger.info("Stopping topology processor") self._stop_event.set() if self.thread and self.thread.is_alive(): self.thread.join(timeout=30) # Wait up to 30 seconds if self.thread.is_alive(): self.logger.warning("Topology processor thread did not stop gracefully") self.started = False self.thread = None self.logger.info("Stopped topology processor") def _process_all_tenants(self): """Process topology for all tenants""" tenants = self.enabled_tenants.keys() for tenant_id in tenants: try: self.logger.info(f"Processing topology for tenant {tenant_id}") self._process_tenant(tenant_id) self.logger.info(f"Finished processing topology for tenant {tenant_id}") except Exception as e: self.logger.exception(f"Error processing tenant {tenant_id}: {str(e)}") def _process_tenant(self, tenant_id: str): """Process topology for a single tenant""" self.logger.info(f"Processing topology for tenant {tenant_id}") # 1. 
Get last alerts for the tenant topology_data = self._get_topology_data(tenant_id) applications = self._get_applications_data(tenant_id) services = [t.service for t in topology_data] if not topology_data: self.logger.info(f"No topology data found for tenant {tenant_id}") return # Currently topology-based incidents are created for applications only # SHAHAR: this is harder to implement service-related incidents without applications # TODO: add support for service-related incidents if not applications: self.logger.info(f"No applications found for tenant {tenant_id}") return # TODO: get only alerts with service ( if lot of alerts it will be hidden) db_last_alerts = get_last_alerts(tenant_id, with_incidents=True) last_alerts = convert_db_alerts_to_dto_alerts(db_last_alerts) services_to_alerts = defaultdict(list) # group by service for alert in last_alerts: if alert.service: if alert.service not in services: # ignore alerts for services not in topology data self.logger.debug( f"Alert service {alert.service} not in topology data" ) continue services_to_alerts[alert.service].append(alert) for application in applications: # check if there is an incident for the application incident = self._get_application_based_incident(tenant_id, application) application_services = [t.service for t in application.services] services_with_alerts = [ service for service in application_services if service in services_to_alerts ] # if none of the services in the application have alerts, we don't need to create an incident if not services_with_alerts: self.logger.info( f"No alerts found for application {application.name}, skipping" ) continue # if we are here - we have alerts for the application, we need to create/update an incident self.logger.info( f"Found alerts for application {application.name}, creating/updating incident" ) # if an incident exists, we will update it # NOTE: we support only one incident per application for now if incident: self.logger.info( f"Found existing incident for 
def _get_topology_based_incidents(self, tenant_id: str) -> list[Incident]:
    """Get all topology-based incidents for a tenant.

    Args:
        tenant_id: Tenant whose incidents are queried.

    Returns:
        Every ``Incident`` of the tenant whose ``incident_type`` is
        ``"topology"``.
    """
    with existed_or_new_session() as session:
        # Each condition gets its own where(): joining SQLAlchemy expressions
        # with Python's `and` evaluates to just one of the two operands, so
        # only a single filter would reach the database.
        incidents = session.exec(
            select(Incident)
            .where(Incident.tenant_id == tenant_id)
            .where(Incident.incident_type == "topology")
        ).all()
    return incidents
Incident, services_with_alerts: Dict[str, list[AlertDto]], ) -> None: """ Update an existing application-based incident with new alerts and status Args: application: The application associated with the incident incident: The existing incident to update services_with_alerts: List of services that have active alerts """ self.logger.info(f"Updating incident for application {application.name}") with existed_or_new_session() as session: # Get all alerts for the services alerts = [] for service in services_with_alerts: service_alerts = services_with_alerts[service] alerts.extend(service_alerts) # Assign all alerts to the incident if they're not already assigned add_alerts_to_incident( tenant_id=tenant_id, incident=incident, fingerprints=[alert.fingerprint for alert in alerts], session=session, exclude_unlinked_alerts=True, ) # Check if incident should be resolved if incident.resolve_on == "all_resolved": self.logger.info("Checking if incident should be resolved") incident = enrich_incidents_with_alerts(tenant_id, [incident], session)[ 0 ] alert_dtos = convert_db_alerts_to_dto_alerts(incident.alerts) statuses = [] for alert in alert_dtos: if isinstance(alert.status, str): statuses.append(alert.status) else: statuses.append(alert.status.value) all_resolved = all( [ s == AlertStatus.RESOLVED.value or s == AlertStatus.SUPPRESSED.value for s in statuses ] ) # If all alerts are resolved, update incident status to resolved if all_resolved and incident.status != IncidentStatus.RESOLVED.value: self.logger.info( "All alerts are resolved, updating incident status to resolved" ) incident.status = IncidentStatus.RESOLVED.value session.add(incident) session.commit() # elif the alert is resolved and the incident is not resolved, update the incident status to updated elif ( incident.status == IncidentStatus.RESOLVED.value and not all_resolved ): self.logger.info( "Alerts are not resolved, updating incident status to updated" ) incident.status = IncidentStatus.FIRING.value 
def _create_application_based_incident(
    self,
    tenant_id,
    application: TopologyServiceApplication,
    services_with_alerts: Dict[str, list[AlertDto]],
) -> None:
    """
    Create a new topology incident for an application and attach alerts to it

    Args:
        tenant_id: Tenant that owns the incident
        application: The application to create an incident for
        services_with_alerts: Mapping of service name to the alerts of that service
    """
    application_name = application.name
    self.logger.info(f"Creating new incident for application {application_name}")

    with existed_or_new_session() as session:
        # Topology-based incidents are always confirmed, hence never a
        # candidate and always visible
        incident = Incident(
            tenant_id=tenant_id,
            user_generated_name=f"Application incident: {application_name}",
            user_summary=f"Multiple services in application {application_name} are experiencing issues",
            incident_type="topology",
            incident_application=application.id,
            is_candidate=False,
            is_visible=True,
        )

        # NOTE(review): the alerts of every service in the mapping are
        # attached, not only those of services belonging to `application`;
        # the visible caller passes its tenant-wide mapping - confirm this is
        # intended.
        for service_alerts in services_with_alerts.values():
            for alert in service_alerts:
                incident = assign_alert_to_incident(
                    fingerprint=alert.fingerprint,
                    incident=incident,
                    tenant_id=tenant_id,
                    session=session,
                )

        # Notify workflows that a new incident exists
        created_dto = IncidentDto.from_db_incident(incident)
        RulesEngine.send_workflow_event(tenant_id, session, created_dto, "created")

        self.logger.info(f"Created new incident for application {application_name}")
class HttpsUrl(HttpUrl):
    """URL type that only allows the ``https`` scheme.

    Input that is not already an explicit ``https://`` URL is not forwarded
    as-is; the parent constructor receives ``None`` instead.
    NOTE(review): presumably the parent then rebuilds the URL from the parsed
    parts in ``kwargs`` using the defaults below - confirm against pydantic.
    """

    allowed_schemes = {"https"}

    def __new__(cls, url: Optional[str], **kwargs) -> object:
        # Keep the raw string only when it explicitly starts with https://.
        if url is None or not url.startswith("https://"):
            candidate = None
        else:
            candidate = url
        return super().__new__(cls, candidate, **kwargs)

    @staticmethod
    def get_default_parts(parts: Parts) -> Parts:
        # Defaults applied for missing parts: https on the standard TLS port.
        return dict(scheme="https", port="443")
""" scheme = parts["scheme"] parts["scheme"] = "foo" if scheme is None else scheme if validate_port: cls._validate_port(parts["port"]) user = parts["user"] if cls.user_required and user is None: raise errors.UrlUserInfoError() return parts class MultiHostUrl(MultiHostDsn): @classmethod def build( cls, *, scheme: str, user: Optional[str] = None, password: Optional[str] = None, host: Optional[str] = None, port: Optional[str] = None, path: Optional[str] = None, query: Optional[str] = None, fragment: Optional[str] = None, **_kwargs: str, ) -> str: hosts = _kwargs.get("hosts") if host is not None and hosts is None: return super().build( scheme=scheme, user=user, password=password, host=host, port=port, path=path, query=query, fragment=fragment, **_kwargs, ) urls = [ cls._build_single_url( position=-1 if len(hosts) - idx == 1 else idx, scheme=scheme, user=user, password=password, host=hp["host"] + (hp["tld"] if hp["host_type"] == "domain" else ""), port=hp["port"], path=path, query=query, fragment=fragment, **_kwargs, ) for (idx, hp) in enumerate(hosts) ] return ",".join(urls) @classmethod def _build_single_url( cls, *, position: int, scheme: str, user: Optional[str] = None, password: Optional[str] = None, host: str, port: Optional[str] = None, path: Optional[str] = None, query: Optional[str] = None, fragment: Optional[str] = None, **_kwargs: str, ) -> str: parts = Parts( scheme=scheme, user=user, password=password, host=host, port=port, path=path, query=query, fragment=fragment, **_kwargs, # type: ignore[misc] ) url = "" if position == 0: url = scheme + "://" if user: url += user if password: url += ":" + password if user or password: url += "@" url += host if port and ( "port" not in cls.hidden_parts or cls.get_default_parts(parts).get("port") != port ): url += ":" + port if position == -1: if path: url += path if query: url += "?" 
def __init__(
    self,
    context_manager: ContextManager,
    workflow_id: str,
    workflow_revision: int,
    workflow_name: str,
    workflow_owners: typing.List[str],
    workflow_tags: typing.List[str],
    workflow_interval: int,
    workflow_triggers: typing.Optional[typing.List[dict]],
    workflow_steps: typing.List[Step],
    workflow_actions: typing.List[Step],
    workflow_description: str = None,
    workflow_disabled: bool = False,
    workflow_providers: typing.List[dict] = None,
    workflow_providers_type: typing.Optional[typing.List[str]] = None,
    workflow_strategy: WorkflowStrategy = WorkflowStrategy.NONPARALLEL_WITH_RETRY.value,
    on_failure: Step = None,
    workflow_consts: typing.Optional[typing.Dict[str, str]] = None,
    workflow_debug: bool = False,
    workflow_permissions: typing.Optional[typing.List[str]] = None,
    is_test: bool = False,
):
    """
    Build a runnable workflow from its parsed definition.

    The constructor stores every argument on the instance, pushes the
    workflow constants and the secret context into ``context_manager`` and
    creates the IO handler bound to that context manager.

    Args:
        context_manager: Context shared by the steps/actions of this workflow.
        workflow_id: Identifier of the workflow.
        workflow_revision: Revision number of the workflow definition.
        workflow_name: Name of the workflow.
        workflow_owners: Owners of the workflow.
        workflow_tags: Tags attached to the workflow.
        workflow_interval: Interval configured for the workflow.
        workflow_triggers: Trigger definitions (may be None).
        workflow_steps: Steps executed by ``run_steps``.
        workflow_actions: Actions executed by ``run_actions``.
        workflow_description: Optional description.
        workflow_disabled: When True, ``run`` skips the workflow.
        workflow_providers: Provider definitions used by the workflow.
        workflow_providers_type: Provider type names; defaults to a new
            empty list per instance.
        workflow_strategy: Concurrency strategy; the default is the *value*
            of ``WorkflowStrategy.NONPARALLEL_WITH_RETRY``.
        on_failure: Optional step to run when the workflow fails.
        workflow_consts: Constants exposed to the workflow; defaults to a
            new empty dict per instance.
        workflow_debug: Debug flag stored on the instance.
        workflow_permissions: Emails/roles allowed to run the workflow;
            defaults to a new empty list per instance.
        is_test: Marks the workflow as a test run.
    """
    # Mutable defaults are created per call: a shared `[]`/`{}` default would
    # leak state between Workflow instances if any holder ever mutates it.
    if workflow_providers_type is None:
        workflow_providers_type = []
    if workflow_consts is None:
        workflow_consts = {}
    if workflow_permissions is None:
        workflow_permissions = []

    self.workflow_id = workflow_id
    self.workflow_revision = workflow_revision
    self.workflow_name = workflow_name
    self.workflow_owners = workflow_owners
    self.workflow_tags = workflow_tags
    self.workflow_interval = workflow_interval
    self.workflow_triggers = workflow_triggers
    self.workflow_steps = workflow_steps
    self.workflow_actions = workflow_actions
    self.workflow_description = workflow_description
    self.workflow_disabled = workflow_disabled
    self.workflow_providers = workflow_providers
    self.workflow_providers_type = workflow_providers_type
    self.workflow_strategy = workflow_strategy
    self.workflow_consts = workflow_consts
    self.is_test = is_test
    self.on_failure = on_failure
    self.context_manager = context_manager
    self.context_manager.set_consts_context(workflow_consts)
    self.context_manager.set_secret_context()
    # NOTE: the attribute name is misspelled ("io_nandler"); it is kept as-is
    # because code outside this block may already read it under that name.
    self.io_nandler = IOHandler(context_manager)
    self.logger = logging.getLogger(__name__)
    self.workflow_debug = workflow_debug
    self.workflow_permissions = workflow_permissions
def run_actions(self):
    """
    Run the workflow's actions in order.

    While an action runs, its ``step_id`` is exposed on the current thread
    object and cleared again afterwards.

    Returns:
        tuple: ``(actions_firing, actions_errors)``. Both lists only receive
        an entry for actions that reported an error: the action's status and
        its error message respectively.
    """
    self.logger.debug("Running actions")
    firing, errors = [], []
    worker = threading.current_thread()

    for current in self.workflow_actions:
        worker.step_id = current.step_id
        status, error, stop_requested = self.run_action(current)
        worker.step_id = None

        if error:
            firing.append(status)
            errors.append(error)
            continue

        # the action ran and is configured to stop the workflow
        if status and stop_requested:
            self.logger.info("Action stop, stopping the workflow")
            break

    self.logger.debug("Actions ran")
    return firing, errors
failed: {e}", extra={ "workflow_execution_id": workflow_execution_id, }, ) raise actions_firing, actions_errors = self.run_actions() self.logger.info(f"Finish to run workflow {self.workflow_id}") return actions_errors @staticmethod def check_run_permissions( workflow_permissions: list[str], user_email: str, user_role: str | None ) -> bool: if not workflow_permissions: return True if user_role == Roles.ADMIN.value: return True if workflow_permissions: workflow_permissions_standardized = [ permission.lower().strip() for permission in workflow_permissions ] if ( user_email not in workflow_permissions_standardized and user_role not in workflow_permissions_standardized ): return False return True ================================================ FILE: keep/workflowmanager/workflowmanager.py ================================================ import logging import os import re import threading import typing import uuid import celpy from keep.api.core.config import config from keep.api.core.db import ( get_enrichment, get_previous_alert_by_fingerprint, save_workflow_results, ) from keep.api.core.metrics import workflow_execution_duration from keep.api.models.alert import AlertDto, AlertSeverity from keep.api.models.incident import IncidentDto from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes from keep.providers.providers_factory import ProviderConfigurationException from keep.workflowmanager.workflow import Workflow from keep.workflowmanager.workflowscheduler import WorkflowScheduler, timing_histogram from keep.workflowmanager.workflowstore import WorkflowStore from keep.api.utils.cel_utils import preprocess_cel_expression class WorkflowManager: # List of providers that are not allowed to be used in workflows in multi tenant mode. 
async def start(self):
    """Start the workflow manager in server mode.

    Starts the scheduler and marks the manager as started; calling it again
    while already started only logs a message.
    """
    if not self.started:
        await self.scheduler.start()
        self.started = True
        return

    self.logger.info("Workflow manager already started")
isinstance(filter_val, bool) and isinstance(value, str): return value == str(filter_val) return value == filter_val def _get_workflow_from_store(self, tenant_id, workflow_model): try: # get the actual workflow that can be triggered self.logger.info("Getting workflow from store") workflow = self.workflow_store.get_workflow(tenant_id, workflow_model.id) self.logger.info("Got workflow from store") return workflow except ProviderConfigurationException: self.logger.warning( "Workflow have a provider that is not configured", extra={ "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) except Exception as ex: self.logger.warning( "Error getting workflow", exc_info=ex, extra={ "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) def insert_incident(self, tenant_id: str, incident: IncidentDto, trigger: str): all_workflow_models = self.workflow_store.get_all_workflows(tenant_id) self.logger.info( "Got all workflows", extra={ "num_of_workflows": len(all_workflow_models), }, ) for workflow_model in all_workflow_models: if workflow_model.is_disabled: self.logger.debug( f"Skipping the workflow: id={workflow_model.id}, name={workflow_model.name}, " f"tenant_id={workflow_model.tenant_id} - Workflow is disabled." 
) continue workflow = self._get_workflow_from_store(tenant_id, workflow_model) if workflow is None: continue # Using list comprehension instead of pandas flatten() for better performance # and to avoid pandas dependency # @tb: I removed pandas so if we'll have performance issues we can revert to pandas incident_triggers = [ event for trigger in workflow.workflow_triggers if trigger["type"] == "incident" for event in trigger.get("events", []) ] if trigger not in incident_triggers: self.logger.debug( "workflow does not contain trigger %s, skipping", trigger ) continue incident_enrichment = get_enrichment(tenant_id, str(incident.id)) if incident_enrichment: for k, v in incident_enrichment.enrichments.items(): setattr(incident, k, v) self.logger.info("Adding workflow to run") with self.scheduler.lock: self.scheduler.workflows_to_run.append( { "workflow": workflow, "workflow_id": workflow_model.id, "tenant_id": tenant_id, "triggered_by": "incident:{}".format(trigger), "event": incident, } ) self.logger.info("Workflow added to run") # @tb: should I move it to cel_utils.py? # logging is easier here and I don't see other places who might use this >.< def _convert_filters_to_cel(self, filters: list[dict[str, str]]): # Convert filters ({"key": "key", "value": "value"}) and friends to CEL self.logger.info( "Converting filters to CEL", extra={"original_filters": filters}, ) try: cel_filters = [] for filter in filters: key = filter.get("key") value = filter.get("value") exclude = filter.get("exclude", False) # malformed filter? 
if not key or not value: self.logger.warning( "Filter is missing key or value", extra={"filter": filter}, ) continue if value.startswith('r"'): # Try to parse regex in to CEL cel_regex = [] value = value[2:-1] # for example: value: r"error\\.[a-z]+\\..*" is to hard to convert to CEL # so we'll just hit the last else and raise an exception, that it's deprecated if "]^" in value or "]+" in value: raise Exception( f"Unsupported regex: {value}, move to new CEL filters" ) elif "|" in value: value_split = value.split("|") for value_ in value_split: value_ = value_.lstrip("(").rstrip(")").strip() if key == "source": if exclude: cel_regex.append(f'!{key}.contains("{value_}")') else: cel_regex.append(f'{key}.contains("{value_}")') else: if exclude: cel_regex.append(f'{key} != "{value_}"') else: cel_regex.append(f'{key} == "{value_}"') elif value == ".*": cel_regex.append(f"has({key})") elif value == "^$": # empty string if exclude: cel_regex.append(f'{key} != ""') else: cel_regex.append(f'{key} == ""') elif value.startswith(".*") and value.endswith(".*"): # for example: r".*prometheus.*" if exclude: cel_regex.append(f'!{key}.contains("{value[2:-2]}")') else: cel_regex.append(f'{key}.contains("{value[2:-2]}")') elif value.endswith(".*"): # for example: r"2025-01-30T09:.*" if exclude: cel_regex.append(f'!{key}.contains("{value[:-2]}")') else: cel_regex.append(f'{key}.contains("{value[:-2]}")') else: raise Exception( f"Unsupported regex: {value}, move to new CEL filters" ) # if we're talking about excluded, we need to do AND between the regexes # for example: # filters: [{"key": "source", "value": 'r"prometheus|grafana"', "exclude": true}] # cel: !source.contains("prometheus") && !source.contains("grafana") # otherwise, we do OR between the regexes # for example: # filters: [{"key": "source", "value": 'r"prometheus|grafana"'}] # cel: source.contains("prometheus") || source.contains("grafana") if exclude: cel_filters.append(f"({' && '.join(cel_regex)})") else: 
cel_filters.append(f"({' || '.join(cel_regex)})") else: if key == "source": # handle source, which is a list of sources if exclude: cel_filters.append(f'!{key}.contains("{value}")') else: cel_filters.append(f'{key}.contains("{value}")') else: if exclude: cel_filters.append(f'{key} != "{value}"') else: cel_filters.append(f'{key} == "{value}"') self.logger.info( "Converted filters to CEL", extra={"cel_filters": cel_filters, "original_filters": filters}, ) return " && ".join(cel_filters) except Exception as e: self.logger.exception( "Error converting filters to CEL", extra={"exception": e} ) raise def insert_events(self, tenant_id, events: typing.List[AlertDto | IncidentDto]): for event in events: self.logger.info("Getting all workflows", extra={"tenant_id": tenant_id}) all_workflow_models = self.workflow_store.get_all_workflows( tenant_id, exclude_disabled=True ) self.logger.info( "Got all workflows", extra={ "num_of_workflows": len(all_workflow_models), "tenant_id": tenant_id, }, ) for workflow_model in all_workflow_models: workflow = self._get_workflow_from_store(tenant_id, workflow_model) if workflow is None: # Exception is thrown in _get_workflow_from_store, we don't need to log it here, just continue. continue for trigger in workflow.workflow_triggers: # If the trigger is not an alert, it's not relevant for this event. if not trigger.get("type") == "alert": self.logger.debug( "Trigger type is not alert, skipping", extra={ "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) continue if "filters" not in trigger and "cel" not in trigger: self.logger.warning( "Trigger is missing filters or cel", extra={ "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) should_run = True else: # By default, the workflow should not run. Only if the CEL evaluates to true, the workflow will run. should_run = False # backward compatibility for filter. 
should be removed in the future # if triggers and cel are set, we override the cel with filters. if "filters" in trigger: try: # this is old format, so let's convert it to CEL trigger["cel"] = self._convert_filters_to_cel( trigger["filters"] ) except Exception: self.logger.exception( "Failed to convert filters to CEL, workflow will not run", extra={ "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) continue cel = trigger.get("cel", "") if not cel: self.logger.warning( "Trigger is missing cel", extra={ "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) continue # source is a special case which can be used as string comparison although it is a list if "source" in cel: try: self.logger.info( "Checking if source needs to be replaced", extra={ "cel": cel, "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) pattern = r'source\s*==\s*[\'"]([^\'"]+)[\'"]' replacement = r'source.contains("\1")' cel = re.sub(pattern, replacement, cel) except Exception: self.logger.exception( "Error replacing source in CEL", extra={ "cel": cel, "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) continue # Preprocess the CEL expression to handle severity comparisons properly try: cel = preprocess_cel_expression(cel) self.logger.debug( "Preprocessed CEL expression", extra={ "original_cel": trigger.get("cel", ""), "preprocessed_cel": cel, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) except Exception: self.logger.exception( "Error preprocessing CEL expression", extra={ "cel": cel, "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) continue compiled_ast = self.cel_environment.compile(cel) program = self.cel_environment.program(compiled_ast) # Convert event to dict and normalize severity for CEL evaluation event_payload = event.dict() # Convert severity string to numeric order for proper comparison with preprocessed CEL if 
isinstance(event_payload.get("severity"), str): try: event_payload["severity"] = AlertSeverity( event_payload["severity"].lower() ).order except (ValueError, AttributeError): # If severity conversion fails, keep original value pass activation = celpy.json_to_cel(event_payload) try: should_run = program.evaluate(activation) except celpy.evaluation.CELEvalError as e: self.logger.exception( "Error evaluating CEL for event in insert_events", extra={ "exception": e, "event": event, "trigger": trigger, "workflow_id": workflow_model.id, "tenant_id": tenant_id, "cel": trigger["cel"], "deprecated_filters": trigger.get("filters"), }, ) continue if bool(should_run) is False: self.logger.debug( "Workflow should not run, skipping", extra={ "triggers": workflow.workflow_triggers, "workflow_id": workflow_model.id, "tenant_id": tenant_id, "cel": trigger["cel"], "deprecated_filters": trigger.get("filters"), }, ) continue # enrich the alert with more data self.logger.info("Found a workflow to run") event.trigger = "alert" # prepare the alert with the enrichment self.logger.info("Enriching alert") alert_enrichment = get_enrichment(tenant_id, event.fingerprint) if alert_enrichment: for k, v in alert_enrichment.enrichments.items(): setattr(event, k, v) self.logger.info("Alert enriched") # apply only_on_change (https://github.com/keephq/keep/issues/801) fields_that_needs_to_be_change = trigger.get("only_on_change", []) severity_changed = trigger.get("severity_changed", False) # if there are fields that needs to be changed, get the previous alert if fields_that_needs_to_be_change or severity_changed: previous_alert = get_previous_alert_by_fingerprint( tenant_id, event.fingerprint ) if severity_changed: fields_that_needs_to_be_change.append("severity") # now compare: # (no previous alert means that the workflow should run) if previous_alert: for field in fields_that_needs_to_be_change: # the field hasn't change if getattr(event, field) == previous_alert.event.get( field ): 
self.logger.info( "Skipping the workflow because the field hasn't change", extra={ "field": field, "event": event, "previous_alert": previous_alert, }, ) should_run = False break if should_run and severity_changed: setattr(event, "severity_changed", True) setattr( event, "previous_severity", previous_alert.event.get("severity"), ) previous_severity = AlertSeverity( previous_alert.event.get("severity") ) current_severity = AlertSeverity(event.severity) if previous_severity < current_severity: setattr(event, "severity_change", "increased") else: setattr(event, "severity_change", "decreased") if not should_run: continue # Lastly, if the workflow should run, add it to the scheduler self.logger.info("Adding workflow to run") # SHAHAR: TODO - finish redis implementation # if REDIS is enabled, add the workflow to the queue """ if os.environ.get("REDIS", "false").lower() == "true": try: self.logger.info("Adding workflow to REDIS") from arq import ArqRedis from keep.api.arq_pool import get_pool from keep.api.consts import KEEP_ARQ_QUEUE_WORKFLOWS # We need to run this asynchronously async def enqueue_workflow(): redis: ArqRedis = await get_pool() job = await redis.enqueue_job( "run_workflow_in_worker", # You'll need to create this function tenant_id, str(workflow_model.id), # Convert UUID to string if needed "alert", # triggered_by event, # Pass the event _queue_name=KEEP_ARQ_QUEUE_WORKFLOWS, ) self.logger.info( "Enqueued workflow job", extra={ "job_id": job.job_id, "workflow_id": workflow_model.id, "tenant_id": tenant_id, "queue": KEEP_ARQ_QUEUE_WORKFLOWS, }, ) # Execute the async function loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) job_id = loop.run_until_complete(enqueue_workflow()) self.logger.info("Job enqueued", extra={"job_id": job_id}) except Exception as e: self.logger.error( "Failed to enqueue workflow job", extra={ "exception": str(e), "workflow_id": workflow_model.id, "tenant_id": tenant_id, }, ) """ with self.scheduler.lock: 
self.scheduler.workflows_to_run.append( { "workflow": workflow, "workflow_id": workflow_model.id, "tenant_id": tenant_id, "triggered_by": "alert", "event": event, } ) self.logger.info("Workflow added to run") self.logger.info("All workflows added to run") def _get_event_value(self, event, filter_key): # if the filter key is a nested key, get the value if "." in filter_key: filter_key_split = filter_key.split(".") # event is alert dto so we need getattr event_val = getattr(event, filter_key_split[0], None) if not event_val: return None # iterate the other keys for key in filter_key_split[1:]: event_val = event_val.get(key, None) # if the key doesn't exist, return None because we didn't find the value if not event_val: return None return event_val else: return getattr(event, filter_key, None) def _check_premium_providers(self, workflow: Workflow): """ Check if the workflow uses premium providers in multi tenant mode. Args: workflow (Workflow): The workflow to check. Raises: Exception: If the workflow uses premium providers in multi tenant mode. """ if os.environ.get("AUTH_TYPE", IdentityManagerTypes.NOAUTH.value) in ( IdentityManagerTypes.AUTH0.value, "MULTI_TENANT", ): # backward compatibility for provider in workflow.workflow_providers_type: if provider in self.PREMIUM_PROVIDERS: raise Exception( f"Provider {provider} is a premium provider. You can self-host or contact us to get access to it." ) def _run_workflow_on_failure( self, workflow: Workflow, workflow_execution_id: str, error_message: str ): """ Runs the workflow on_failure action. 
Args: workflow (Workflow): The workflow that fails workflow_execution_id (str): Workflow execution id error_message (str): The error message(s) """ if workflow.on_failure: self.logger.info( f"Running on_failure action for workflow {workflow.workflow_id}", extra={ "workflow_execution_id": workflow_execution_id, "workflow_id": workflow.workflow_id, "tenant_id": workflow.context_manager.tenant_id, }, ) # Adding the exception message to the provider context, so it'll be available for the action message = ( f"Workflow {workflow.workflow_id} failed with errors: {error_message}" ) # TODO: maybe to set the message in step.vars instead of provider_parameters so user can format it workflow.on_failure.provider_parameters = { **workflow.on_failure.provider_parameters, "message": message, } workflow.on_failure.run() self.logger.info( "Ran on_failure action for workflow", extra={ "workflow_execution_id": workflow_execution_id, "workflow_id": workflow.workflow_id, "tenant_id": workflow.context_manager.tenant_id, }, ) else: self.logger.debug( "No on_failure configured for workflow", extra={ "workflow_execution_id": workflow_execution_id, "workflow_id": workflow.workflow_id, "tenant_id": workflow.context_manager.tenant_id, }, ) @timing_histogram(workflow_execution_duration) def _run_workflow(self, workflow: Workflow, workflow_execution_id: str): self.logger.debug(f"Running workflow {workflow.workflow_id}") threading.current_thread().workflow_debug = workflow.workflow_debug threading.current_thread().workflow_id = workflow.workflow_id threading.current_thread().workflow_execution_id = workflow_execution_id threading.current_thread().tenant_id = workflow.context_manager.tenant_id errors = [] try: self._check_premium_providers(workflow) errors = workflow.run(workflow_execution_id) if errors: self._run_workflow_on_failure( workflow, workflow_execution_id, ", ".join(errors) ) except Exception as e: self.logger.error( f"Error running workflow {workflow.workflow_id}", extra={"exception": 
def _save_workflow_results(self, workflow: Workflow, workflow_execution_id: str):
    """
    Save the results of the workflow to the DB.

    The results are collected by ``_get_workflow_results`` (provider results
    of every action, then of every step, keyed by name) instead of being
    rebuilt here, so the two methods cannot drift apart.

    Args:
        workflow (Workflow): The workflow to save.
        workflow_execution_id (str): The workflow execution ID.

    Raises:
        Exception: Any error raised while saving is logged and re-raised.
    """
    self.logger.info(f"Saving workflow {workflow.workflow_id} results")
    workflow_results = self._get_workflow_results(workflow)
    try:
        save_workflow_results(
            tenant_id=workflow.context_manager.tenant_id,
            workflow_execution_id=workflow_execution_id,
            workflow_results=workflow_results,
        )
    except Exception as e:
        self.logger.error(
            f"Error saving workflow {workflow.workflow_id} results",
            extra={"exception": e},
        )
        raise
    self.logger.info(f"Workflow {workflow.workflow_id} results saved")
def timing_histogram(histogram):
    """
    Build a decorator that records how long the wrapped callable takes.

    The duration is observed on ``histogram`` whether the call returns or
    raises. The histogram is labelled with ``tenant_id`` and ``workflow_id``
    taken from the workflow argument of the wrapped call: the second
    positional argument (the first one being ``self``), or the ``workflow``
    keyword argument. A label that cannot be resolved is reported as
    ``"unknown"``.

    Args:
        histogram: Object exposing ``labels(tenant_id=..., workflow_id=...)``
            whose result exposes ``observe(seconds)``.

    Returns:
        A decorator preserving the wrapped callable's metadata.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic; wall-clock time.time() can jump
            # (NTP, DST, manual changes) and yield negative/bogus durations.
            started = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                duration = time.perf_counter() - started
                # The labels come from the workflow argument, not from self.
                workflow = args[1] if len(args) > 1 else kwargs.get("workflow")
                try:
                    tenant_id = workflow.context_manager.tenant_id
                except Exception:
                    tenant_id = "unknown"
                try:
                    workflow_id = workflow.workflow_id
                except Exception:
                    workflow_id = "unknown"
                histogram.labels(tenant_id=tenant_id, workflow_id=workflow_id).observe(
                    duration
                )

        return wrapper

    return decorator
self._update_queue_metrics() def _update_queue_metrics(self): """Update queue size metrics""" with self.lock: for workflow in self.workflows_to_run: tenant_id = workflow.get("tenant_id", "unknown") workflow_queue_size.labels(tenant_id=tenant_id).set( len(self.workflows_to_run) ) async def start(self): self.logger.info("Starting workflows scheduler") # Shahar: fix for a bug in unit tests self._stop = False self.scheduler_future = self.executor.submit(self._start) self.logger.info("Workflows scheduler started") def _handle_interval_workflows(self): workflows = [] if not self.interval_enabled: self.logger.debug("Interval workflows are disabled") return try: # get all workflows that should run due to interval workflows = get_workflows_that_should_run() except Exception as ex: self.logger.warning( "Error getting workflows that should run", exc_info=ex, ) pass for workflow in workflows: workflow_execution_id = workflow.get("workflow_execution_id") tenant_id = workflow.get("tenant_id") workflow_id = workflow.get("workflow_id") try: workflow_obj = self.workflow_store.get_workflow(tenant_id, workflow_id) except ProviderConfigurationException: self.logger.exception( "Provider configuration is invalid", extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.PROVIDERS_NOT_CONFIGURED, error=f"Providers are not configured for workflow {workflow_id}", ) continue except Exception as e: self.logger.warning( f"Error getting workflow: {e}", exc_info=e, extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error=f"Error getting workflow: {e}", ) continue future = 
self.executor.submit( self._run_workflow, tenant_id, workflow_id, workflow_obj, workflow_execution_id, ) self.futures.add(future) future.add_done_callback(lambda f: self.futures.remove(f)) def _run_workflow( self, tenant_id, workflow_id, workflow: Workflow, workflow_execution_id: str, event_context=None, inputs=None, ): if READ_ONLY_MODE: self.logger.debug("Sleeping for 3 seconds in favor of read only mode") time.sleep(3) self.logger.info(f"Running workflow {workflow.workflow_id}...") try: # Increment running workflows counter workflows_running.labels(tenant_id=tenant_id).inc() # Track execution # Shahar: currently incident doesn't have trigger so we will workaround it if isinstance(event_context, AlertDto): workflow_executions_total.labels( tenant_id=tenant_id, workflow_id=workflow_id, trigger_type=event_context.trigger if event_context else "interval", ).inc() else: # TODO: add trigger to incident workflow_executions_total.labels( tenant_id=tenant_id, workflow_id=workflow_id, trigger_type="incident", ).inc() # Run the workflow if isinstance(event_context, AlertDto): workflow.context_manager.set_event_context(event_context) else: workflow.context_manager.set_incident_context(event_context) if inputs: workflow.context_manager.set_inputs(inputs) errors, _ = self.workflow_manager._run_workflow( workflow, workflow_execution_id ) except Exception as e: # Track error metrics workflow_execution_errors_total.labels( tenant_id=tenant_id, workflow_id=workflow_id, error_type=type(e).__name__, ).inc() workflow_execution_status.labels( tenant_id=tenant_id, workflow_id=workflow_id, status="error" ).inc() self.logger.exception( f"Failed to run workflow {workflow.workflow_id}...", extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error=str(e), ) return finally: # 
Decrement running workflows counter workflows_running.labels(tenant_id=tenant_id).dec() self._update_queue_metrics() if errors is not None and any(errors): self.logger.info(msg=f"Workflow {workflow.workflow_id} ran with errors") self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error="\n".join(str(e) for e in errors), ) else: self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.SUCCESS, error=None, ) self.logger.info(f"Workflow {workflow.workflow_id} ran") def handle_manual_event_workflow( self, workflow_id, workflow_revision, tenant_id, triggered_by_user, event: AlertDto | IncidentDto, workflow: Workflow = None, test_run: bool = False, inputs: dict = None, ): self.logger.info(f"Running manual event workflow {workflow_id}...") try: unique_execution_number = self._get_unique_execution_number() self.logger.info(f"Unique execution number: {unique_execution_number}") if isinstance(event, IncidentDto): event_id = str(event.id) event_type = "incident" fingerprint = "incident:{}".format(event_id) else: event_id = event.event_id event_type = "alert" fingerprint = event.fingerprint workflow_execution_id = create_workflow_execution( workflow_id=workflow_id, workflow_revision=workflow_revision, tenant_id=tenant_id, triggered_by=f"manually by {triggered_by_user}", execution_number=unique_execution_number, fingerprint=fingerprint, event_id=event_id, event_type=event_type, test_run=test_run, ) self.logger.info(f"Workflow execution id: {workflow_execution_id}") # This is kinda WTF exception since create_workflow_execution shouldn't fail for manual except Exception as e: self.logger.error(f"WTF: error creating workflow execution: {e}") raise e self.logger.info( f"Adding workflow to run {'(test)' if test_run else ''}", extra={ "workflow_id": workflow_id, "by_definition": workflow is not 
None, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, "triggered_by": "manual", "triggered_by_user": triggered_by_user, }, ) with self.lock: event.trigger = "manual" self.workflows_to_run.append( { "workflow_id": workflow_id, "workflow": workflow, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, "triggered_by": "manual", "triggered_by_user": triggered_by_user, "event": event, "retry": True, "test_run": test_run, "inputs": inputs, } ) return workflow_execution_id def _get_unique_execution_number(self, fingerprint=None, workflow_id=None): """ Translates the fingerprint to a unique execution number Returns: int: an int represents unique execution number """ # if fingerprint supplied if fingerprint and workflow_id: payload = f"{str(fingerprint)}:{str(workflow_id)}".encode() # else, just return random elif fingerprint: payload = str(fingerprint).encode() else: payload = str(uuid.uuid4()).encode() return int(hashlib.sha256(payload).hexdigest(), 16) % ( WorkflowScheduler.MAX_SIZE_SIGNED_INT + 1 ) def _timeout_workflows(self): """ Record timeout for workflows that are running for too long. """ workflow_executions = get_timeouted_workflow_exections() for workflow_execution in workflow_executions: self.logger.info( "Timeout workflow execution detected", extra={ "workflow_id": workflow_execution.workflow_id, "workflow_execution_id": workflow_execution.id, "tenant_id": workflow_execution.tenant_id, }, ) timeout_message = "Workflow execution timed out. " if RUNNING_IN_CLOUD_RUN: timeout_message += ( "Please contact Keep support for help with this issue." ) else: timeout_message += ( "Most probably it's caused by worker restart or crash " "during long workflow execution. Check backend logs." 
) self._finish_workflow_execution( tenant_id=workflow_execution.tenant_id, workflow_id=workflow_execution.workflow_id, workflow_execution_id=workflow_execution.id, status=WorkflowStatus.ERROR, error=timeout_message, ) def _handle_event_workflows(self): # TODO - event workflows should be in DB too, to avoid any state problems. # take out all items from the workflows to run and run them, also, clean the self.workflows_to_run list with self.lock: workflows_to_run, self.workflows_to_run = self.workflows_to_run, [] for workflow_to_run in workflows_to_run: self.logger.info( "Running event workflow on background", extra={ "workflow_id": workflow_to_run.get("workflow_id"), "workflow_execution_id": workflow_to_run.get( "workflow_execution_id" ), "tenant_id": workflow_to_run.get("tenant_id"), }, ) workflow = workflow_to_run.get("workflow") workflow_id = workflow_to_run.get("workflow_id") tenant_id = workflow_to_run.get("tenant_id") # Update queue size metrics workflow_queue_size.labels(tenant_id=tenant_id).set( len(self.workflows_to_run) ) workflow_execution_id = workflow_to_run.get("workflow_execution_id") if not workflow: self.logger.info("Loading workflow") try: workflow = self.workflow_store.get_workflow( workflow_id=workflow_id, tenant_id=tenant_id ) # In case the provider are not configured properly except ProviderConfigurationException as e: self.logger.warning( f"Error getting workflow: {e}", exc_info=e, extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.PROVIDERS_NOT_CONFIGURED, error=f"Providers are not configured for workflow {workflow_id}, please configure it so Keep will be able to run it", ) continue except Exception as e: self.logger.warning( f"Error getting workflow: {e}", exc_info=e, extra={ "workflow_id": workflow_id, "workflow_execution_id": 
workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error=f"Error getting workflow: {e}", ) continue event = workflow_to_run.get("event") triggered_by = workflow_to_run.get("triggered_by") if triggered_by == "manual": triggered_by_user = workflow_to_run.get("triggered_by_user") triggered_by = f"manually by {triggered_by_user}" elif triggered_by.startswith("incident:"): triggered_by = f"type:{triggered_by} name:{event.name} id:{event.id}" else: triggered_by = f"type:alert name:{event.name} id:{event.id}" if isinstance(event, IncidentDto): event_id = str(event.id) event_type = "incident" fingerprint = event_id else: event_id = event.event_id event_type = "alert" fingerprint = event.fingerprint # In manual, we create the workflow execution id sync so it could be tracked by the caller (UI) # In event (e.g. alarm), we will create it here if not workflow_execution_id: # creating the execution id here to be able to trace it in logs even in case of IntegrityError # eventually, workflow_execution_id == execution_id execution_id = str(uuid.uuid4()) try: # if the workflow can run in parallel, we just to create a some random execution number if workflow.workflow_strategy == WorkflowStrategy.PARALLEL.value: workflow_execution_number = self._get_unique_execution_number() # else, we want to enforce that no workflow already run with the same fingerprint else: workflow_execution_number = self._get_unique_execution_number( fingerprint, workflow_id ) workflow_execution_id = create_workflow_execution( workflow_id=workflow_id, workflow_revision=workflow.workflow_revision, tenant_id=tenant_id, triggered_by=triggered_by, execution_number=workflow_execution_number, fingerprint=fingerprint, event_id=event_id, execution_id=execution_id, event_type=event_type, ) # If there is already running workflow from the same event except 
IntegrityError: # if the strategy is with RETRY, just put a warning and add it back to the queue if ( workflow.workflow_strategy == WorkflowStrategy.NONPARALLEL_WITH_RETRY.value ): self.logger.info( "Collision with workflow execution! will retry next time", extra={ "workflow_id": workflow_id, "tenant_id": tenant_id, }, ) with self.lock: self.workflows_to_run.append( { "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, "triggered_by": triggered_by, "event": event, "retry": True, } ) continue # else if NONPARALLEL, just finish the execution elif ( workflow.workflow_strategy == WorkflowStrategy.NONPARALLEL.value ): self.logger.error( "Collision with workflow execution! will not retry", extra={ "workflow_id": workflow_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error="Workflow already running with the same fingerprint", ) continue # else, just raise the exception (that should not happen) else: self.logger.exception("Collision with workflow execution!") continue except Exception as e: self.logger.error(f"Error creating workflow execution: {e}") continue # if thats a retry, we need to re-pull the alert/incident to update the enrichments # for example: 2 alerts arrived within a 0.1 seconds the first one is "firing" and the second one is "resolved" # - the first alert will trigger a workflow that will create a ticket with "firing" # and enrich the alert with the ticket_url # - the second one will wait for the next iteration # - on the next iteratino, the second alert enriched with the ticket_url # and will trigger a workflow that will update the ticket with "resolved" if workflow_to_run.get("retry", False): try: self.logger.info( "Updating enrichments for workflow after retry", extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, 
}, ) new_enrichment = get_enrichment( tenant_id, fingerprint, refresh=True ) # merge the new enrichment with the original event if new_enrichment: new_event = event.dict() new_event.update(new_enrichment.enrichments) if isinstance(event, IncidentDto): event = IncidentDto(**new_event) else: event = AlertDto(**new_event) self.logger.info( "Enrichments updated for workflow after retry", extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, "new_enrichment": new_enrichment, }, ) except Exception as e: self.logger.error( f"Failed to get enrichment: {e}", extra={ "workflow_id": workflow_id, "workflow_execution_id": workflow_execution_id, "tenant_id": tenant_id, }, ) self._finish_workflow_execution( tenant_id=tenant_id, workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.ERROR, error=f"Error getting alert by id: {e}", ) continue # Last, run the workflow inputs = workflow_to_run.get("inputs", {}) future = self.executor.submit( self._run_workflow, tenant_id, workflow_id, workflow, workflow_execution_id, event, inputs, ) self.futures.add(future) future.add_done_callback(lambda f: self.futures.remove(f)) self.logger.debug( "Event workflows handled", extra={"current_number_of_workflows": len(self.futures)}, ) def _start(self): RUN_TIMEOUT_CHECKS_EVERY = 100 self.logger.info("Starting workflows scheduler") runs = 0 while not self._stop: runs += 1 # get all workflows that should run now self.logger.debug( "Starting workflow scheduler iteration", extra={"current_number_of_workflows": len(self.futures)}, ) try: self._handle_interval_workflows() self._handle_event_workflows() if runs % RUN_TIMEOUT_CHECKS_EVERY == 0: self._timeout_workflows() except Exception: # This is the "mainloop" of the scheduler, we don't want to crash it # But any exception here should be investigated self.logger.error("Error getting workflows that should run") pass self.logger.debug("Sleeping until next iteration") 
time.sleep(1) self.logger.info("Workflows scheduler stopped") def stop(self): self.logger.info("Stopping scheduled workflows") self._stop = True # Wait for scheduler to stop first if self.scheduler_future: try: self.scheduler_future.result( timeout=5 ) # Add timeout to prevent hanging except Exception: self.logger.exception("Error waiting for scheduler to stop") # Cancel all running workflows with timeout for future in list(self.futures): # Create a copy of futures set try: self.logger.info("Cancelling future") future.cancel() future.result(timeout=1) # Add timeout self.logger.info("Future cancelled") except Exception: self.logger.exception("Error cancelling future") # Shutdown the executor with timeout if self.executor: try: self.logger.info("Shutting down executor") self.executor.shutdown(wait=True, cancel_futures=True) self.executor = None self.logger.info("Executor shut down") except Exception: self.logger.exception("Error shutting down executor") self.futures.clear() self.logger.info("Scheduled workflows stopped") def _finish_workflow_execution( self, tenant_id: str, workflow_id: str, workflow_execution_id: str, status: WorkflowStatus, error=None, ): # mark the workflow execution as finished in the db finish_workflow_execution_db( tenant_id=tenant_id, workflow_id=workflow_id, execution_id=workflow_execution_id, status=status.value, error=error, ) if KEEP_EMAILS_ENABLED: # get the previous workflow execution id previous_execution = get_previous_execution_id( tenant_id, workflow_id, workflow_execution_id ) # if error, send an email if status == WorkflowStatus.ERROR and ( previous_execution is None # this means this is the first execution, for example or previous_execution.status != WorkflowStatus.ERROR.value ): workflow = get_workflow_db(tenant_id=tenant_id, workflow_id=workflow_id) try: keep_platform_url = config( "KEEP_PLATFORM_URL", default="https://platform.keephq.dev" ) error_logs_url = 
f"{keep_platform_url}/workflows/{workflow_id}/runs/{workflow_execution_id}" self.logger.debug( f"Sending email to {workflow.created_by} for failed workflow {workflow_id}" ) email_sent = send_email( to_email=workflow.created_by, template_id=EmailTemplates.WORKFLOW_RUN_FAILED, workflow_id=workflow_id, workflow_name=workflow.name, workflow_execution_id=workflow_execution_id, error=error, url=error_logs_url, ) if email_sent: self.logger.info( f"Email sent to {workflow.created_by} for failed workflow {workflow_id}" ) except Exception as e: self.logger.error( f"Failed to send email to {workflow.created_by} for failed workflow {workflow_id}: {e}" ) ================================================ FILE: keep/workflowmanager/workflowstore.py ================================================ import io import logging import os import random import uuid from typing import Tuple import celpy import requests import validators from fastapi import HTTPException from keep.api.core.db import ( add_or_update_workflow, delete_workflow, delete_workflow_by_provisioned_file, get_all_provisioned_workflows, get_all_workflows, get_all_workflows_yamls, get_workflow_by_id, get_workflow_execution, get_workflow_execution_with_logs, ) from keep.api.core.workflows import get_workflows_with_last_executions_v2 from keep.api.models.db.workflow import Workflow as WorkflowModel from keep.api.models.query import QueryDto from keep.api.models.workflow import PreparsedWorkflowDTO, ProviderDTO from keep.functions import cyaml from keep.parser.parser import Parser from keep.providers.providers_factory import ProvidersFactory from keep.workflowmanager.workflow import Workflow from sqlalchemy.exc import NoResultFound class WorkflowStore: def __init__(self): self.parser = Parser() self.logger = logging.getLogger(__name__) self.celpy_env = celpy.Environment() def get_workflow_execution( self, tenant_id: str, workflow_execution_id: str, is_test_run: bool | None = None, ): try: return 
get_workflow_execution(tenant_id, workflow_execution_id, is_test_run) except NoResultFound: raise HTTPException( status_code=404, detail=f"Workflow execution {workflow_execution_id} not found", ) def get_workflow_execution_with_logs( self, tenant_id: str, workflow_execution_id: str, is_test_run: bool | None = None, ): try: return get_workflow_execution_with_logs( tenant_id, workflow_execution_id, is_test_run ) except NoResultFound: raise HTTPException( status_code=404, detail=f"Workflow execution {workflow_execution_id} not found", ) def create_workflow( self, tenant_id: str, created_by, workflow: dict, force_update: bool = True, lookup_by_name: bool = False, ): workflow_id = workflow.get("id") self.logger.info(f"Creating workflow {workflow_id}") interval = self.parser.parse_interval(workflow) if not workflow.get("name"): # workflow name is None or empty string workflow_name = workflow_id workflow["name"] = workflow_name else: workflow_name = workflow.get("name") workflow_db = add_or_update_workflow( id=str(uuid.uuid4()), name=workflow_name, tenant_id=tenant_id, description=workflow.get("description"), created_by=created_by, updated_by=created_by, interval=interval, is_disabled=Parser.parse_disabled(workflow), workflow_raw=cyaml.dump(workflow, width=99999), force_update=force_update, lookup_by_name=lookup_by_name, ) self.logger.info( f"Workflow {workflow_db.id}, {workflow_db.revision} created successfully" ) return workflow_db def delete_workflow(self, tenant_id, workflow_id): self.logger.info(f"Deleting workflow {workflow_id}") workflow = get_workflow_by_id(tenant_id, workflow_id) if not workflow: raise HTTPException( status_code=404, detail=f"Workflow {workflow_id} not found" ) if workflow.provisioned: raise HTTPException(403, detail="Cannot delete a provisioned workflow") try: delete_workflow(tenant_id, workflow_id) except Exception as e: self.logger.exception(f"Error deleting workflow {workflow_id}: {str(e)}") raise HTTPException( status_code=500, 
detail=f"Failed to delete workflow {workflow_id}" ) def _parse_workflow_to_dict(self, workflow_path: str) -> dict: """ Parse a workflow to a dictionary from either a file or a URL. Args: workflow_path (str): a URL or a file path Returns: dict: Dictionary with the workflow information """ self.logger.debug("Parsing workflow") # If the workflow is a URL, get the workflow from the URL if validators.url(workflow_path) is True: response = requests.get(workflow_path) return self._read_workflow_from_stream(io.StringIO(response.text)) else: # else, get the workflow from the file with open(workflow_path, "r") as file: return self._read_workflow_from_stream(file) def get_raw_workflow(self, tenant_id: str, workflow_id: str) -> str: workflow = get_workflow_by_id(tenant_id, workflow_id) if not workflow: raise HTTPException( status_code=404, detail=f"Workflow {workflow_id} not found", ) return self.format_workflow_yaml(workflow.workflow_raw) def get_workflow(self, tenant_id: str, workflow_id: str) -> Workflow: workflow = get_workflow_by_id(tenant_id, workflow_id) if not workflow: raise HTTPException( status_code=404, detail=f"Workflow {workflow_id} not found", ) workflow_yaml = cyaml.safe_load(workflow.workflow_raw) workflow = self.parser.parse( tenant_id, workflow_yaml, workflow_db_id=workflow.id, workflow_revision=workflow.revision, is_test=workflow.is_test, ) if len(workflow) > 1: raise HTTPException( status_code=500, detail=f"More than one workflow with id {workflow_id} found", ) elif workflow: return workflow[0] else: raise HTTPException( status_code=404, detail=f"Workflow {workflow_id} not found", ) def get_workflow_from_dict(self, tenant_id: str, workflow_dict: dict) -> Workflow: logging.info("Parsing workflow from dict", extra={"workflow": workflow_dict}) workflow = self.parser.parse(tenant_id, workflow_dict) if workflow: return workflow[0] else: raise HTTPException( status_code=500, detail="Unable to parse workflow from dict", ) def get_all_workflows( self, tenant_id: 
str, exclude_disabled: bool = False ) -> list[WorkflowModel]: # list all tenant's workflows workflows = get_all_workflows(tenant_id, exclude_disabled) return workflows def get_all_workflows_with_last_execution( self, tenant_id: str, cel: str = None, limit: int = None, offset: int = None, sort_by: str = None, sort_dir: str = None, session=None, ): # list all tenant's workflows return get_workflows_with_last_executions_v2( tenant_id=tenant_id, cel=cel, limit=limit, offset=offset, sort_by=sort_by, sort_dir=sort_dir, fetch_last_executions=25, session=session, ) def get_all_workflows_yamls(self, tenant_id: str) -> list[str]: # list all tenant's workflows yamls (Workflow.workflow_raw) return list(get_all_workflows_yamls(tenant_id)) def get_workflows_from_path( self, tenant_id, workflow_path: str | tuple[str], providers_file: str = None, actions_file: str = None, ) -> list[Workflow]: """Backward compatibility method to get workflows from a path. Args: workflow_path (str | tuple[str]): _description_ providers_file (str, optional): _description_. Defaults to None. Returns: list[Workflow]: _description_ """ # get specific workflows, the original interface # to interact with workflows workflows = [] if isinstance(workflow_path, tuple): for workflow_url in workflow_path: workflow_yaml = self._parse_workflow_to_dict(workflow_url) workflows.extend( self.parser.parse( tenant_id, workflow_yaml, providers_file, actions_file ) ) elif os.path.isdir(workflow_path): workflows.extend( self._get_workflows_from_directory( tenant_id, workflow_path, providers_file, actions_file ) ) else: workflow_yaml = self._parse_workflow_to_dict(workflow_path) workflows = self.parser.parse( tenant_id, workflow_yaml, providers_file, actions_file ) return workflows def _get_workflows_from_directory( self, tenant_id, workflows_dir: str, providers_file: str = None, actions_file: str = None, ) -> list[Workflow]: """ Run workflows from a directory. 
Args: workflows_dir (str): A directory containing workflows yamls. providers_file (str, optional): The path to the providers yaml. Defaults to None. """ workflows = [] for file in os.listdir(workflows_dir): if file.endswith(".yaml") or file.endswith(".yml"): self.logger.info(f"Getting workflows from {file}") parsed_workflow_yaml = self._parse_workflow_to_dict( os.path.join(workflows_dir, file) ) try: workflows.extend( self.parser.parse( tenant_id, parsed_workflow_yaml, providers_file, actions_file, ) ) self.logger.info(f"Workflow from {file} fetched successfully") except Exception as e: print(e) self.logger.error( f"Error parsing workflow from {file}", extra={"exception": e} ) return workflows @staticmethod def format_workflow_yaml(yaml_string: str) -> str: yaml_content = cyaml.safe_load(yaml_string) if "workflow" in yaml_content: yaml_content = yaml_content["workflow"] # backward compatibility elif "alert" in yaml_content: yaml_content = yaml_content["alert"] valid_workflow_yaml = {"workflow": yaml_content} return cyaml.dump(valid_workflow_yaml, width=99999) @staticmethod def pre_parse_workflow_yaml(yaml_content): parser = Parser() if "workflow" in yaml_content: yaml_content = yaml_content["workflow"] # backward compatibility elif "alert" in yaml_content: yaml_content = yaml_content["alert"] workflow_name = yaml_content.get("name") or yaml_content.get("id") if not workflow_name: raise ValueError(f"Workflow {yaml_content} does not have a name or id") workflow_id = str(uuid.uuid4()) workflow_description = yaml_content.get("description") workflow_interval = parser.parse_interval(yaml_content) workflow_disabled = parser.parse_disabled(yaml_content) return PreparsedWorkflowDTO( id=workflow_id, name=workflow_name, description=workflow_description, interval=workflow_interval, disabled=workflow_disabled, ) @staticmethod def provision_workflows( tenant_id: str, ) -> list[Workflow]: """ Provision workflows from a directory or env variable. 
Args: tenant_id (str): The tenant ID. Returns: list[Workflow]: A list of provisioned Workflow objects. """ logger = logging.getLogger(__name__) provisioned_workflows = [] provisioned_workflows_dir = os.environ.get("KEEP_WORKFLOWS_DIRECTORY") provisioned_workflow_yaml = os.environ.get("KEEP_WORKFLOW") # Get all existing provisioned workflows logger.info("Getting all already provisioned workflows") provisioned_workflows = get_all_provisioned_workflows(tenant_id) logger.info(f"Found {len(provisioned_workflows)} provisioned workflows") if not (provisioned_workflows_dir or provisioned_workflow_yaml): logger.info("No workflows for provisioning found") if provisioned_workflows: logger.info("Found existing provisioned workflows, deleting them") for workflow in provisioned_workflows: logger.info(f"Deprovisioning workflow {workflow.id}") delete_workflow(tenant_id, workflow.id) logger.info(f"Workflow {workflow.id} deprovisioned successfully") return [] if ( provisioned_workflows_dir is not None and provisioned_workflow_yaml is not None ): raise Exception( "Workflows provisioned via env var and directory at the same time. Please choose one." ) if provisioned_workflows_dir is not None and not os.path.isdir( provisioned_workflows_dir ): raise FileNotFoundError( f"Directory {provisioned_workflows_dir} does not exist" ) ### Provisioning from env var if provisioned_workflow_yaml is not None: logger.info("Provisioning workflow from env var") pre_parsed_workflow = None try: workflow_yaml = cyaml.safe_load(provisioned_workflow_yaml) pre_parsed_workflow = WorkflowStore.pre_parse_workflow_yaml( workflow_yaml ) except ValueError as e: logger.error( "Error provisioning workflow from env var: yaml is invalid", extra={"exception": e}, ) try: # Un-provisioning other workflows. 
for workflow in provisioned_workflows: if ( not pre_parsed_workflow or not workflow.name == pre_parsed_workflow.name ): if not pre_parsed_workflow: logger.info( f"Deprovisioning workflow {workflow.id} as no workflows to provision" ) else: logger.info( f"Deprovisioning workflow {workflow.id} as its id doesn't match the provisioned workflow provided in the env" ) delete_workflow(tenant_id, workflow.id) logger.info( f"Workflow {workflow.id} deprovisioned successfully" ) if not pre_parsed_workflow: logger.info("No workflows to provision") return [] logger.info( f"Provisioning workflow {pre_parsed_workflow.id} from env var" ) add_or_update_workflow( id=pre_parsed_workflow.id, name=pre_parsed_workflow.name, tenant_id=tenant_id, description=pre_parsed_workflow.description, created_by="system", updated_by="system", interval=pre_parsed_workflow.interval, is_disabled=pre_parsed_workflow.disabled, workflow_raw=cyaml.dump(workflow_yaml, width=99999), provisioned=True, provisioned_file=None, ) provisioned_workflows.append(workflow_yaml) logger.info("Workflow provisioned successfully") except Exception as e: logger.error( "Error provisioning workflow from env var", extra={"exception": e}, ) ### Provisioning from the directory if provisioned_workflows_dir is not None: logger.info( f"Provisioning workflows from directory {provisioned_workflows_dir}" ) # Check for workflows that are no longer in the directory or outside the workflows_dir and delete them for workflow in provisioned_workflows: if ( workflow.provisioned_file is None or not os.path.exists(workflow.provisioned_file) or not provisioned_workflows_dir.endswith( os.path.commonpath( [provisioned_workflows_dir, workflow.provisioned_file] ) ) ): logger.info( f"Deprovisioning workflow {workflow.id} as its file no longer exists or is outside the workflows directory" ) delete_workflow_by_provisioned_file( tenant_id, workflow.provisioned_file ) logger.info(f"Workflow {workflow.id} deprovisioned successfully") # Provision new 
workflows from the directory for file in os.listdir(provisioned_workflows_dir): if file.endswith((".yaml", ".yml")): logger.info(f"Provisioning workflow from {file}") workflow_path = os.path.join(provisioned_workflows_dir, file) try: with open(workflow_path, "r") as yaml_file: workflow_yaml = cyaml.safe_load(yaml_file.read()) pre_parsed_workflow = WorkflowStore.pre_parse_workflow_yaml( workflow_yaml ) add_or_update_workflow( id=pre_parsed_workflow.id, name=pre_parsed_workflow.name, tenant_id=tenant_id, description=pre_parsed_workflow.description, created_by="system", updated_by="system", interval=pre_parsed_workflow.interval, is_disabled=pre_parsed_workflow.disabled, workflow_raw=cyaml.dump(workflow_yaml, width=99999), provisioned=True, provisioned_file=workflow_path, ) provisioned_workflows.append(workflow_yaml) logger.info(f"Workflow from {file} provisioned successfully") except Exception as e: logger.error( f"Error provisioning workflow from {file}", extra={"exception": e}, ) else: logger.info(f"Skipping file {file} as it is not a YAML file") return provisioned_workflows def _read_workflow_from_stream(self, stream) -> dict: """ Parse a workflow from an IO stream. Args: stream (IOStream): The stream to read from Raises: e: If the stream is not a valid YAML Returns: dict: Dictionary with the workflow information """ self.logger.debug("Parsing workflow") try: workflow = cyaml.safe_load(stream) except cyaml.YAMLError as e: self.logger.error(f"Error parsing workflow: {e}") raise e return workflow def get_random_workflow_templates( self, tenant_id: str, workflows_dir: str, limit: int ) -> list[dict]: """ Get random workflows from a directory. Args: tenant_id (str): The tenant to which the workflows belong. workflows_dir (str): A directory containing workflows yamls. limit (int): The number of workflows to return. 
Returns: List[dict]: A list of workflows """ if not os.path.isdir(workflows_dir): raise FileNotFoundError(f"Directory {workflows_dir} does not exist") workflow_yaml_files = [ f for f in os.listdir(workflows_dir) if f.endswith((".yaml", ".yml")) ] if not workflow_yaml_files: raise FileNotFoundError(f"No workflows found in directory {workflows_dir}") random.shuffle(workflow_yaml_files) workflows = [] count = 0 for file in workflow_yaml_files: if count == limit: break try: file_path = os.path.join(workflows_dir, file) workflow_yaml = self._parse_workflow_to_dict(file_path) if "workflow" in workflow_yaml: workflow_yaml["name"] = workflow_yaml["workflow"]["id"] workflow_yaml["workflow_raw"] = cyaml.dump(workflow_yaml) workflow_yaml["workflow_raw_id"] = workflow_yaml["workflow"]["id"] workflows.append(workflow_yaml) count += 1 self.logger.info(f"Workflow from {file} fetched successfully") except Exception as e: self.logger.error( f"Error parsing or fetching workflow from {file}: {e}" ) return workflows def query_workflow_templates( self, tenant_id: str, workflows_dir: str, query: QueryDto ) -> Tuple[list[dict], int]: """ Get random workflows from a directory. Args: tenant_id (str): The tenant to which the workflows belong. workflows_dir (str): A directory containing workflows yamls. limit (int): The number of workflows to return. 
Returns: List[dict]: A list of workflows """ if not os.path.isdir(workflows_dir): raise FileNotFoundError(f"Directory {workflows_dir} does not exist") workflow_yaml_files = [ f for f in os.listdir(workflows_dir) if f.endswith((".yaml", ".yml")) ] if not workflow_yaml_files: raise FileNotFoundError(f"No workflows found in directory {workflows_dir}") workflows = [] for file in workflow_yaml_files: try: file_path = os.path.join(workflows_dir, file) workflow_yaml = self._parse_workflow_to_dict(file_path) if "workflow" in workflow_yaml: workflow_yaml["name"] = workflow_yaml["workflow"]["id"] workflow_yaml["workflow_raw"] = cyaml.dump(workflow_yaml) workflow_yaml["workflow_raw_id"] = workflow_yaml["workflow"]["id"] if not query.cel: workflows.append(workflow_yaml) continue ast = self.celpy_env.compile(query.cel) prgm = self.celpy_env.program(ast) activation = celpy.json_to_cel( { "name": workflow_yaml.get("workflow", {}) .get("name", None) .lower(), "description": workflow_yaml.get("workflow", {}) .get("description", "") .lower(), } ) relevant = prgm.evaluate(activation) if relevant: workflows.append(workflow_yaml) self.logger.info(f"Workflow from {file} fetched successfully") except Exception as e: self.logger.error( f"Error parsing or fetching workflow from {file}: {e}" ) return workflows[query.offset : query.offset + query.limit], len(workflows) def get_workflow_meta_data( self, tenant_id: str, workflow: WorkflowModel | None, installed_providers_by_type: dict, ): providers_dto = [] triggers = [] # Early return if workflow is None if workflow is None: return providers_dto, triggers # Step 1: Load workflow YAML and handle potential parsing errors more thoroughly try: workflow_raw_data = workflow.workflow_raw if not isinstance(workflow_raw_data, str): self.logger.error(f"workflow_raw is not a string workflow: {workflow}") return providers_dto, triggers # Parse the workflow YAML safely workflow_yaml_dict = cyaml.safe_load(workflow_raw_data) if 
workflow_yaml_dict.get("workflow"): workflow_yaml_dict = workflow_yaml_dict.get("workflow") if not workflow_yaml_dict: self.logger.error( f"Parsed workflow_yaml is empty or invalid: {workflow_yaml_dict}" ) return providers_dto, triggers except Exception as e: # Improved logging to capture more details about the error self.logger.error( f"Failed to parse workflow in get_workflow_meta_data: {e}, workflow: {workflow}" ) return ( providers_dto, triggers, ) # Return empty providers and triggers in case of error try: providers = self.parser.get_providers_from_workflow_dict(workflow_yaml_dict) except Exception as e: self.logger.error( f"Failed to get providerts from workflow: {e}, workflow: {workflow}" ) providers = [] # Step 2: Process providers and add them to DTO for provider in providers: try: provider_data = installed_providers_by_type[provider.get("type")][ provider.get("name") ] provider_dto = ProviderDTO( name=provider_data.name, type=provider_data.type, id=provider_data.id, installed=True, ) # add only if not already in the list if provider_data.id not in [p.id for p in providers_dto]: providers_dto.append(provider_dto) except KeyError: # Handle case where the provider is not installed try: conf = ProvidersFactory.get_provider_required_config( provider.get("type") ) except ModuleNotFoundError: self.logger.warning( f"Non-existing provider in workflow: {provider.get('type')}" ) conf = None # Handle providers based on whether they require config provider_dto = ProviderDTO( name=provider.get("name"), type=provider.get("type"), id=None, installed=( conf is None ), # Consider it installed if no config is required ) providers_dto.append(provider_dto) # Step 3: Extract triggers from workflow triggers = self.parser.get_triggers_from_workflow_dict(workflow_yaml_dict) return providers_dto, triggers @staticmethod def is_alert_rule_workflow(workflow_raw: dict): # checks if the workflow is an alert rule actions = workflow_raw.get("actions", []) for action in actions: # check 
if the action is a keep action is_keep_action = action.get("provider", {}).get("type") == "keep" if is_keep_action: # check if the keep action is an alert if "alert" in action.get("provider", {}).get("with", {}): return True # if no keep action is found, return False return False ================================================ FILE: keep-ui/.dockerignore ================================================ node_modules .next .vercel .env.* .venv/ .vscode/ .github/ .pytest_cache ================================================ FILE: keep-ui/.eslintignore ================================================ node_modules ================================================ FILE: keep-ui/.eslintrc.json ================================================ { "extends": ["next/core-web-vitals", "prettier"] } ================================================ FILE: keep-ui/.gitignore ================================================ # Logs logs *.log npm-debug.log* # Runtime data pids *.pid *.seed !lib # Directory for instrumented libs generated by jscoverage/JSCover lib-cov # Coverage directory used by tools like istanbul coverage # nyc test coverage .nyc_output # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) .grunt # node-waf configuration .lock-wscript # Compiled binary addons (http://nodejs.org/api/addons.html) build/Release # Dependency directories node_modules jspm_packages # Optional npm cache directory .npm # Optional REPL history .node_repl_history .next .env.local app/topology/mock-topology-data.tsx .vercel # Sentry Config File .env.sentry-build-plugin # Monaco workers (generated at build time for turbopack dev) public/monaco-workers/ # TypeScript build info tsconfig.tsbuildinfo ================================================ FILE: keep-ui/.prettierrc ================================================ { "trailingComma": "es5", "tabWidth": 2, "semi": true, "singleQuote": false } ================================================ FILE: 
keep-ui/README.md ================================================ # Keep UI ## Background Keep UI is a user interface platform designed to manage and configure providers for an application. It allows users to connect and disconnect various providers, such as Grafana and Datadog, and configure their authentication settings. The platform provides a user-friendly interface to facilitate the management of provider connections. ## How to Start To start using Keep UI, follow the steps below: 1. Clone the repository from GitHub. 2. Install the necessary dependencies by running `npm install` or `yarn install`. 3. Configure the environment variables required for the application. Refer to the documentation for the specific environment variables needed. 4. Start the development server using `npm run dev` or `yarn dev`. 5. Access the Keep UI application in your browser at `http://localhost:3000` (or the specified port). ## How to Contribute Contributions to Keep UI are welcome and encouraged. To contribute to the project, please follow these guidelines: 1. Fork the repository on GitHub. 2. Create a new branch for your feature or bug fix. 3. Make your changes in the branch, adhering to the project's coding style and guidelines. 4. Write unit tests for new features or modifications, if applicable. 5. Commit your changes and push them to your forked repository. 6. Submit a pull request to the main repository, describing the changes and providing any additional relevant information. 7. Participate in the code review process and address any feedback or comments received. 8. Once approved, your changes will be merged into the main codebase. Please ensure that your contributions align with the project's coding standards, documentation guidelines, and overall goals. For major changes or new features, it is advisable to discuss them with the project maintainers or open an issue to gather feedback and ensure they align with the project roadmap. 
## License Keep UI is released under the MIT License. ================================================ FILE: keep-ui/__mocks__/@monaco-editor/react.js ================================================ const React = require('react'); module.exports = { Editor: () => React.createElement('div', { 'data-testid': 'monaco-editor' }), DiffEditor: () => React.createElement('div', { 'data-testid': 'monaco-diff-editor' }), loader: { config: jest.fn(), init: jest.fn(() => Promise.resolve({ editor: { create: jest.fn(), defineTheme: jest.fn(), setTheme: jest.fn(), getModel: jest.fn(), setModelMarkers: jest.fn(), }, languages: { register: jest.fn(), setMonarchTokensProvider: jest.fn(), setLanguageConfiguration: jest.fn(), registerCompletionItemProvider: jest.fn(), }, MarkerSeverity: { Error: 8, Warning: 4, Info: 2, Hint: 1, }, })), }, }; ================================================ FILE: keep-ui/__mocks__/monaco-editor.js ================================================ module.exports = { editor: { create: jest.fn(), defineTheme: jest.fn(), setTheme: jest.fn(), getModel: jest.fn(), setModelMarkers: jest.fn(), }, languages: { register: jest.fn(), setMonarchTokensProvider: jest.fn(), setLanguageConfiguration: jest.fn(), registerCompletionItemProvider: jest.fn(), }, MarkerSeverity: { Error: 8, Warning: 4, Info: 2, Hint: 1, }, }; ================================================ FILE: keep-ui/app/(health)/health/check.tsx ================================================ "use client"; import ProvidersTiles from "@/app/(keep)/providers/providers-tiles"; import React, { useEffect, useState } from "react"; import { defaultProvider, Provider } from "@/shared/api/providers"; import { useProvidersWithHealthCheck } from "@/utils/hooks/useProviders"; import Loading from "@/app/(keep)/loading"; import HealthPageBanner from "@/components/banners/health-page-banner"; const useFetchProviders = () => { const [providers, setProviders] = useState([]); const { data, error, mutate } = 
useProvidersWithHealthCheck(); if (error) { throw error; } const isLocalhost: boolean = true; useEffect(() => { if (data) { const fetchedProviders = data.providers .filter((provider: Provider) => { return provider.health; }) .map((provider) => ({ ...defaultProvider, ...provider, id: provider.type, installed: provider.installed ?? false, health: provider.health, })); setProviders(fetchedProviders); } }, [data]); return { providers, error, isLocalhost, mutate, }; }; export default function ProviderHealthPage() { const { providers, isLocalhost, mutate } = useFetchProviders(); if (!providers || providers.length <= 0) { return ; } return ( <> ); } ================================================ FILE: keep-ui/app/(health)/health/modal.tsx ================================================ import React from "react"; import Modal from "@/components/ui/Modal"; import { Badge, BarChart, Button, Card, DonutChart, Subtitle, Title, } from "@tremor/react"; import { CheckCircle2Icon } from "lucide-react"; interface ProviderHealthResultsModalProps { handleClose: () => void; isOpen: boolean; healthResults: any; } const ProviderHealthResultsModal = ({ handleClose, isOpen, healthResults, }: ProviderHealthResultsModalProps) => { const handleModalClose = () => { handleClose(); }; return (
Spammy Alerts {healthResults?.spammy?.length ? ( <> Sorry to say, but looks like your alerts are spammy ) : ( <>
Everything is ok )}
Rules Quality {healthResults?.rules?.unused ? ( <> {healthResults?.rules.unused} of your{" "} {healthResults.rules.used + healthResults.rules.unused} alert rules are not in use ) : ( <>
Everything is ok )}
Actionable
Everything is ok
Topology coverage {healthResults?.topology?.uncovered.length ? ( <> Not all of your services are covered. Alerts are missing for: {healthResults?.topology?.uncovered.map((service: any) => { return ( {service.display_name ? service.display_name : service.service} ); })} ) : ( <>
Everything is ok )}
Want to improve your observability?
); }; export default ProviderHealthResultsModal; ================================================ FILE: keep-ui/app/(health)/health/page.tsx ================================================ import { Metadata } from "next"; import ProviderHealthPage from "./check"; export const metadata: Metadata = { title: "Keep – Check your alerts quality", description: "Easily check the configuration quality of your observability tools such as Datadog, Grafana, Prometheus, and more without the need to sign up.", }; export default ProviderHealthPage; ================================================ FILE: keep-ui/app/(health)/layout.tsx ================================================ import React, { ReactNode } from "react"; import { NextAuthProvider } from "../auth-provider"; import { Mulish } from "next/font/google"; import { ToastContainer } from "react-toastify"; import { getConfig } from "@/shared/lib/server/getConfig"; import { ConfigProvider } from "../config-provider"; import { PHProvider } from "../posthog-provider"; import ReadOnlyBanner from "@/components/banners/read-only-banner"; import { auth } from "@/auth"; import { ThemeScript, WatchUpdateTheme } from "@/shared/ui"; import "@/app/globals.css"; import "react-toastify/dist/ReactToastify.css"; import { PostHogPageView } from "@/shared/ui/PostHogPageView"; // If loading a variable font, you don't need to specify the font weight const mulish = Mulish({ subsets: ["latin"], display: "swap", }); type RootLayoutProps = { children: ReactNode; }; export default async function RootLayout({ children }: RootLayoutProps) { const config = getConfig(); const session = await auth(); return ( {/* ThemeScript must be the first thing to avoid flickering */} {/* @ts-ignore-error Server Component */} {/* https://discord.com/channels/752553802359505017/1068089513253019688/1117731746922893333 */}
{/* Add the banner here, before the navbar */} {config.READ_ONLY && }
{children}
{/** footer */} {process.env.GIT_COMMIT_HASH && process.env.SHOW_BUILD_INFO !== "false" && (
Version: {process.env.KEEP_VERSION} | Build:{" "} {process.env.GIT_COMMIT_HASH.slice(0, 6)}
)}
); } ================================================ FILE: keep-ui/app/(keep)/[...not-found]/page.tsx ================================================ "use client"; import { notFound } from "next/navigation"; // https://github.com/vercel/next.js/discussions/50034 export default function NotFoundDummy() { notFound(); } ================================================ FILE: keep-ui/app/(keep)/ai/ai-plugins.tsx ================================================ "use client"; import { Card, Title } from "@tremor/react"; import { useAIStats, useAIActions } from "utils/hooks/useAI"; import { useEffect, useMemo, useState } from "react"; import Image from "next/image"; import debounce from "lodash.debounce"; import { KeepLoader, PageSubtitle, showErrorToast, showSuccessToast, } from "@/shared/ui"; import { PageTitle } from "@/shared/ui"; import { AIConfig } from "./model"; function RangeInputWithLabel({ setting, onChange, }: { setting: any; onChange: (newValue: number) => void; }) { const [value, setValue] = useState(setting.value); // Create a memoized debounced function const debouncedOnChange = useMemo( () => debounce((value: number) => onChange(value), 1000), [onChange] ); // Cleanup the debounced function on unmount useEffect(() => { return () => { debouncedOnChange.cancel(); }; }, [debouncedOnChange]); return (

value: {value}

{ const newValue = setting.type === "float" ? parseFloat(e.target.value) : parseInt(e.target.value, 10); setValue(newValue); debouncedOnChange(newValue); }} />
); } export function AIPlugins() { const { data: aistats, isLoading, mutate: refetchAIStats, } = useAIStats({ refreshInterval: 5000, }); const { updateAISettings } = useAIActions(); const handleUpdateAISettings = async ( algorithm_id: string, algorithm_config: AIConfig ) => { try { await updateAISettings(algorithm_id, algorithm_config); showSuccessToast("Settings updated successfully!"); refetchAIStats(); } catch (error) { showErrorToast(error); } }; return (
AI Plugins For correlation, summarization, and enrichment
{isLoading ? ( ) : null} {aistats?.algorithm_configs?.length === 0 && (
AI
No AI enabled for this tenant

AI plugins can correlate, enrich, or summarize your alerts and incidents by leveraging the context within Keep, allowing you to gain deeper insights and respond more effectively.

By the way, AI plugins are designed to work even in air-gapped environments. You can train models using your data, so there is no need to share information with third-party providers like OpenAI. Keep your data secure and private.

Talk to us to get access!

)} {aistats?.algorithm_configs?.map((algorithm_config, index) => (

{algorithm_config.algorithm.name}

{algorithm_config.algorithm.description}

{algorithm_config.settings.map((setting) => (
{setting.type === "bool" ? ( { const newValue = e.target.checked; setting.value = newValue; handleUpdateAISettings( algorithm_config.algorithm_id, algorithm_config ); }} className="mt-2 bg-orange-500 accent-orange-200" /> ) : null}

{setting.name}

{setting.description}

{setting.type === "float" ? (
{ setting.value = newValue; handleUpdateAISettings( algorithm_config.algorithm_id, algorithm_config ); }} />
) : null} {setting.type === "int" ? (
{ setting.value = newValue; handleUpdateAISettings( algorithm_config.algorithm_id, algorithm_config ); }} />
) : null}
))}
{algorithm_config.settings_proposed_by_algorithm && JSON.stringify(algorithm_config.settings) !== JSON.stringify( algorithm_config.settings_proposed_by_algorithm ) && ( The new settings proposal

The last time the model was trained and used for inference, it suggested a configuration update. However, please note that a configuration update might not be very effective if the data quantity or quality is low. For more details, please refer to the logs below.

{algorithm_config.settings_proposed_by_algorithm.map( (proposed_setting: any, idx: number) => (

{proposed_setting.name}:{" "} {String(proposed_setting.value)}

) )}
)}

Execution logs:

                    {algorithm_config.feedback_logs
                      ? algorithm_config.feedback_logs
                      : "Algorithm not executed yet."}
                  
))}
); } ================================================ FILE: keep-ui/app/(keep)/ai/model.ts ================================================ interface FloatOrIntSetting { max?: number; min?: number; type: "float" | "int"; value: number; } interface BoolSetting { type: "bool"; value: boolean; } interface BaseSetting { name: string; description: string; } export type AlgorithmSetting = BaseSetting & (FloatOrIntSetting | BoolSetting); export interface Algorithm { name: string; description: string; last_time_reminded?: string; } export interface AIConfig { id: string; algorithm_id: string; tenant_id: string; settings: AlgorithmSetting[]; settings_proposed_by_algorithm: AlgorithmSetting[]; feedback_logs: string; algorithm: Algorithm; } export interface AIStats { alerts_count: number; incidents_count: number; first_alert_datetime?: Date; algorithm_configs: AIConfig[]; } export interface AILogs { log: string; } ================================================ FILE: keep-ui/app/(keep)/ai/page.tsx ================================================ import { AIPlugins } from "./ai-plugins"; export default function Page() { return ; } export const metadata = { title: "Keep - AI Correlation", description: "Correlate Alerts and Incidents with AI to identify patterns and trends.", }; ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/page.tsx ================================================ import { createServerApiClient } from "@/shared/api/server"; import AlertsPage from "./ui/alerts"; import { getInitialFacets } from "@/features/filter/api"; type PageProps = { params: Promise<{ id: string }>; searchParams: Promise<{ [key: string]: string | string[] | undefined }>; }; export default async function Page(props: PageProps) { const params = await props.params; const api = await createServerApiClient(); const initialFacets = await getInitialFacets(api, "alerts"); return ; } export const metadata = { title: "Keep - Alerts", description: "Single pane of 
glass for all your alerts.", }; ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/__tests__/alerts-fingerprint.test.tsx ================================================ /** * Tests for the alerts.tsx fingerprint-modal fix. * * Bug: when alerts re-fetched (polling / WebSocket), the useEffect that opens * the ViewAlertModal or EnrichAlertSidePanel was re-evaluated. If the alert * list was momentarily empty (during the refetch) the component would fire a * false "Alert fingerprint not found" toast and close the modal. * * Fix: `resolvedFingerprintRef` stores the fingerprint once it has been * matched so that subsequent re-evaluations of the same fingerprint (with an * empty or partial alerts list) do not trigger the error path. */ import React from "react"; import { render, act, waitFor } from "@testing-library/react"; import { useRouter, useSearchParams } from "next/navigation"; import { useProviders } from "@/utils/hooks/useProviders"; import { usePresets } from "@/entities/presets/model"; import { useAlertsTableData } from "@/widgets/alerts-table/ui/useAlertsTableData"; import { showErrorToast } from "@/shared/ui"; import Alerts from "../alerts"; // ─── Mock navigation ───────────────────────────────────────────────────────── jest.mock("next/navigation", () => ({ useRouter: jest.fn(), useSearchParams: jest.fn(), })); // ─── Mock data hooks ───────────────────────────────────────────────────────── jest.mock("@/utils/hooks/useProviders", () => ({ useProviders: jest.fn(), })); jest.mock("@/entities/presets/model", () => ({ usePresets: jest.fn(), })); jest.mock("@/widgets/alerts-table/ui/useAlertsTableData", () => ({ useAlertsTableData: jest.fn(), })); // ─── Mock UI utilities ─────────────────────────────────────────────────────── jest.mock("@/shared/ui", () => ({ showErrorToast: jest.fn(), KeepLoader: () => null, })); // ─── Mock all heavy child components ──────────────────────────────────────── // Only ViewAlertModal and 
EnrichAlertSidePanel render observable testid // attributes so we can assert that the fix works. jest.mock("../alert-table-tab-panel-server-side", () => ({ __esModule: true, default: () =>
, })); jest.mock("@/features/alerts/alert-history", () => ({ AlertHistoryModal: () => null, })); jest.mock("@/features/alerts/alert-assign-ticket", () => ({ AlertAssignTicketModal: () => null, })); jest.mock("@/features/alerts/alert-note", () => ({ AlertNoteModal: () => null, })); jest.mock("@/features/alerts/alert-call-provider-method", () => ({ AlertMethodModal: () => null, })); jest.mock("@/features/workflows/manual-run-workflow", () => ({ ManualRunWorkflowModal: () => null, })); jest.mock("@/features/alerts/dismiss-alert", () => ({ AlertDismissModal: () => null, })); jest.mock("@/features/alerts/view-raw-alert", () => ({ // Renders a testid only when an alert is supplied so tests can assert on it. ViewAlertModal: ({ alert }: any) => alert ?
: null, })); jest.mock("@/features/alerts/alert-change-status", () => ({ AlertChangeStatusModal: () => null, })); jest.mock("@/features/alerts/enrich-alert", () => ({ EnrichAlertSidePanel: ({ isOpen }: any) => isOpen ?
: null, })); jest.mock("@/app/(keep)/not-found", () => ({ __esModule: true, default: () =>
Not Found
, })); // ─── Helpers ───────────────────────────────────────────────────────────────── const makeAlert = (fingerprint: string) => ({ id: fingerprint, fingerprint, name: `Alert ${fingerprint}`, description: "", severity: "critical", status: "firing", source: ["prometheus"], lastReceived: new Date(), environment: "production", pushed: false, deleted: false, dismissed: false, enriched_fields: [], }); const baseAlertsData = { alerts: [] as ReturnType[], alertsLoading: false, mutateAlerts: jest.fn(), alertsError: null, totalCount: 0, facetsCel: null, facetsPanelRefreshToken: null, }; const mockReplace = jest.fn(); const mockSearchParamsGet = jest.fn(); // ─── Global setup ──────────────────────────────────────────────────────────── beforeEach(() => { jest.clearAllMocks(); (useRouter as jest.Mock).mockReturnValue({ replace: mockReplace, push: jest.fn(), back: jest.fn(), }); // Return an object with a controllable .get() so each test can set params. (useSearchParams as jest.Mock).mockReturnValue({ get: mockSearchParamsGet, }); // Default: no query params. mockSearchParamsGet.mockReturnValue(null); (useProviders as jest.Mock).mockReturnValue({ data: { installed_providers: [] }, }); // Return empty saved presets; "feed" comes from defaultPresets inside the // component so selectedPreset will be found without any extra setup. (usePresets as jest.Mock).mockReturnValue({ dynamicPresets: [], isLoading: false, }); (useAlertsTableData as jest.Mock).mockReturnValue(baseAlertsData); }); // ─── Tests ─────────────────────────────────────────────────────────────────── describe("Alerts — fingerprint modal fix (dataSettled guard)", () => { it("does NOT fire error when alerts is briefly empty but totalCount > 0 (stale-empty SWR flash)", async () => { // Regression test for the 3-render cascade in useLastAlerts: // SWR marks isLoading=false before the React state carrying the real results // has been flushed. For one render, alerts=[] while totalCount is already // the real count (>0). 
The fix: only act when alerts.length>0 OR totalCount===0. const alert = makeAlert("fp-stale"); mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-stale" : null ); // Phase 1 — stale-empty flash: alerts=[], alertsLoading=false, totalCount=5 (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [], alertsLoading: false, totalCount: 5, }); const { rerender } = render( ); // No error should fire during the stale-empty phase. await waitFor(() => { expect(showErrorToast).not.toHaveBeenCalled(); }); // Phase 2 — real data arrives: alerts=[alert], totalCount=1 (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [alert], alertsLoading: false, totalCount: 1, }); await act(async () => { rerender(); }); // Modal should open and still no error. expect(showErrorToast).not.toHaveBeenCalled(); expect(mockReplace).not.toHaveBeenCalled(); }); }); describe("Alerts — fingerprint modal fix (resolvedFingerprintRef)", () => { it("opens view modal when fingerprint is found and shows no error", async () => { const alert = makeAlert("fp-abc"); mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-abc" : null ); (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [alert], }); const { findByTestId } = render( ); // Modal should appear. await findByTestId("view-alert-modal"); expect(showErrorToast).not.toHaveBeenCalled(); }); it("shows error toast when fingerprint is not in the alerts list", async () => { mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-missing" : null ); // Alerts list present but does not contain the requested fingerprint. 
(useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [makeAlert("fp-other")], }); render(); await waitFor(() => { expect(showErrorToast).toHaveBeenCalledWith( null, "Alert fingerprint not found" ); }); // URL should have been cleared. expect(mockReplace).toHaveBeenCalled(); }); it("does NOT show error toast on background re-fetch after fingerprint was resolved", async () => { // Core regression test: after a successful modal open, the alerts list // briefly empties (due to a polling re-fetch), then repopulates. // Without the fix, the empty-list evaluation fires the error toast. const alert = makeAlert("fp-abc"); mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-abc" : null ); // Step 1 — alert is present; modal opens and ref is stored. (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [alert], }); const { rerender } = render( ); await waitFor(() => { expect(showErrorToast).not.toHaveBeenCalled(); }); // Step 2 — alerts list empties mid-refetch. (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [], }); await act(async () => { rerender(); }); // The fix: resolvedFingerprintRef is still "fp-abc" so the error path is // skipped. 
expect(showErrorToast).not.toHaveBeenCalled(); expect(mockReplace).not.toHaveBeenCalled(); }); it("opens enrich sidebar when both fingerprint and enrich params are present", async () => { const alert = makeAlert("fp-enrich"); mockSearchParamsGet.mockImplementation((key: string) => { if (key === "alertPayloadFingerprint") return "fp-enrich"; if (key === "enrich") return "true"; return null; }); (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [alert], }); const { findByTestId } = render( ); await findByTestId("enrich-sidebar"); expect(showErrorToast).not.toHaveBeenCalled(); }); it("resets the ref and opens modal correctly when navigating to a different fingerprint", async () => { // Ensure that navigating from fp-1 to fp-2 does NOT inherit fp-1's ref // and still opens fp-2's modal without errors. const alert1 = makeAlert("fp-1"); const alert2 = makeAlert("fp-2"); mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-1" : null ); (useAlertsTableData as jest.Mock).mockReturnValue({ ...baseAlertsData, alerts: [alert1, alert2], }); const { rerender } = render( ); // First fingerprint resolved — no errors. await waitFor(() => { expect(showErrorToast).not.toHaveBeenCalled(); }); // Navigate to a different fingerprint. mockSearchParamsGet.mockImplementation((key: string) => key === "alertPayloadFingerprint" ? "fp-2" : null ); (showErrorToast as jest.Mock).mockClear(); await act(async () => { rerender(); }); // fp-2 is present in the list, so the modal should open without error. 
expect(showErrorToast).not.toHaveBeenCalled(); }); }); ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/alert-table-alert-facets.tsx ================================================ import React, { useCallback } from "react"; import { AlertFacetsProps, FacetValue } from "./alert-table-facet-types"; import { Facet } from "./alert-table-facet"; import { getFilteredAlertsForFacet, getSeverityOrder, } from "./alert-table-facet-utils"; import { useLocalStorage } from "@/utils/hooks/useLocalStorage"; import { AlertDto } from "@/entities/alerts/model"; import { DynamicFacetWrapper, AddFacetModal, } from "./alert-table-facet-dynamic"; import { PlusIcon } from "@heroicons/react/24/outline"; import { usePathname } from "next/navigation"; export const AlertFacets: React.FC = ({ alerts, facetFilters, setFacetFilters, dynamicFacets, setDynamicFacets, onDelete, className, table, showSkeleton, }) => { const pathname = usePathname(); const timeRangeFilter = table .getState() .columnFilters.find((filter) => filter.id === "lastReceived"); const timeRange = timeRangeFilter?.value as | { start: Date; end: Date; isFromCalendar: boolean } | undefined; const presetName = pathname?.split("/").pop() || "default"; const [isModalOpen, setIsModalOpen] = useLocalStorage( `addFacetModalOpen-${presetName}`, false ); const handleSelect = ( facetKey: string, value: string, exclusive: boolean, isAllOnly: boolean ) => { const newFilters = { ...facetFilters }; if (isAllOnly) { if (exclusive) { newFilters[facetKey] = [value]; } else { delete newFilters[facetKey]; } } else { if (exclusive) { newFilters[facetKey] = [value]; } else { const currentValues = newFilters[facetKey] || []; if (currentValues.includes(value)) { newFilters[facetKey] = currentValues.filter((v) => v !== value); if (newFilters[facetKey].length === 0) { delete newFilters[facetKey]; } } else { newFilters[facetKey] = [...currentValues, value]; } } } setFacetFilters(newFilters); }; const 
getFacetValues = useCallback( (key: keyof AlertDto | string): FacetValue[] => { const filteredAlerts = getFilteredAlertsForFacet( alerts, facetFilters, key, timeRange ); const valueMap = new Map(); let nullCount = 0; filteredAlerts.forEach((alert) => { let value; // Handle nested keys like "labels.host" if (typeof key === "string" && key.includes(".")) { const [parentKey, childKey] = key.split("."); const parentValue = alert[parentKey as keyof AlertDto]; if ( typeof parentValue === "object" && parentValue !== null && !Array.isArray(parentValue) && !(parentValue instanceof Date) ) { value = (parentValue as Record)[childKey]; } else { value = undefined; } } else { value = alert[key as keyof AlertDto]; } if (Array.isArray(value)) { if (value.length === 0) { nullCount++; } else { value.forEach((v) => { valueMap.set(v, (valueMap.get(v) || 0) + 1); }); } } else if (value !== undefined && value !== null) { const strValue = String(value); valueMap.set(strValue, (valueMap.get(strValue) || 0) + 1); } else { nullCount++; } }); let values = Array.from(valueMap.entries()).map(([label, count]) => ({ label, count, isSelected: facetFilters[key]?.includes(label) || !facetFilters[key]?.length, })); if (["assignee", "incident"].includes(key as string) && nullCount > 0) { values.push({ label: "n/a", count: nullCount, isSelected: facetFilters[key]?.includes("n/a") || !facetFilters[key]?.length, }); } if (key === "severity") { values.sort((a, b) => { if (a.label === "n/a") return 1; if (b.label === "n/a") return -1; const orderDiff = getSeverityOrder(b.label) - getSeverityOrder(a.label); if (orderDiff !== 0) return orderDiff; return b.count - a.count; }); } else { values.sort((a, b) => { if (a.label === "n/a") return 1; if (b.label === "n/a") return -1; return b.count - a.count; }); } return values; }, [alerts, facetFilters, timeRange] ); const staticFacets = [ "severity", "status", "source", "assignee", "dismissed", "incident", ]; const handleAddFacet = (column: string) => { 
setDynamicFacets([ ...dynamicFacets, { key: column, name: column.charAt(0).toUpperCase() + column.slice(1), }, ]); }; const handleDeleteFacet = (facetKey: string) => { setDynamicFacets(dynamicFacets.filter((df) => df.key !== facetKey)); const newFilters = { ...facetFilters }; delete newFilters[facetKey]; setFacetFilters(newFilters); }; return (
{/* Facet button */} handleSelect("severity", value, exclusive, isAllOnly) } facetKey="severity" facetFilters={facetFilters} showSkeleton={showSkeleton} /> handleSelect("status", value, exclusive, isAllOnly) } facetKey="status" facetFilters={facetFilters} showSkeleton={showSkeleton} /> handleSelect("source", value, exclusive, isAllOnly) } facetKey="source" facetFilters={facetFilters} showSkeleton={showSkeleton} /> handleSelect("assignee", value, exclusive, isAllOnly) } facetKey="assignee" facetFilters={facetFilters} showSkeleton={showSkeleton} /> handleSelect("dismissed", value, exclusive, isAllOnly) } facetKey="dismissed" facetFilters={facetFilters} showSkeleton={showSkeleton} /> handleSelect("incident", value, exclusive, isAllOnly) } facetFilters={facetFilters} showSkeleton={showSkeleton} /> {/* Dynamic facets */} {dynamicFacets.map((facet) => ( handleSelect(facet.key, value, exclusive, isAllOnly) } facetKey={facet.key} facetFilters={facetFilters} onDelete={() => handleDeleteFacet(facet.key)} /> ))} {/* Facet Modal */} setIsModalOpen(false)} table={table} onAddFacet={handleAddFacet} existingFacets={[ ...staticFacets, ...dynamicFacets.map((df) => df.key), ]} />
); }; ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/alert-table-facet-dynamic.tsx ================================================ import React, { useState } from "react"; import { TextInput } from "@tremor/react"; import { TrashIcon } from "@heroicons/react/24/outline"; import { FacetProps } from "./alert-table-facet-types"; import { AlertDto } from "@/entities/alerts/model"; import { Facet } from "./alert-table-facet"; import Modal from "@/components/ui/Modal"; import { Table } from "@tanstack/table-core"; import { FiSearch } from "react-icons/fi"; interface AddFacetModalProps { isOpen: boolean; onClose: () => void; table: Table; onAddFacet: (column: string) => void; existingFacets: string[]; } export const AddFacetModal: React.FC = ({ isOpen, onClose, table, onAddFacet, existingFacets, }) => { const [searchTerm, setSearchTerm] = useState(""); const availableColumns = table .getAllColumns() .filter( (col) => // Filter out pinned columns and existing facets !col.getIsPinned() && !existingFacets.includes(col.id) && // Filter by search term col.id.toLowerCase().includes(searchTerm.toLowerCase()) ) .map((col) => col.id); return (
setSearchTerm(e.target.value)} className="mb-4" />
{availableColumns.map((column) => ( ))}
); }; export interface DynamicFacetProps extends FacetProps { onDelete: () => void; } export const DynamicFacetWrapper: React.FC = ({ onDelete, ...facetProps }) => { return (
/** Label the facet UI uses for alerts that have no value for a facet. */
const NOT_AVAILABLE_LABEL = "n/a";

/**
 * Reads the value a facet refers to from an alert.
 * A dotted key is split at its first dot only ("labels.a.b" reads the "a.b"
 * entry of `alert.labels`); the parent must be a plain object.
 */
const readFacetValue = (alert: AlertDto, facetKey: string): unknown => {
  const dotIndex = facetKey.indexOf(".");
  if (dotIndex === -1) {
    return alert[facetKey as keyof AlertDto];
  }
  const parentValue: unknown =
    alert[facetKey.slice(0, dotIndex) as keyof AlertDto];
  const isPlainObject =
    typeof parentValue === "object" &&
    parentValue !== null &&
    !Array.isArray(parentValue) &&
    !(parentValue instanceof Date);
  return isPlainObject
    ? (parentValue as Record<string, unknown>)[facetKey.slice(dotIndex + 1)]
    : undefined;
};

/**
 * Decides whether an alert's facet value satisfies the selected labels.
 * Missing values (null, undefined, "", empty array) match only "n/a";
 * arrays match when any element is selected; everything else is compared
 * by its string form.
 */
const matchesFacetSelection = (
  value: unknown,
  includedValues: string[]
): boolean => {
  const isMissing =
    value === null ||
    value === undefined ||
    value === "" ||
    (Array.isArray(value) && value.length === 0);
  if (isMissing) {
    return includedValues.includes(NOT_AVAILABLE_LABEL);
  }
  if (Array.isArray(value)) {
    return value.some((item) => includedValues.includes(String(item)));
  }
  return includedValues.includes(String(value));
};

/**
 * Returns the alerts that should be counted when rendering one facet.
 *
 * An alert is kept when it falls inside the time range (if one is given) and
 * satisfies the selection of every facet except `currentFacetKey`. The
 * current facet is skipped so that its own unselected values stay visible.
 * Within one facet the selected labels are alternatives: ["n/a", "john"]
 * keeps both unassigned alerts and alerts assigned to "john".
 *
 * @param alerts - Alerts to filter.
 * @param facetFilters - Selected labels per facet key; an empty list means
 *   "no filter" for that facet.
 * @param currentFacetKey - Facet whose own selection is ignored.
 * @param timeRange - Optional `lastReceived` window, applied only when both
 *   `start` and `end` are set. Unless `isQuickPresetRange` accepts the range,
 *   the end is extended to 23:59:59.999 (local time) of the end day.
 * @returns A new array with the matching alerts, in their original order.
 */
export const getFilteredAlertsForFacet = (
  alerts: AlertDto[],
  facetFilters: FacetFilters,
  currentFacetKey: string,
  timeRange?: { start: Date; end: Date; isFromCalendar: boolean }
) => {
  // The range bounds do not depend on the alert, so compute them once.
  let rangeStart: Date | null = null;
  let rangeEnd: Date | null = null;
  if (timeRange?.start && timeRange?.end) {
    rangeStart = new Date(timeRange.start);
    rangeEnd = new Date(timeRange.end);
    if (!isQuickPresetRange(timeRange)) {
      rangeEnd.setHours(23, 59, 59, 999);
    }
  }

  const activeFilters = Object.entries(facetFilters).filter(
    ([facetKey, includedValues]) =>
      facetKey !== currentFacetKey && includedValues.length > 0
  );

  return alerts.filter((alert) => {
    if (rangeStart && rangeEnd) {
      const lastReceived = new Date(alert.lastReceived);
      if (lastReceived < rangeStart || lastReceived > rangeEnd) {
        return false;
      }
    }
    return activeFilters.every(([facetKey, includedValues]) =>
      matchesFacetSelection(readFacetValue(alert, facetKey), includedValues)
    );
  });
};

/**
 * Maps a severity name to its sort rank (higher means more severe).
 *
 * @param severity - Severity label as it appears on the alert.
 * @returns 1 for "low" up to 5 for "critical" ("error" and "high" share 4);
 *   any other label returns 6, so it ranks above every known severity.
 */
export const getSeverityOrder = (severity: string): number => {
  switch (severity) {
    case "low":
      return 1;
    case "info":
      return 2;
    case "warning":
      return 3;
    case "error":
    case "high":
      return 4;
    case "critical":
      return 5;
    default:
      return 6;
  }
};
FacetValueProps } from "./alert-table-facet-types"; import { getStatusIcon, getStatusColor } from "@/shared/lib/status-utils"; import { BellIcon, BellSlashIcon, FireIcon } from "@heroicons/react/24/outline"; import clsx from "clsx"; import { useIncidents } from "@/utils/hooks/useIncidents"; import { getIncidentName } from "@/entities/incidents/lib/utils"; import { UserStatefulAvatar } from "@/entities/users/ui"; import { useUser } from "@/entities/users/model/useUser"; import { SeverityBorderIcon, UISeverity } from "@/shared/ui"; import { DynamicImageProviderIcon } from "@/components/ui"; const AssigneeLabel = ({ email }: { email: string }) => { const user = useUser(email); return user ? user.name : email; }; export const FacetValue: React.FC = ({ label, count, isSelected, onSelect, facetKey, showIcon = false, facetFilters, }) => { const { data: incidents } = useIncidents( { candidate: false, predicted: null, limit: 100, offset: undefined, sorting: undefined, cel: "", }, { revalidateOnFocus: false, } ); const incidentMap = useMemo(() => { return new Map( incidents?.items.map((incident) => [ incident.id.replaceAll("-", ""), incident, ]) || [] ); }, [incidents]); const incident = useMemo( () => (facetKey === "incident" ? 
incidentMap.get(label) : null), [incidentMap, facetKey, label] ); const handleCheckboxClick = (e: React.MouseEvent) => { e.stopPropagation(); onSelect(label, false, false); }; const isExclusivelySelected = () => { const currentFilter = facetFilters[facetKey] || []; return currentFilter.length === 1 && currentFilter[0] === label; }; const handleActionClick = (e: React.MouseEvent) => { e.stopPropagation(); if (isExclusivelySelected()) { onSelect("", false, true); } else { onSelect(label, true, true); } }; const getValueIcon = useCallback( (label: string, facetKey: string) => { if (facetKey === "source") { return ( ); } if (facetKey === "severity") { return ; } if (facetKey === "assignee") { return ; } if (facetKey === "status") { return ( ); } if (facetKey === "dismissed") { return ( ); } if (facetKey === "incident") { if (incident) { return ( ); } return ( ); } return null; }, [incident] ); const humanizeLabel = useCallback( (label: string, facetKey: string) => { if (facetKey === "assignee") { if (label === "n/a") { return "Not assigned"; } return ; } if (facetKey === "incident") { if (label === "n/a") { return "No incident"; } if (incident) { return getIncidentName(incident); } else { return label; } } if (facetKey === "dismissed") { return label === "true" ? "Dismissed" : "Not dismissed"; } return {label}; }, [incident] ); const currentFilter = facetFilters[facetKey] || []; const isValueSelected = !currentFilter?.length || currentFilter.includes(label); return (
{}} style={{ accentColor: "#eb6221" }} className="h-4 w-4 rounded border-gray-300 cursor-pointer" />
{showIcon && (
{getValueIcon(label, facetKey)}
)} {humanizeLabel(label, facetKey)}
{count > 0 && ( {count} )}
); }; ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/alert-table-facet.tsx ================================================ import React from "react"; import { Title } from "@tremor/react"; import { ChevronDownIcon, ChevronRightIcon } from "@heroicons/react/20/solid"; import { FacetProps } from "./alert-table-facet-types"; import { FacetValue } from "./alert-table-facet-value"; import { useLocalStorage } from "@/utils/hooks/useLocalStorage"; import { usePathname } from "next/navigation"; import Skeleton from "react-loading-skeleton"; export const Facet: React.FC = ({ name, values, onSelect, facetKey, facetFilters, showIcon = true, showSkeleton, }) => { const pathname = usePathname(); // Get preset name from URL const presetName = pathname?.split("/").pop() || "default"; // Store open/close state in localStorage with a unique key per preset and facet const [isOpen, setIsOpen] = useLocalStorage( `facet-${presetName}-${facetKey}-open`, true ); // Store filter value in localStorage per preset and facet const [filter, setFilter] = useLocalStorage( `facet-${presetName}-${facetKey}-filter`, "" ); const filteredValues = values.filter((v) => v.label.toLowerCase().includes(filter.toLowerCase()) ); const Icon = isOpen ? ChevronDownIcon : ChevronRightIcon; return (
setIsOpen(!isOpen)} >
{name}
{isOpen && (
{values.length >= 10 && (
setFilter(e.target.value)} className="w-full px-2 py-1 text-sm border border-gray-300 rounded" />
)}
{showSkeleton ? ( Array.from({ length: 3 }).map((_, index) => (
)) ) : values.length > 0 ? ( filteredValues.map((value) => ( )) ) : (
No matching values found
)}
)}
); }; ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/alert-table-tab-panel-server-side.tsx ================================================ import { FacetDto } from "@/features/filter"; import { AlertTableServerSide } from "@/widgets/alerts-table/ui/alert-table-server-side"; import { useAlertTableCols } from "@/widgets/alerts-table/lib/alert-table-utils"; import { AlertDto, AlertKnownKeys, AlertsQuery, getTabsFromPreset, } from "@/entities/alerts/model"; import { Preset } from "@/entities/presets/model/types"; import { AlertsTableDataQuery } from "@/widgets/alerts-table/ui/useAlertsTableData"; interface Props { initialFacets: FacetDto[]; alerts: AlertDto[]; alertsTotalCount: number; facetsCel: string | null; facetsPanelRefreshToken: string | undefined; preset: Preset; isAsyncLoading: boolean; setTicketModalAlert: (alert: AlertDto | null) => void; setNoteModalAlert: (alert: AlertDto | null) => void; setRunWorkflowModalAlert: (alert: AlertDto | null) => void; setDismissModalAlert: (alert: AlertDto[] | null) => void; setChangeStatusAlert: (alert: AlertDto | null) => void; mutateAlerts: () => void; onReload?: (query: AlertsQuery) => void; onQueryChange?: (query: AlertsTableDataQuery) => void; } export default function AlertTableTabPanelServerSide({ initialFacets, alerts, alertsTotalCount, preset, facetsCel, facetsPanelRefreshToken, isAsyncLoading, setTicketModalAlert, setNoteModalAlert, setRunWorkflowModalAlert, setDismissModalAlert, setChangeStatusAlert, mutateAlerts, onReload, onQueryChange, }: Props) { const additionalColsToGenerate = [ ...new Set( alerts?.flatMap((alert) => { const keys = Object.keys(alert).filter( (key) => !AlertKnownKeys.includes(key) ); return keys.flatMap((key) => { if ( typeof alert[key as keyof AlertDto] === "object" && alert[key as keyof AlertDto] !== null ) { return Object.keys(alert[key as keyof AlertDto] as object).map( (subKey) => `${key}.${subKey}` ); } return key; }); }) || [] ), ]; const 
alertTableColumns = useAlertTableCols({ additionalColsToGenerate: additionalColsToGenerate, isCheckboxDisplayed: preset.name !== "deleted" && preset.name !== "dismissed", isMenuDisplayed: true, setTicketModalAlert: setTicketModalAlert, setNoteModalAlert: setNoteModalAlert, setRunWorkflowModalAlert: setRunWorkflowModalAlert, setDismissModalAlert: setDismissModalAlert, setChangeStatusAlert: setChangeStatusAlert, presetName: preset.name, presetNoisy: preset.is_noisy, }); const presetTabs = getTabsFromPreset(preset); return ( ); } ================================================ FILE: keep-ui/app/(keep)/alerts/[id]/ui/alerts.tsx ================================================ "use client"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { useRouter, useSearchParams } from "next/navigation"; import { type AlertDto, type AlertsQuery } from "@/entities/alerts/model"; import { usePresets, type Preset } from "@/entities/presets/model"; import { AlertHistoryModal } from "@/features/alerts/alert-history"; import { AlertAssignTicketModal } from "@/features/alerts/alert-assign-ticket"; import { AlertNoteModal } from "@/features/alerts/alert-note"; import { AlertMethodModal } from "@/features/alerts/alert-call-provider-method"; import { ManualRunWorkflowModal } from "@/features/workflows/manual-run-workflow"; import { AlertDismissModal } from "@/features/alerts/dismiss-alert"; import { ViewAlertModal } from "@/features/alerts/view-raw-alert"; import { AlertChangeStatusModal } from "@/features/alerts/alert-change-status"; import { EnrichAlertSidePanel } from "@/features/alerts/enrich-alert"; import { FacetDto } from "@/features/filter"; import { useApi } from "@/shared/lib/hooks/useApi"; import { KeepLoader, showErrorToast } from "@/shared/ui"; import NotFound from "@/app/(keep)/not-found"; import AlertTableTabPanelServerSide from "./alert-table-tab-panel-server-side"; import { useProviders } from "@/utils/hooks/useProviders"; import { 
useAlertsTableData, AlertsTableDataQuery, } from "@/widgets/alerts-table/ui/useAlertsTableData"; const defaultPresets: Preset[] = [ { id: "11111111-1111-1111-1111-111111111111", // FEED_PRESET_ID name: "feed", options: [], is_private: false, is_noisy: false, alerts_count: 0, should_do_noise_now: false, tags: [], counter_shows_firing_only: false, }, ]; type AlertsProps = { initialFacets: FacetDto[]; presetName: string; }; export default function Alerts({ presetName, initialFacets }: AlertsProps) { const api = useApi(); const [alertsQueryState, setAlertsQueryState] = useState< AlertsQuery | undefined >(); const [alertsTableDataQuery, setAlertsTableDataQuery] = useState(); const { data: providersData = { installed_providers: [] } } = useProviders(); const router = useRouter(); const ticketingProviders = useMemo( () => providersData.installed_providers.filter((provider) => provider.tags.includes("ticketing") ), [providersData.installed_providers] ); const searchParams = useSearchParams(); // hooks for the note and ticket modals const [noteModalAlert, setNoteModalAlert] = useState(); const [ticketModalAlert, setTicketModalAlert] = useState(); const [runWorkflowModalAlert, setRunWorkflowModalAlert] = useState(); const [dismissModalAlert, setDismissModalAlert] = useState< AlertDto[] | null >(); const [changeStatusAlert, setChangeStatusAlert] = useState(); const [viewAlertModal, setViewAlertModal] = useState(); const [viewEnrichAlertModal, setEnrichAlertModal] = useState(); const [isEnrichSidebarOpen, setIsEnrichSidebarOpen] = useState(false); const { dynamicPresets: savedPresets = [], isLoading: _isPresetsLoading } = usePresets({ revalidateOnFocus: false, }); const isPresetsLoading = _isPresetsLoading || !api.isReady(); const presets = [...defaultPresets, ...savedPresets] as const; const selectedPreset = presets.find( (preset) => preset.name.toLowerCase() === decodeURIComponent(presetName) ); const { alerts, alertsLoading, mutateAlerts, alertsError: alertsError, 
totalCount, facetsCel, facetsPanelRefreshToken, } = useAlertsTableData(alertsTableDataQuery); // Track which fingerprint has already been resolved so that a background // alerts re-fetch (polling / WebSocket) doesn't fire "not found" after the // modal was successfully opened. const resolvedFingerprintRef = useRef(null); useEffect(() => { const fingerprint = searchParams?.get("alertPayloadFingerprint"); const enrich = searchParams?.get("enrich"); // Reset when the user navigates to a different fingerprint. if (fingerprint !== resolvedFingerprintRef.current) { resolvedFingerprintRef.current = null; } // Only act once data is actually settled: either we have alerts to search // through, or the backend confirmed there are zero results (totalCount === 0). // This guards against a 3-render cascade in useLastAlerts where `alerts` // briefly equals [] while `isLoading` is already false but the React state // carrying the actual results hasn't been flushed yet. const dataSettled = alerts && !alertsLoading && (alerts.length > 0 || totalCount === 0); if (fingerprint && enrich && dataSettled) { const alert = alerts?.find((alert) => alert.fingerprint === fingerprint); if (alert) { resolvedFingerprintRef.current = fingerprint; setEnrichAlertModal(alert); setIsEnrichSidebarOpen(true); } else if (!resolvedFingerprintRef.current) { showErrorToast(null, "Alert fingerprint not found"); resetUrlAfterModal(); } } else if (fingerprint && dataSettled) { const alert = alerts?.find((alert) => alert.fingerprint === fingerprint); if (alert) { resolvedFingerprintRef.current = fingerprint; setViewAlertModal(alert); } else if (!resolvedFingerprintRef.current) { showErrorToast(null, "Alert fingerprint not found"); resetUrlAfterModal(); } } else if (alerts && !alertsLoading) { resolvedFingerprintRef.current = null; setViewAlertModal(null); setEnrichAlertModal(null); setIsEnrichSidebarOpen(false); } }, [searchParams, alerts, alertsLoading, totalCount]); const alertsQueryStateRef = 
useRef(alertsQueryState); const reloadAlerts = useCallback( (alertsQuery: AlertsQuery) => { // if the query is the same as the last one, just refetch if ( JSON.stringify(alertsQuery) === JSON.stringify(alertsQueryStateRef.current) ) { mutateAlerts(); return; } // if the query is different, update the state setAlertsQueryState(alertsQuery); alertsQueryStateRef.current = alertsQuery; }, [setAlertsQueryState] ); const resetUrlAfterModal = useCallback(() => { const currentParams = new URLSearchParams(window.location.search); Array.from(currentParams.keys()) .filter((paramKey) => paramKey !== "cel") .forEach((paramKey) => currentParams.delete(paramKey)); let url = `${window.location.pathname}`; if (currentParams.toString()) { url += `?${currentParams.toString()}`; } router.replace(url); }, [router]); // if we don't have presets data yet, just show loading if (!selectedPreset && isPresetsLoading) { return ; } // if we have an error, throw it, error.tsx will catch it if (alertsError) { throw alertsError; } if (!selectedPreset) { return ; } return ( <> setDismissModalAlert(null)} /> setChangeStatusAlert(null)} /> setTicketModalAlert(null)} ticketingProviders={ticketingProviders} alert={ticketModalAlert ?? null} /> setNoteModalAlert(null)} alert={noteModalAlert ?? 
null} /> setRunWorkflowModalAlert(null)} /> resetUrlAfterModal()} mutate={mutateAlerts} /> { setIsEnrichSidebarOpen(false); resetUrlAfterModal(); }} mutate={mutateAlerts} /> ); } ================================================ FILE: keep-ui/app/(keep)/dashboard/GridItem.tsx ================================================ import React, { useState } from "react"; import { Card } from "@tremor/react"; import MenuButton from "./MenuButton"; import { WidgetData } from "./types"; import PresetGridItem from "./widget-types/preset/preset-grid-item"; import MetricGridItem from "./widget-types/metric/metric-grid-item"; import GenericMetricsGridItem from "./widget-types/generic-metrics/generic-metrics-grid-item"; interface GridItemProps { item: WidgetData; onEdit: (id: string, updateData?: WidgetData) => void; onDelete: (id: string) => void; onSave: (updateItem: WidgetData) => void; } const GridItem: React.FC = ({ item, onEdit, onDelete, onSave, }) => { const [updatedItem, setUpdatedItem] = useState(item); const handleEdit = () => { onEdit(updatedItem.i, updatedItem); }; return (
{item.name} onDelete(item.i)} onSave={() => { onSave(updatedItem); }} />
{item.preset && } {item.metric && } {item.genericMetrics && ( )}
); }; export default GridItem; ================================================ FILE: keep-ui/app/(keep)/dashboard/GridItemContainer.tsx ================================================ import React from "react"; import GridItem from "./GridItem"; import { WidgetData } from "./types"; interface GridItemContainerProps { item: WidgetData; onEdit: (id: string) => void; onDelete: (id: string) => void; onSave: (updateItem: WidgetData) => void; } const GridItemContainer: React.FC = ({ item, onEdit, onDelete, onSave, }) => { return ( onEdit(item.i)} onDelete={() => onDelete(item.i)} onSave={onSave} /> ); }; export default GridItemContainer; ================================================ FILE: keep-ui/app/(keep)/dashboard/GridLayout.tsx ================================================ import React from "react"; import { Responsive, WidthProvider, Layout } from "react-grid-layout"; import GridItemContainer from "./GridItemContainer"; import { LayoutItem, WidgetData } from "./types"; import "react-grid-layout/css/styles.css"; import { MetricsWidget } from "@/utils/hooks/useDashboardMetricWidgets"; import { Preset } from "@/entities/presets/model/types"; const ResponsiveGridLayout = WidthProvider(Responsive); interface GridLayoutProps { layout: LayoutItem[]; onLayoutChange: (layout: LayoutItem[]) => void; data: WidgetData[]; onEdit: (id: string) => void; onDelete: (id: string) => void; presets: Preset[]; onSave: (updateItem: WidgetData) => void; metrics: MetricsWidget[]; } const GridLayout: React.FC = ({ layout, onLayoutChange, data, onEdit, onDelete, onSave, presets, metrics, }) => { const layouts = { lg: layout }; return ( <> { const updatedLayout = currentLayout.map((item) => ({ ...item, static: item.static ?? 
false, // Ensure static is a boolean })); onLayoutChange(updatedLayout as LayoutItem[]); }} breakpoints={{ lg: 1200, md: 996, sm: 768, xs: 480, xxs: 0 }} cols={{ lg: 24, md: 20, sm: 12, xs: 8, xxs: 4 }} rowHeight={30} containerPadding={[0, 0]} margin={[10, 10]} useCSSTransforms={true} isDraggable={true} isResizable={true} compactType={null} draggableHandle=".grid-item__widget" transformScale={1} > {data.map((item) => { //Updating the static hardcode db value. if (item.preset) { const preset = presets?.find((p) => p?.id === item?.preset?.id); item.preset = { ...item.preset, alerts_count: preset?.alerts_count ?? 0, }; } else if (item.metric) { const metric = metrics?.find((m) => m?.id === item?.metric?.id); if (metric) { item.metric = { ...metric }; } } return (
); })}
); }; export default GridLayout; ================================================ FILE: keep-ui/app/(keep)/dashboard/MenuButton.tsx ================================================ import React, { Fragment } from "react"; import { Menu, Transition } from "@headlessui/react"; import { Icon } from "@tremor/react"; import { PencilIcon, TrashIcon } from "@heroicons/react/24/outline"; import { Bars3Icon } from "@heroicons/react/20/solid"; import { FiSave } from "react-icons/fi"; interface MenuButtonProps { onEdit: () => void; onDelete: () => void; onSave?: () => void; } const MenuButton: React.FC = ({ onEdit, onDelete, onSave, }) => { const stopPropagation = (e: React.MouseEvent) => { e.stopPropagation(); }; return (
{({ active }) => ( )} {({ active }) => ( )} {onSave && ( {({ active }) => ( )} )}
); }; export default MenuButton; ================================================ FILE: keep-ui/app/(keep)/dashboard/WidgetModal.tsx ================================================ import React, { useState } from "react"; import Modal from "@/components/ui/Modal"; import { Button, Select, SelectItem, Subtitle, TextInput } from "@tremor/react"; import { WidgetData, WidgetType } from "./types"; import { Controller, get, useForm, useWatch } from "react-hook-form"; import { MetricsWidget } from "@/utils/hooks/useDashboardMetricWidgets"; import { Preset } from "@/entities/presets/model/types"; import { PresetWidgetForm } from "./widget-types/preset/preset-widget-form"; import { MetricWidgetForm } from "./widget-types/metric/metric-widget-form"; import { GenericMetricsWidgetForm } from "./widget-types/generic-metrics/generic-metrics-widget-form"; interface WidgetForm { widgetName: string; widgetType: WidgetType; } interface WidgetModalProps { isOpen: boolean; onClose: () => void; onAddWidget: (widget: any) => void; onEditWidget: (updatedWidget: WidgetData) => void; presets: Preset[]; editingItem?: WidgetData | null; metricWidgets: MetricsWidget[]; } const WidgetModal: React.FC = ({ isOpen, onClose, onAddWidget, onEditWidget, presets, editingItem, metricWidgets, }) => { const [innerFormState, setInnerFormState] = useState<{ isValid: boolean; formValue: any; }>({ isValid: false, formValue: {} }); const { control, handleSubmit, formState: { errors, isValid }, reset, } = useForm({ defaultValues: { widgetName: editingItem?.name || "", widgetType: editingItem?.widgetType || WidgetType.PRESET, }, }); const widgetType = useWatch({ control, name: "widgetType", }); const onSubmit = (data: WidgetForm) => { if (editingItem) { let updatedWidget: WidgetData = { ...editingItem, name: data.widgetName, widgetType: data.widgetType || WidgetType.PRESET, // backwards compatibility ...innerFormState.formValue, }; onEditWidget(updatedWidget); } else { onAddWidget({ name: data.widgetName, 
widgetType: data.widgetType || WidgetType.PRESET, // backwards compatibility ...innerFormState.formValue, }); // cleanup form reset({ widgetName: "", widgetType: WidgetType.PRESET, }); } onClose(); }; return (
Widget Name ( )} />
Widget Type { return ( ); }} />
{widgetType === WidgetType.PRESET && ( setInnerFormState({ formValue, isValid }) } > )} {widgetType == WidgetType.GENERICS_METRICS && ( <> setInnerFormState({ formValue, isValid }) } > )} {widgetType === WidgetType.METRIC && ( setInnerFormState({ formValue, isValid }) } > )}
); }; export default WidgetModal; ================================================ FILE: keep-ui/app/(keep)/dashboard/[id]/dashboard.tsx ================================================ "use client"; import { useParams } from "next/navigation"; import { ChangeEvent, useEffect, useState } from "react"; import GridLayout from "../GridLayout"; import WidgetModal from "../WidgetModal"; import { Button, Card, Icon, Subtitle, TextInput } from "@tremor/react"; import { GenericsMetrics, LayoutItem, Threshold, WidgetData, WidgetType, } from "../types"; import { FiEdit2, FiSave } from "react-icons/fi"; import { useDashboards } from "utils/hooks/useDashboards"; import { toast } from "react-toastify"; import { GenericFilters } from "@/components/filters/GenericFilters"; import { useDashboardPreset } from "utils/hooks/useDashboardPresets"; import { MetricsWidget, useDashboardMetricWidgets, } from "@/utils/hooks/useDashboardMetricWidgets"; import { useApi } from "@/shared/lib/hooks/useApi"; import { showErrorToast } from "@/shared/ui"; import "../styles.css"; import { Preset } from "@/entities/presets/model/types"; const DASHBOARD_FILTERS = [ { type: "date", key: "time_stamp", value: "", name: "Last received", }, ]; const DashboardPage = () => { const api = useApi(); const allPresets = useDashboardPreset(); const { id }: any = useParams(); const { dashboards, isLoading, mutate: mutateDashboard } = useDashboards(); const [isModalOpen, setIsModalOpen] = useState(false); const [layout, setLayout] = useState([]); const [widgetData, setWidgetData] = useState([]); const { widgets: allMetricWidgets } = useDashboardMetricWidgets(true); const [editingItem, setEditingItem] = useState(null); const [dashboardName, setDashboardName] = useState(decodeURIComponent(id)); const [isEditingName, setIsEditingName] = useState(false); useEffect(() => { if (!isLoading) { const dashboard = dashboards?.find( (d) => d.dashboard_name === decodeURIComponent(id) ); if (dashboard) { 
setLayout(dashboard.dashboard_config.layout); setWidgetData(dashboard.dashboard_config.widget_data); setDashboardName(dashboard.dashboard_name); } } }, [id, dashboards, isLoading]); const openModal = () => { setEditingItem(null); // Ensure new modal opens without editing item context setIsModalOpen(true); }; const closeModal = () => setIsModalOpen(false); const handleAddWidget = (widget: any) => { const uniqueId = `w-${Date.now()}`; const newItem: LayoutItem = { i: uniqueId, x: 0, y: 0, w: 3, h: 3, minW: 2, minH: 3, static: false, }; const newWidget: WidgetData = { ...newItem, ...widget, }; setLayout((prevLayout) => [...prevLayout, newWidget]); setWidgetData((prevData) => [...prevData, newWidget]); }; const handleEditWidget = (id: string, update?: WidgetData) => { let itemToEdit = widgetData.find((d) => d.i === id) || null; if (itemToEdit && update) { setEditingItem({ ...itemToEdit, ...update }); } else { setEditingItem(itemToEdit); } setIsModalOpen(true); }; const handleSaveEdit = (updatedItem: WidgetData) => { setWidgetData((prevData) => prevData.map((item) => (item.i === updatedItem.i ? updatedItem : item)) ); closeModal(); }; const handleDeleteWidget = (id: string) => { setLayout(layout.filter((item) => item.i !== id)); setWidgetData(widgetData.filter((item) => item.i !== id)); }; const handleLayoutChange = (newLayout: LayoutItem[]) => { setLayout(newLayout); setWidgetData((prevData) => prevData.map((item) => { const newItem = newLayout.find((l) => l.i === item.i); return newItem ? { ...item, ...newItem } : item; }) ); }; const handleSaveDashboard = async () => { try { let dashboard = dashboards?.find( (d) => d.dashboard_name === decodeURIComponent(id) ); const method = dashboard ? "PUT" : "POST"; const endpoint = `/dashboard${ dashboard ? 
`/${encodeURIComponent(dashboard.id)}` : "" }`;
      // The payload carries the (possibly renamed) dashboard name plus the
      // current layout and widget data held in component state.
      // NOTE(review): the HTTP verb is passed as an option to api.post —
      // presumably useApi honours it so existing dashboards are sent as PUT; confirm.
      const result = await api.post(
        endpoint,
        {
          dashboard_name: dashboardName,
          dashboard_config: {
            layout,
            widget_data: widgetData,
          },
        },
        {
          method,
        }
      );
      console.log("Dashboard saved successfully", result);
      // Trigger the dashboards hook's mutate so the list reflects the save
      // (not awaited; the success toast is shown immediately).
      mutateDashboard();
      toast.success("Dashboard saved successfully");
    } catch (error) {
      // Any failure of the request above is surfaced as an error toast.
      showErrorToast(error, "Failed to save dashboard");
    }
  };

  /** Flips between showing the dashboard name and editing it. */
  const toggleEditingName = () => {
    setIsEditingName(!isEditingName);
  };

  /** Mirrors the name input's current value into component state. */
  const handleNameChange = (e: ChangeEvent) => {
    setDashboardName(e.target.value);
  };

  return (
{isEditingName ? ( ) : ( {dashboardName} )}
{layout.length === 0 ? (

No widgets available

Click to add your first widget

) : ( )} {isModalOpen && ( )}
); }; export default DashboardPage; ================================================ FILE: keep-ui/app/(keep)/dashboard/[id]/page.tsx ================================================ import DashboardPage from "./dashboard"; export default function Page() { return ; } export const metadata = { title: "Keep - Dashboards", description: "Single pane of glass for all your alerts.", }; ================================================ FILE: keep-ui/app/(keep)/dashboard/alert-quality-table.tsx ================================================ "use client"; // Add this line at the top to make this a Client Component import React, { useState, useEffect, Dispatch, SetStateAction, useMemo, } from "react"; import { GenericTable } from "@/components/table/GenericTable"; import { useAlertQualityMetrics } from "@/utils/hooks/useAlertQuality"; import { useProviders } from "@/utils/hooks/useProviders"; import { Provider, ProvidersResponse } from "@/shared/api/providers"; import { TabGroup, TabList, Tab, Callout } from "@tremor/react"; import { GenericFilters } from "@/components/filters/GenericFilters"; import { useSearchParams } from "next/navigation"; import { AlertKnownKeys } from "@/entities/alerts/model"; import { createColumnHelper, DisplayColumnDef } from "@tanstack/react-table"; import { ExclamationCircleIcon } from "@heroicons/react/20/solid"; const tabs = [ { name: "All", value: "all" }, { name: "Installed", value: "installed" }, { name: "Linked", value: "linked" }, ]; const ALERT_QUALITY_FILTERS = [ { type: "date", key: "time_stamp", value: "", name: "Last received", }, ]; const FilterTabs = ({ tabs, setTab, tab, }: { tabs: { name: string; value: string }[]; setTab: Dispatch>; tab: number; }) => { return (
{ setTab(index); }} > {tabs.map((tabItem) => ( {tabItem.name} ))}
); }; interface AlertMetricQuality { alertsReceived: number; alertsCorrelatedToIncidentsPercentage: number; alertsWithSeverityPercentage: number; [key: string]: number; } type FinalAlertQuality = Provider & AlertMetricQuality & { provider_display_name: string }; interface Pagination { limit: number; offset: number; } const QualityTable = ({ providersMeta, alertsQualityMetrics, isDashBoard, setFields, fieldsValue, }: { providersMeta: ProvidersResponse | undefined; alertsQualityMetrics: Record> | undefined; isDashBoard?: boolean; setFields: (fields: string | string[] | Record) => void; fieldsValue: string | string[] | Record; }) => { const [pagination, setPagination] = useState({ limit: 10, offset: 0, }); const customFieldFilter = { type: "select", key: "fields", value: isDashBoard ? fieldsValue : "", name: "Field", options: AlertKnownKeys.map((key) => ({ value: key, label: key })), // only_one: true, searchParamsNotNeed: isDashBoard, can_select: 3, setFilter: setFields, }; const searchParams = useSearchParams(); const entries = searchParams ? 
Array.from(searchParams.entries()) : [];
  const columnHelper = createColumnHelper();

  // Fold the query-string entries into an object; a key that appears more than
  // once is collected into an array of its values, in order of appearance.
  const params = entries.reduce(
    (acc, [key, value]) => {
      // Own-property check: `key in acc` would also be true for inherited names
      // such as "constructor" or "toString", and a query parameter with such a
      // name would then be merged with a function from Object.prototype.
      if (Object.prototype.hasOwnProperty.call(acc, key)) {
        const existing = acc[key];
        acc[key] = Array.isArray(existing)
          ? [...existing, value]
          : [existing as string, value];
      } else {
        acc[key] = value;
      }
      return acc;
    },
    {} as Record
  );

  /** Normalises a single value or a list into a list; falsy input becomes `[]`. */
  function toArray(value: string | string[]) {
    if (!value) {
      return [];
    }
    return Array.isArray(value) ? value : [value];
  }

  // Fields come from the `fields` query parameter when present, otherwise from
  // the value supplied by the parent.
  const rawFields =
    params?.["fields"] || (fieldsValue as string | string[]) || [];
  // Memoise on the serialised content: `params` is rebuilt on every render, so
  // an unmemoised `fields` array would get a new identity each time and force
  // the `columns` / `finalData` memos below to recompute on every render.
  const fieldsKey = JSON.stringify(rawFields);
  // eslint-disable-next-line react-hooks/exhaustive-deps
  const fields = useMemo(() => toArray(rawFields) as string[], [fieldsKey]);

  const [tab, setTab] = useState(0);

  const handlePaginationChange = (newLimit: number, newOffset: number) => {
    setPagination({ limit: newLimit, offset: newOffset });
  };

  // Go back to the first page whenever the tab or the query string changes.
  useEffect(() => {
    handlePaginationChange(10, 0);
  }, [tab, searchParams?.toString()]);

  // Construct columns based on the fields selected
  const columns = useMemo(() => {
    const baseColumns = [
      columnHelper.display({
        id: "provider_display_name",
        header: "Provider Name",
        cell: ({ row }) => {
          const displayName = row.original.provider_display_name;
          return (
{displayName}
id: {row.original.id}
type: {row.original.type}
); }, }),
      columnHelper.accessor("alertsReceived", {
        id: "alertsReceived",
        header: "Alerts Received",
      }),
      columnHelper.display({
        id: "alertsCorrelatedToIncidentsPercentage",
        header: "% of Alerts Correlated to Incidents",
        cell: ({ row }) => {
          return `${row.original.alertsCorrelatedToIncidentsPercentage.toFixed(
            2
          )}%`;
        },
      }),
    ] as DisplayColumnDef[];

    // Add dynamic columns based on the fields
    const dynamicColumns = fields.map((field: string) => {
      // e.g. "severity" -> "Severity"; used for both the accessor key and the header.
      const capitalized = field.charAt(0).toUpperCase() + field.slice(1);
      const columnId = `alertsWith${capitalized}Percentage`;
      return columnHelper.accessor(columnId, {
        id: columnId,
        header: `% of Alerts Having ${capitalized}`,
        cell: (info: any) => `${info.getValue().toFixed(2)}%`,
      });
    }) as DisplayColumnDef[];

    return [
      ...baseColumns,
      ...dynamicColumns,
    ] as DisplayColumnDef[];
  }, [fields]);

  // Process data and include dynamic fields
  const finalData = useMemo(() => {
    if (!providersMeta || !alertsQualityMetrics) {
      return null;
    }

    // Fall back to empty lists so spreading never throws when a list is
    // missing from the response (tabs 1 and 2 already tolerated that).
    const installedProviders = providersMeta.installed_providers ?? [];
    const linkedProviders = providersMeta.linked_providers ?? [];

    // Tab 1 = installed, tab 2 = linked, anything else (including tab 0) = both.
    let providers: Provider[];
    switch (tab) {
      case 1:
        providers = installedProviders;
        break;
      case 2:
        providers = linkedProviders;
        break;
      default:
        providers = [...installedProviders, ...linkedProviders];
        break;
    }

    /** "<details.name> (<display_name>)" when a details name exists, else the display name, else the type. */
    function getProviderDisplayName(provider: Provider) {
      return (
        (provider?.details?.name
          ? `${provider.details.name} (${provider.display_name})`
          : provider.display_name) || provider.type
      );
    }

    const innerData: FinalAlertQuality[] = providers.map((provider) => {
      const providerId = provider.id;
      const providerType = provider.type;
      // Metrics are looked up under "<provider id>_<provider type>".
      const key = `${providerId}_${providerType}`;
      const alertQuality = alertsQualityMetrics[key];
      const totalAlertsReceived = alertQuality?.total_alerts ?? 0;
      const correlated_alerts = alertQuality?.correlated_alerts ??
0;

      // Share of this provider's received alerts, as a percentage; 0 when
      // nothing was received so we never divide by zero.
      const shareOfReceived = (count: number) =>
        totalAlertsReceived ? (count / totalAlertsReceived) * 100 : 0;

      const correlatedPercentage = correlated_alerts
        ? shareOfReceived(correlated_alerts)
        : 0;
      const severityPercentage = shareOfReceived(
        alertQuality?.severity_count ?? 0
      );

      // Calculate percentages for dynamic fields
      const perFieldPercentages: { [column: string]: number } = {};
      for (const field of fields) {
        const capitalized = field.charAt(0).toUpperCase() + field.slice(1);
        perFieldPercentages[`alertsWith${capitalized}Percentage`] =
          shareOfReceived(alertQuality?.[`${field}_count`] ?? 0);
      }

      return {
        ...provider,
        alertsReceived: totalAlertsReceived,
        alertsCorrelatedToIncidentsPercentage: correlatedPercentage,
        alertsWithSeverityPercentage: severityPercentage,
        ...perFieldPercentages, // Add dynamic field percentages here
        provider_display_name: getProviderDisplayName(provider),
      } as FinalAlertQuality;
    });

    return innerData;
  }, [tab, providersMeta, alertsQualityMetrics, fields]);

  return (
{!isDashBoard && (

Alert Quality Dashboard

)}
{finalData && ( data={finalData} columns={columns} rowCount={finalData?.length} offset={pagination.offset} limit={pagination.limit} onPaginationChange={handlePaginationChange} dataFetchedAtOneGO={true} onRowClick={(row) => { console.log("Row clicked:", row); }} /> )}
); }; const AlertQuality = ({ isDashBoard, filters, setFilters, }: { isDashBoard?: boolean; filters: { [x: string]: string | string[]; }; setFilters: any; }) => { const fieldsValue = filters?.fields || ""; const { data: providersMeta } = useProviders(); const { data: alertsQualityMetrics, error } = useAlertQualityMetrics( isDashBoard ? (fieldsValue as string | string[]) : "" ); if (error) { return ( Failed to load Alert Quality Metrics ); } return ( { setFilters((filters: any) => { return { ...filters, fields: field, }; }); }} fieldsValue={fieldsValue} /> ); }; export default AlertQuality; ================================================ FILE: keep-ui/app/(keep)/dashboard/styles.css ================================================ .grid-item__widget:hover { cursor: move; } .grid-item__widget .panel { box-shadow: none; border: none; } .grid-item__widget .panel:focus { outline: none; } .hidden-on-small { display: none; } @media (min-width: 1024px) { .hidden-on-small { display: inherit; } } ================================================ FILE: keep-ui/app/(keep)/dashboard/types.tsx ================================================ import { MetricsWidget } from "@/utils/hooks/useDashboardMetricWidgets"; import { Preset } from "@/entities/presets/model/types"; export interface LayoutItem { i: string; x: number; y: number; w: number; h: number; minW?: number; minH?: number; static: boolean; } export interface GenericsMetrics { key: string; label: string; widgetType: "table" | "chart"; meta: { defaultFilters: { [key: string]: string | string[]; }; }; } export enum WidgetType { PRESET = "PRESET", METRIC = "METRIC", GENERICS_METRICS = "GENERICS_METRICS", } export enum PresetPanelType { ALERT_TABLE = "ALERT_TABLE", ALERT_COUNT_PANEL = "ALERT_COUNT_PANEL", } export interface WidgetData extends LayoutItem { thresholds?: Threshold[]; preset?: Preset; name: string; widgetType: WidgetType; genericMetrics?: GenericsMetrics; metric?: MetricsWidget; presetPanelType?: 
PresetPanelType; showFiringOnly?: boolean; customLink?: string; } export interface Threshold { value: number; color: string; } ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/generic-metrics/generic-metrics-grid-item.tsx ================================================ import React, { useEffect, useState } from "react"; import { WidgetData } from "../../types"; import AlertQuality from "@/app/(keep)/dashboard/alert-quality-table"; interface GridItemProps { item: WidgetData; onEdit: (updatedItem: WidgetData) => void; } const GenericMetricsGridItem: React.FC = ({ item, onEdit }) => { const [filters, setFilters] = useState({ ...(item?.genericMetrics?.meta?.defaultFilters || {}), }); useEffect(() => { let meta; if (item?.genericMetrics?.meta) { meta = { ...item.genericMetrics.meta, defaultFilters: filters || {}, }; } const updatedItem = { ...item, genericMetrics: { ...item.genericMetrics, meta, }, }; onEdit(updatedItem as WidgetData); }, [filters]); function renderGenericMetrics() { switch (item?.genericMetrics?.key) { case "alert_quality": return ( ); default: return null; } } return (
{renderGenericMetrics()}
); }; export default GenericMetricsGridItem; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/generic-metrics/generic-metrics-widget-form.tsx ================================================ import { Select, SelectItem, Subtitle } from "@tremor/react"; import { useEffect } from "react"; import { Controller, get, useForm, useWatch } from "react-hook-form"; import { GenericsMetrics, LayoutItem } from "../../types"; const GENERIC_METRICS = [ { key: "alert_quality", label: "Alert Quality", widgetType: "table", meta: { defaultFilters: { fields: "severity" }, }, }, ] as GenericsMetrics[]; interface GenericMetricsForm { selectedGenericMetrics: string; } export interface GenericMetricsWidgetFormProps { editingItem?: any; onChange: (formState: any, isValid: boolean) => void; } export const GenericMetricsWidgetForm: React.FC< GenericMetricsWidgetFormProps > = ({ editingItem, onChange }) => { const { control, formState: { errors, isValid }, } = useForm({ defaultValues: { selectedGenericMetrics: editingItem?.genericMetrics?.key ?? "", }, }); const formValues = useWatch({ control }); const deepClone = (obj: GenericsMetrics | undefined) => { if (!obj) { return obj; } return JSON.parse(JSON.stringify(obj)) as GenericsMetrics; }; function getLayoutValues(): LayoutItem { if (editingItem) { return {} as LayoutItem; } return { w: 12, h: 20, minW: 10, minH: 15, static: false, } as LayoutItem; } useEffect(() => { const genericMetrics = deepClone( GENERIC_METRICS.find((g) => g.key === formValues.selectedGenericMetrics) ); onChange({ ...getLayoutValues(), genericMetrics }, true); }, [formValues]); return (
Generic Metrics ( )} />
); }; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/metric/metric-grid-item.tsx ================================================ import React from "react"; import { AreaChart } from "@tremor/react"; import { WidgetData } from "../../types"; interface GridItemProps { item: WidgetData; } const GridItem: React.FC = ({ item }) => { return (
`${Intl.NumberFormat().format(number).toString()}` } startEndOnly connectNulls showLegend={false} showTooltip={true} xAxisLabel="Timestamp" />
); }; export default GridItem; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/metric/metric-widget-form.tsx ================================================ import { Select, SelectItem, Subtitle } from "@tremor/react"; import { useEffect } from "react"; import { Controller, get, useForm, useWatch } from "react-hook-form"; import { MetricsWidget } from "@/utils/hooks/useDashboardMetricWidgets"; import { LayoutItem } from "../../types"; interface PresetForm { selectedMetricWidget: string; } export interface MetricWidgetFormProps { metricWidgets: MetricsWidget[]; editingItem?: any; onChange: (formState: any, isValid: boolean) => void; } export const MetricWidgetForm: React.FC = ({ metricWidgets, editingItem, onChange, }) => { const { control, formState: { errors, isValid }, } = useForm({ defaultValues: { selectedMetricWidget: editingItem?.metric?.id ?? "", }, }); const formValues = useWatch({ control }); useEffect(() => { const metric = metricWidgets.find( (p) => p.id === formValues.selectedMetricWidget ); onChange({ ...getLayoutValues(), metric }, isValid); }, [formValues]); function getLayoutValues(): LayoutItem { if (editingItem) { return {} as LayoutItem; } return { w: 6, h: 8, minW: 2, minH: 7, static: false, } as LayoutItem; } return (
Widget ( )} />
); }; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/columns-selection.tsx ================================================ import { useFacetPotentialFields } from "@/features/filter/hooks"; import { MultiSelect, MultiSelectItem } from "@tremor/react"; import React, { useEffect, useMemo, useState } from "react"; import { defaultColumns } from "./constants"; interface ColumnsSelectionProps { selectedColumns?: string[]; onChange: (selected: string[]) => void; } const ColumnsSelection: React.FC = ({ selectedColumns, onChange, }) => { const [selectedColumnsState, setSelectedColumnsState] = useState>( new Set(selectedColumns || defaultColumns) ); const { data } = useFacetPotentialFields("alerts"); useEffect( () => onChange(Array.from(selectedColumnsState)), [selectedColumnsState] ); const sortedOptions = useMemo(() => { return data?.slice().sort((first, second) => { const inSetA = selectedColumnsState.has(first); const inSetB = selectedColumnsState.has(second); if (inSetA && !inSetB) return -1; if (!inSetA && inSetB) return 1; return first.localeCompare(second); }); }, [data, selectedColumnsState]); return ( setSelectedColumnsState(new Set(selected))} > {sortedOptions?.map((field) => ( {field} ))} ); }; export default ColumnsSelection; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/constants.ts ================================================ export const defaultColumns = [ "severity", "status", "source", "name", "description", "lastReceived", ]; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/preset-grid-item.tsx ================================================ import React, { useMemo } from "react"; import { WidgetData, WidgetType, PresetPanelType } from "../../types"; import { usePresetAlertsCount } from "@/features/presets/custom-preset-links"; import { useDashboardPreset } from
"@/utils/hooks/useDashboardPresets";
import { Button, Icon } from "@tremor/react";
import { FireIcon } from "@heroicons/react/24/outline";
import * as Tooltip from "@radix-ui/react-tooltip";
import Skeleton from "react-loading-skeleton";
import "react-loading-skeleton/dist/skeleton.css";
import { useRouter } from "next/navigation";
import TimeAgo from "react-timeago";
import { useSearchParams } from "next/navigation";
import WidgetAlertsTable from "./widget-alerts-table";
import WidgetAlertCountPanel from "./widget-alert-count-panel";
import CelInput from "@/features/cel-input/cel-input";

interface GridItemProps {
  item: WidgetData;
}

/**
 * Dashboard grid cell for a preset widget.
 *
 * Combines the preset's CEL option with the optional `time_stamp` query
 * parameter into one filter, queries the alert count for it, and renders
 * either the alerts table or the count panel depending on the widget's
 * `presetPanelType`.
 */
const PresetGridItem: React.FC = ({ item }) => {
  const searchParams = useSearchParams();

  // CEL clause restricting `lastReceived` to the range stored in the
  // `time_stamp` query parameter; empty when the parameter is absent,
  // is not valid JSON, or lacks a start/end, so a hand-edited URL cannot
  // crash the widget or produce a clause containing "undefined".
  const timeRangeCel = useMemo(() => {
    const timeRangeSearchParam = searchParams?.get("time_stamp");
    if (!timeRangeSearchParam) {
      return "";
    }
    try {
      const parsedTimeRange = JSON.parse(timeRangeSearchParam);
      if (!parsedTimeRange?.start || !parsedTimeRange?.end) {
        return "";
      }
      return `lastReceived >= "${parsedTimeRange.start}" && lastReceived <= "${parsedTimeRange.end}"`;
    } catch {
      return "";
    }
  }, [searchParams]);

  const presets = useDashboardPreset();
  // `preset` is optional on WidgetData, so read through it defensively.
  const countOfLastAlerts = (item.preset as any)?.countOfLastAlerts;

  // Resolve the live preset by id so later edits to the preset are reflected.
  const preset = useMemo(
    () => presets.find((preset) => preset.id === item.preset?.id),
    [presets, item.preset?.id]
  );
  const presetCel = useMemo(
    () => preset?.options.find((option) => option.label === "CEL")?.value || "",
    [preset]
  );
  // Non-empty clauses are AND-ed together.
  const filterCel = useMemo(
    () => [timeRangeCel, presetCel].filter(Boolean).join(" && "),
    [presetCel, timeRangeCel]
  );

  const {
    alerts,
    totalCount: presetAlertsCount,
    isLoading,
  } = usePresetAlertsCount(
    filterCel,
    !!preset?.counter_shows_firing_only,
    countOfLastAlerts,
    0,
    10000 // refresh interval
  );

  const router = useRouter();

  /** Navigates to the preset's alerts page; does nothing until the preset is resolved. */
  function handleGoToPresetClick() {
    if (!preset?.name) {
      return;
    }
    router.push(`/alerts/${preset.name.toLowerCase()}`);
  }

  /**
   * Picks the colour of the highest threshold whose value the alert count
   * has reached (thresholds are scanned from last to first); black otherwise.
   */
  const getColor = () => {
    let color = "#000000";
    if (
      item.widgetType === WidgetType.PRESET &&
      item.thresholds &&
      item.preset
    ) {
      for (let i = item.thresholds.length - 1; i >= 0; i--) {
        if
(item.preset && presetAlertsCount >= item.thresholds[i].value) {
          color = item.thresholds[i].color;
          break;
        }
      }
    }
    return color;
  };

  /**
   * Converts a hex colour ("#rrggbb", "rrggbb" or the 3-digit shorthand) into
   * an `rgb(r, g, b, alpha)` CSS string. `alpha` defaults to 1.
   */
  function hexToRgb(hex: string, alpha: number = 1) {
    // Drop a leading '#', then expand shorthand (#f44 → #ff4444).
    const stripped = hex.replace(/^#/, "");
    const digits =
      stripped.length === 3
        ? stripped
            .split("")
            .map((ch) => ch + ch)
            .join("")
        : stripped;
    const packed = parseInt(digits, 16);
    // Red, green and blue occupy bits 16-23, 8-15 and 0-7 respectively.
    const channels = [16, 8, 0].map((shift) => (packed >> shift) & 255);
    return `rgb(${channels[0]}, ${channels[1]}, ${channels[2]}, ${alpha})`;
  }

  /** Renders the preset's CEL expression; renders nothing when the preset has none. */
  function renderCEL() {
    if (!presetCel) {
      return;
    }
    return (
Preset CEL:
{presetCel}
); }

  /**
   * Renders the alert-count line. The wording depends on whether a "last
   * alerts" limit is configured and whether the total fits within it.
   */
  function renderAlertsCountText() {
    const label = preset?.counter_shows_firing_only
      ? "Firing alerts count:"
      : "Alerts count:";

    // No limit configured -> plain count; otherwise distinguish "everything
    // is shown" from "only the first N of the total are shown".
    const limitConfigured = countOfLastAlerts > 0;
    const everythingFits = presetAlertsCount <= countOfLastAlerts;
    const state: string = limitConfigured
      ? everythingFits
        ? "allAlertsShown"
        : "someAlertsShown"
      : "nothingToShow";

    return (
{label}
{isLoading && ( )} {!isLoading && ( <> {state === "nothingToShow" && ( {presetAlertsCount} alerts )} {state === "allAlertsShown" && ( showing {presetAlertsCount} alerts )} {state === "someAlertsShown" && ( showing {countOfLastAlerts} out of {presetAlertsCount} )} {preset?.counter_shows_firing_only && ( )} )}
); } const isAlertTable = item.presetPanelType === PresetPanelType.ALERT_TABLE || !item.presetPanelType; const isAlertCountPanel = item.presetPanelType === PresetPanelType.ALERT_COUNT_PANEL; return (
{isAlertTable && ( <>
Preset name:
{preset?.name}
{renderCEL()} {renderAlertsCountText()}
{countOfLastAlerts > 0 && ( )} )} {isAlertCountPanel && ( )}
); }; export default PresetGridItem; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/preset-widget-form.tsx ================================================ import { Trashcan } from "@/components/icons"; import { Preset } from "@/entities/presets/model"; import { Button, Icon, Select, SelectItem, Subtitle, TextInput, Switch, } from "@tremor/react"; import { useEffect, useMemo, useState } from "react"; import { Controller, get, useForm, useWatch, useFieldArray, } from "react-hook-form"; import { LayoutItem, Threshold, PresetPanelType } from "../../types"; import ColumnsSelection from "./columns-selection"; interface PresetForm { selectedPreset: string; countOfLastAlerts: string; thresholds: Threshold[]; presetPanelType: PresetPanelType; showFiringOnly: boolean; customLink?: string; } export interface PresetWidgetFormProps { editingItem?: any; presets: Preset[]; onChange: (formState: any, isValid: boolean) => void; } export const PresetWidgetForm: React.FC = ({ editingItem, presets, onChange, }: PresetWidgetFormProps) => { const { control, formState: { errors, isValid }, register, } = useForm({ defaultValues: { selectedPreset: editingItem?.preset?.id, countOfLastAlerts: editingItem ? editingItem.preset.countOfLastAlerts || 0 : 5, thresholds: editingItem?.thresholds || [ { value: 0, color: "#10b981" }, // Bold emerald green { value: 20, color: "#dc2626" }, // Bold red ], presetPanelType: editingItem?.presetPanelType || PresetPanelType.ALERT_TABLE, showFiringOnly: editingItem?.showFiringOnly ?? false, customLink: editingItem?.customLink || "", }, }); const [presetColumns, setPresetColumns] = useState( editingItem ? 
editingItem.presetColumns : undefined );

  const { fields, append, remove, move, replace } = useFieldArray({
    control,
    name: "thresholds",
  });

  const formValues = useWatch({ control });

  // Form values converted into the shape reported to the parent: numeric
  // fields parsed to integers, the preset id resolved to the preset object,
  // and defaults filled in for unset options.
  const normalizedFormValues = useMemo(() => {
    return {
      // Parse as base 10 and fall back to 0 so empty or non-numeric input
      // never propagates NaN into the saved widget.
      countOfLastAlerts:
        parseInt(String(formValues.countOfLastAlerts ?? ""), 10) || 0,
      selectedPreset: presets.find((p) => p.id === formValues.selectedPreset),
      presetColumns,
      thresholds: formValues.thresholds?.map((t) => ({
        ...t,
        value: parseInt(t.value?.toString() as string, 10) || 0,
      })),
      presetPanelType:
        formValues.presetPanelType || PresetPanelType.ALERT_TABLE,
      showFiringOnly: formValues.showFiringOnly ?? false,
      customLink: formValues.customLink || "",
    };
  }, [formValues, presetColumns]);

  /**
   * Default grid geometry for a newly created widget. Returns an empty object
   * when editing, so the existing widget's geometry is left untouched.
   */
  function getLayoutValues(): LayoutItem {
    if (editingItem) {
      return {} as LayoutItem;
    }

    const isAlertTable =
      normalizedFormValues.presetPanelType === PresetPanelType.ALERT_TABLE;
    const isAlertCountPanel =
      normalizedFormValues.presetPanelType ===
      PresetPanelType.ALERT_COUNT_PANEL;

    if (isAlertCountPanel) {
      // Narrower, more compact layout for count panels with no minimum width
      return {
        w: 4,
        h: 3,
        minW: 0,
        minH: 2,
        static: false,
      } as LayoutItem;
    }

    // Original layout for alert tables
    const itemHeight =
      isAlertTable && normalizedFormValues.countOfLastAlerts > 0 ? 6 : 4;
    const itemWidth =
      isAlertTable && normalizedFormValues.countOfLastAlerts > 0 ?
8 : 6;

    return {
      w: itemWidth,
      h: itemHeight,
      minW: 4,
      minH: 4,
      static: false,
    } as LayoutItem;
  }

  // Report the normalised form state (plus default geometry for new widgets)
  // to the parent whenever the values or the form validity change.
  useEffect(() => {
    onChange(
      {
        ...getLayoutValues(),
        preset: {
          ...normalizedFormValues.selectedPreset,
          countOfLastAlerts: normalizedFormValues.countOfLastAlerts,
        },
        presetColumns: normalizedFormValues.presetColumns,
        thresholds: normalizedFormValues.thresholds,
        presetPanelType: normalizedFormValues.presetPanelType,
        showFiringOnly: normalizedFormValues.showFiringOnly,
        customLink: normalizedFormValues.customLink,
      },
      isValid
    );
  }, [normalizedFormValues, isValid]);

  /**
   * Re-sorts the threshold rows by ascending value (values are parsed as
   * base-10 integers, non-numeric ones become 0) and writes them back into
   * the field array.
   */
  const handleThresholdBlur = () => {
    const reorderedThresholds = formValues?.thresholds
      ?.map((t) => ({
        ...t,
        value: parseInt(t.value?.toString() as string, 10) || 0,
      }))
      .sort((a, b) => a.value - b.value);
    if (!reorderedThresholds) {
      return;
    }
    replace(reorderedThresholds as any);
  };

  /**
   * Appends a black threshold 10 above the current maximum (or at 10 when
   * there are no thresholds or all are below zero).
   */
  const handleAddThreshold = () => {
    // Normalise first: `thresholds` may be undefined (spreading that throws)
    // and a value may still be raw, non-numeric input text (which would make
    // the maximum NaN).
    const currentValues = (formValues.thresholds ?? []).map(
      (t) => parseInt(String(t.value ?? ""), 10) || 0
    );
    const maxThreshold = Math.max(...currentValues, 0);
    append({ value: maxThreshold + 10, color: "#000000" });
  };

  return ( <>
Preset ( )} />
Panel Type ( )} />
{formValues.presetPanelType === PresetPanelType.ALERT_COUNT_PANEL && ( <>
Show Firing Alerts Only ( )} />
Custom Link (optional) ( )} />
)} {formValues.presetPanelType === PresetPanelType.ALERT_TABLE && ( <>
Last alerts count to display ( )} />
setPresetColumns(selectedColumns)} > )}
Thresholds
{fields.map((field, index) => (
{fields.length > 1 && ( )}
))}
); }; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/widget-alert-count-panel.tsx ================================================ import React, { useMemo } from "react"; import { WidgetData, WidgetType, Threshold } from "../../types"; import { usePresetAlertsCount } from "@/features/presets/custom-preset-links"; import { useDashboardPreset } from "@/utils/hooks/useDashboardPresets"; import { Button, Icon } from "@tremor/react"; import { FireIcon } from "@heroicons/react/24/outline"; import Skeleton from "react-loading-skeleton"; import "react-loading-skeleton/dist/skeleton.css"; import { useRouter } from "next/navigation"; import { useSearchParams } from "next/navigation"; interface WidgetAlertCountPanelProps { presetName: string; showFiringOnly?: boolean; background?: string; thresholds?: Threshold[]; customLink?: string; } const WidgetAlertCountPanel: React.FC = ({ presetName, showFiringOnly = false, background, thresholds = [], customLink, }) => { const searchParams = useSearchParams(); const timeRangeCel = useMemo(() => { const timeRangeSearchParam = searchParams.get("time_stamp"); if (timeRangeSearchParam) { const parsedTimeRange = JSON.parse(timeRangeSearchParam); return `lastReceived >= "${parsedTimeRange.start}" && lastReceived <= "${parsedTimeRange.end}"`; } return ""; }, [searchParams]); const presets = useDashboardPreset(); const preset = useMemo( () => presets.find((preset) => preset.name === presetName), [presets, presetName] ); const presetCel = useMemo( () => preset?.options.find((option) => option.label === "CEL")?.value || "", [preset] ); const filterCel = useMemo( () => [timeRangeCel, presetCel].filter(Boolean).join(" && "), [presetCel, timeRangeCel] ); // Get total alerts count const { totalCount: totalAlertsCount, isLoading: isLoadingTotal, } = usePresetAlertsCount( filterCel, false, // Always get total count 0, 0, 10000 ); // Get firing alerts count const { totalCount: firingAlertsCount, 
isLoading: isLoadingFiring,
  } = usePresetAlertsCount(
    filterCel,
    true, // Get firing count
    0,
    0,
    10000
  );

  // The panel is "loading" until both the total and the firing query resolve.
  const isLoading = isLoadingTotal || isLoadingFiring;
  const router = useRouter();

  /** Navigates to the preset's alerts page; does nothing until the preset is resolved. */
  function handleGoToPresetClick() {
    if (!preset?.name) {
      return;
    }
    router.push(`/alerts/${preset.name.toLowerCase()}`);
  }

  /**
   * Opens the user-configured link in a new tab. `noopener,noreferrer`
   * prevents the opened page from reaching back into this window through
   * `window.opener` and from receiving the dashboard URL as referrer.
   */
  function handleCustomLinkClick() {
    if (customLink) {
      window.open(customLink, "_blank", "noopener,noreferrer");
    }
  }

  /**
   * Picks the colour of the highest threshold whose value `count` has reached
   * (thresholds are scanned from last to first); dark gray otherwise.
   */
  const getColor = (count: number) => {
    let color = "#1f2937"; // Default dark gray instead of black
    if (thresholds && thresholds.length > 0) {
      for (let i = thresholds.length - 1; i >= 0; i--) {
        if (count >= thresholds[i].value) {
          color = thresholds[i].color;
          break;
        }
      }
    }
    return color;
  };

  /**
   * Converts a hex colour ("#rrggbb", "rrggbb" or the 3-digit shorthand) into
   * an `rgb(r, g, b, alpha)` CSS string. `alpha` defaults to 1.
   */
  function hexToRgb(hex: string, alpha: number = 1) {
    // Remove '#' if present
    hex = hex.replace(/^#/, "");

    // Handle shorthand form (#f44 → #ff4444)
    if (hex.length === 3) {
      hex = hex
        .split("")
        .map((c) => c + c)
        .join("");
    }

    const bigint = parseInt(hex, 16);
    const r = (bigint >> 16) & 255;
    const g = (bigint >> 8) & 255;
    const b = bigint & 255;

    return `rgb(${r}, ${g}, ${b}, ${alpha})`;
  }

  const label = showFiringOnly ? "Firing Alerts" : "Total Alerts";
  const displayCount = showFiringOnly ? firingAlertsCount : totalAlertsCount;
  const count = isLoading ? "..." : displayCount;

  // Use firing count for threshold colors when showFiringOnly is selected
  const thresholdCount = showFiringOnly ? firingAlertsCount : totalAlertsCount;
  const color = getColor(thresholdCount);

  return (
{/* Header with label and button */}
{label} {showFiringOnly && ( )}
{customLink && ( )}
{/* Main content area with diagonal alignment */}
{/* Preset name and count in diagonal layout */}
{preset?.name}
{isLoading ? ( ) : ( count )}
); }; export default WidgetAlertCountPanel; ================================================ FILE: keep-ui/app/(keep)/dashboard/widget-types/preset/widget-alerts-table.tsx ================================================ import React, { useEffect, useMemo } from "react"; import { WidgetData, WidgetType } from "../../types"; import { usePresetAlertsCount } from "@/features/presets/custom-preset-links"; import { useDashboardPreset } from "@/utils/hooks/useDashboardPresets"; import { Button, Icon } from "@tremor/react"; import { FireIcon } from "@heroicons/react/24/outline"; import { DynamicImageProviderIcon } from "@/components/ui"; import { getStatusColor, getStatusIcon } from "@/shared/lib/status-utils"; import { getNestedValue } from "@/shared/lib/object-utils"; import { SeverityBorderIcon, UISeverity } from "@/shared/ui"; import { severityMapping } from "@/entities/alerts/model"; import * as Tooltip from "@radix-ui/react-tooltip"; import Skeleton from "react-loading-skeleton"; import "react-loading-skeleton/dist/skeleton.css"; import { useRouter } from "next/navigation"; import TimeAgo from "react-timeago"; import { useSearchParams } from "next/navigation"; import { useLocalStorage } from "@/utils/hooks/useLocalStorage"; import { ColumnRenameMapping } from "@/widgets/alerts-table/ui/alert-table-column-rename"; import { DEFAULT_COLS } from "@/widgets/alerts-table/lib/alert-table-utils"; import { ColumnOrderState } from "@tanstack/table-core"; import { startCase } from "lodash"; import { defaultColumns } from "./constants"; interface WidgetAlertsTableProps { presetName: string; alerts?: any[]; columns?: string[]; background?: string; } const WidgetAlertsTable: React.FC = ({ presetName, alerts, columns, background, }) => { const columnsGapClass = "pr-3"; const borderClass = "border-b"; const [columnRenameMapping] = useLocalStorage( `column-rename-mapping-${presetName}`, {} ); const [presetOrderedColumns] = useLocalStorage( `column-order-${presetName}`, DEFAULT_COLS 
); const columnsMeta: { [key: string]: any } = useMemo( () => ({ severity: { gridColumnTemplate: "min-content", renderHeader: () =>
, renderValue: (alert: any) => ( ), }, status: { gridColumnTemplate: "min-content", renderHeader: () =>
, renderValue: (alert: any) => ( ), }, source: { gridColumnTemplate: "min-content", renderHeader: () =>
, renderValue: (alert: any) => ( ), }, name: { gridColumnTemplate: "minmax(100px, 1fr)", renderValue: (alert: any) => (
{alert.name}
), }, description: { gridColumnTemplate: "minmax(100px, 1fr)", renderValue: (alert: any) => (
{alert.description}
), }, lastReceived: { gridColumnTemplate: "min-content", renderValue: (alert: any) => , }, }), [columnRenameMapping] );

  // Columns to show, arranged in the order stored for this preset (falling
  // back to the default column order). Columns that are not part of the
  // stored order are treated as position 0.
  const orderedColumns = useMemo(() => {
    const presetColumns: string[] = columns || defaultColumns;
    const referenceOrder = presetOrderedColumns || defaultColumns;
    const indexed: { [key: string]: number } = Object.fromEntries(
      referenceOrder.map((columnId, position) => [columnId, position])
    );
    const positionOf = (columnId: string) => indexed[columnId] || 0;
    // Sort a copy so neither the prop nor the defaults are mutated.
    return [...presetColumns].sort(
      (firstColum, secondColumn) =>
        positionOf(firstColum) - positionOf(secondColumn)
    );
  }, [columns, presetOrderedColumns]);

  /**
   * Renders one header cell per ordered column: the column's own header
   * renderer when it has one, otherwise its renamed or start-cased name.
   */
  function renderHeaders() {
    return orderedColumns?.map((column, index) => {
      const columnMeta = columnsMeta[column];
      let columnHeaderValue;
      if (columnMeta?.renderHeader) {
        columnHeaderValue = columnMeta.renderHeader();
      } else {
        columnHeaderValue = (
{columnRenameMapping[column] || startCase(column)}
); } return (
{columnHeaderValue}
); }); } function renderTableBody() { const alertsToRender = alerts || Array.from({ length: 5 }).fill(undefined); return alertsToRender ?.map((alert, alertIndex) => { return orderedColumns?.map((column, index) => { const columnMeta = columnsMeta[column]; let columnValue; if (!alert) { columnValue = ; } else if (columnMeta?.renderValue) { columnValue = columnMeta.renderValue(alert); } else { columnValue = (
{getNestedValue(alert, column)}
); } const _columnsGapClass = index < orderedColumns.length - 1 ? columnsGapClass : ""; const _borderClass = alertIndex < alertsToRender.length - 1 ? borderClass : ""; return (
{columnValue}
); }); }) .flat(); } const gridTemplateColumns = useMemo( () => orderedColumns ?.map((column) => { const columnMeta = columnsMeta[column]; let gridColumnTemplate = "auto"; if (columnMeta?.gridColumnTemplate) { gridColumnTemplate = columnMeta.gridColumnTemplate; } else { // Default sizing for arbitrary columns gridColumnTemplate = "minmax(auto, 1fr)"; } return gridColumnTemplate; }) .join(" "), [orderedColumns, columnsMeta] ); return (
{renderHeaders()} {renderTableBody()}
); }; export default WidgetAlertsTable; ================================================ FILE: keep-ui/app/(keep)/deduplication/DeduplicationPlaceholder.tsx ================================================ import { Card, Subtitle, Title } from "@tremor/react"; import Link from "next/link"; import Image from "next/image"; import deduplicationPlaceholder from "./deduplication-placeholder.svg"; export const DeduplicationPlaceholder = () => { return ( <>
No Deduplications Yet Alert deduplication is the first layer of denoising. It groups similar alerts from one source.
To connect alerts across sources into incidents, check{" "} Correlations
This page will become active once the first alerts are registered.
Deduplication
); }; ================================================ FILE: keep-ui/app/(keep)/deduplication/DeduplicationSidebar.tsx ================================================ import { useEffect, useState, useMemo } from "react"; import { Dialog } from "@headlessui/react"; import { useForm, Controller, SubmitHandler } from "react-hook-form"; import { Text, Button, TextInput, Callout, Badge, Switch, Icon, Title, Card, } from "@tremor/react"; import { IoMdClose } from "react-icons/io"; import { DeduplicationRule } from "@/app/(keep)/deduplication/models"; import { useDeduplicationFields } from "utils/hooks/useDeduplicationRules"; import { Select } from "@/shared/ui"; import { ExclamationTriangleIcon, InformationCircleIcon, } from "@heroicons/react/24/outline"; import { KeyedMutator } from "swr"; import { useApi } from "@/shared/lib/hooks/useApi"; import { KeepApiError } from "@/shared/api"; import { Providers } from "@/shared/api/providers"; import SidePanel from "@/components/SidePanel"; import { useConfig } from "@/utils/hooks/useConfig"; interface ProviderOption { value: string; label: string; logoUrl: string; } interface DeduplicationSidebarProps { isOpen: boolean; toggle: VoidFunction; selectedDeduplicationRule: DeduplicationRule | null; onSubmit: (data: Partial) => Promise; mutateDeduplicationRules: KeyedMutator; providers: { installed_providers: Providers; linked_providers: Providers }; } const DeduplicationSidebar: React.FC = ({ isOpen, toggle, selectedDeduplicationRule, onSubmit, mutateDeduplicationRules, providers, }) => { const { control, handleSubmit, setValue, reset, setError, watch, formState: { errors }, clearErrors, } = useForm>({ defaultValues: selectedDeduplicationRule || { name: "", description: "", provider_type: "", provider_id: "", fingerprint_fields: [], full_deduplication: false, ignore_fields: [], }, }); const [isSubmitting, setIsSubmitting] = useState(false); const { data: config } = useConfig(); const { data: deduplicationFields = {} } = 
useDeduplicationFields(); const api = useApi(); const alertProviders = useMemo( () => [ { id: null, type: "keep", details: { name: "Keep" }, tags: ["alert"] }, ...providers.installed_providers, ...providers.linked_providers, ].filter((provider) => provider.tags?.includes("alert")), [providers] ); const fullDeduplication = watch("full_deduplication"); const selectedProviderType = watch("provider_type"); const selectedProviderId = watch("provider_id"); const fingerprintFields = watch("fingerprint_fields"); const ignoreFields = watch("ignore_fields"); const availableFields = useMemo(() => { const defaultFields = [ "source", "service", "description", "fingerprint", "name", "lastReceived", ]; if (selectedProviderType) { const key = `${selectedProviderType}_${selectedProviderId || "null"}`; const providerFields = deduplicationFields[key] || []; return [ ...new Set([ ...defaultFields, ...providerFields, ...(fingerprintFields ?? []), ...(ignoreFields ?? []), ]), ]; } return [...new Set([...defaultFields, ...(fingerprintFields ?? [])])]; }, [ selectedProviderType, selectedProviderId, deduplicationFields, fingerprintFields, ignoreFields, ]); useEffect(() => { if (isOpen && selectedDeduplicationRule) { reset(selectedDeduplicationRule); } else if (isOpen) { reset({ name: "", description: "", provider_type: "", provider_id: "", fingerprint_fields: [], full_deduplication: false, ignore_fields: [], }); } }, [isOpen, selectedDeduplicationRule, reset]); const handleToggle = () => { if (isOpen) { clearErrors(); } toggle(); }; const onFormSubmit: SubmitHandler> = async ( data ) => { setIsSubmitting(true); clearErrors(); try { let url = "/deduplications"; if (selectedDeduplicationRule && selectedDeduplicationRule.id) { url += `/${selectedDeduplicationRule.id}`; } const method = !selectedDeduplicationRule || !selectedDeduplicationRule.id ? "POST" : "PUT"; const response = method === "POST" ? 
await api.post(url, data) : await api.put(url, data); console.log("Deduplication rule saved:", data); reset(); handleToggle(); await mutateDeduplicationRules(); } catch (error) { if (error instanceof KeepApiError) { setError("root.serverError", { type: "manual", message: error.message || "Failed to save deduplication rule", }); } else { setError("root.serverError", { type: "manual", message: "An unexpected error occurred", }); } } finally { setIsSubmitting(false); } }; return (
{selectedDeduplicationRule ? `Edit ${selectedDeduplicationRule.name}` : "Add deduplication rule"} {selectedDeduplicationRule?.default && ( Default Rule )}
{selectedDeduplicationRule?.default && (
Editing a default deduplication rule requires advanced knowledge. Default rules are carefully designed to provide optimal deduplication for specific alert types. Modifying these rules may impact the efficiency of your alert processing. If you're unsure about making changes, we recommend creating a new custom rule instead of modifying the default one.

Learn more about deduplication rules
)} {selectedDeduplicationRule?.is_provisioned && (
Editing a provisioned deduplication rule is not allowed. Please contact your system administrator for more information.
)}
Rule name ( )} />
Description ( )} />
Provider Select the provider for which this deduplication rule will apply. This determines the source of alerts that will be processed by this rule. ( ({ value: fieldName, label: fieldName, }))} placeholder="Select fingerprint fields" value={field.value?.map((value: string) => ({ value, label: value, }))} onChange={(selectedOptions) => { field.onChange( selectedOptions.map( (option: { value: string }) => option.value ) ); }} noOptionsMessage={() => selectedProviderType ? "No options" : "Please choose provider to see available fields" } /> )} /> {errors.fingerprint_fields && (

{errors.fingerprint_fields.message}

)}
( )} /> Full deduplication 1. Full deduplication: Keep will discard events if they are the same (excluding the 'Ignore Fields').
2. Partial deduplication (default): Uses specified fields to correlate alerts. E.g., two alerts with same 'service' and 'env' fields will be deduped into one alert.
{fullDeduplication && (
Ignore fields (