Skip to content

Commit 467bb72

Browse files
committed
Adding more checks based on datadog blog pages and own understanding
1 parent 85cf7fe commit 467bb72

14 files changed

+742
-0
lines changed

cpu_limits_low-variables.tf

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Passed as `enabled` to the "cpu_limits_low" monitor module. Defaults to true.
variable "cpu_limits_low_enabled" {
2+
type = bool
3+
default = true
4+
}
5+
6+
# Passed as `warning_threshold` to the "cpu_limits_low" monitor module.
# The monitor query alerts when (cpu capacity - cpu limits) drops BELOW the
# threshold, so the default (0) sits above the critical default (-30).
variable "cpu_limits_low_warning" {
7+
type = number
8+
default = 0
9+
}
10+
11+
# Critical threshold: interpolated into the monitor query as the right-hand
# side of the "<" comparison and also passed as `critical_threshold`.
# Compared against kubernetes.cpu.capacity minus kubernetes.cpu.limits.
# NOTE(review): unit is not shown here (presumably CPU cores) - confirm.
variable "cpu_limits_low_critical" {
12+
type = number
13+
default = -30
14+
}
15+
16+
# Evaluation window interpolated into the query's `max(...)` wrapper,
# e.g. "last_5m".
variable "cpu_limits_low_evaluation_period" {
17+
type = string
18+
default = "last_5m"
19+
}
20+
21+
# Passed as `severity` to the "cpu_limits_low" monitor module.
variable "cpu_limits_low_severity" {
22+
type = string
23+
default = "minor"
24+
}
25+
26+
# Passed as `note` to the "cpu_limits_low" monitor module. Empty by default.
variable "cpu_limits_low_note" {
27+
type = string
28+
default = ""
29+
}
30+
31+
# Passed as `docs` to the "cpu_limits_low" monitor module. Empty by default.
variable "cpu_limits_low_docs" {
32+
type = string
33+
default = ""
34+
}
35+
36+
# Optional tag filter for this monitor only. It is the first argument of the
# coalesce() in local.cpu_limits_low_filter, so a non-empty value takes
# precedence over var.filter_str; the default "" falls through to filter_str.
variable "cpu_limits_low_filter_override" {
37+
type = string
38+
default = ""
39+
}
40+
41+
# Passed as `alerting_enabled` to the "cpu_limits_low" monitor module.
# Defaults to false.
variable "cpu_limits_low_alerting_enabled" {
42+
type = bool
43+
default = false
44+
}
45+
46+
# NOTE(review): declared but not passed to the "cpu_limits_low" module block
# shown in this commit - confirm whether it should be wired through.
variable "cpu_limits_low_no_data_timeframe" {
47+
type = number
48+
default = null
49+
}
50+
51+
# NOTE(review): declared but not passed to the "cpu_limits_low" module block
# shown in this commit - confirm whether it should be wired through.
variable "cpu_limits_low_notify_no_data" {
52+
type = bool
53+
default = false
54+
}
55+
56+
# NOTE(review): declared but not passed to the "cpu_limits_low" module block
# shown in this commit - confirm whether it should be wired through.
variable "cpu_limits_low_ok_threshold" {
57+
type = number
58+
default = null
59+
}
60+
61+
# NOTE(review): declared but not passed to the "cpu_limits_low" module block
# shown in this commit - confirm whether it should be wired through.
variable "cpu_limits_low_name_prefix" {
62+
type = string
63+
default = ""
64+
}
65+
66+
# NOTE(review): declared but not passed to the "cpu_limits_low" module block
# shown in this commit - confirm whether it should be wired through.
variable "cpu_limits_low_name_suffix" {
67+
type = string
68+
default = ""
69+
}
70+
71+
# Passed as `priority` to the "cpu_limits_low" monitor module.
# Defaults to null (no priority set by this module's caller).
variable "cpu_limits_low_priority" {
72+
description = "Number from 1 (high) to 5 (low)."
73+
74+
type = number
75+
default = null
76+
}

cpu_limits_low.tf

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Tag filter used inside the cpu_limits_low monitor query.
# coalesce() returns its first argument that is neither null nor an empty
# string, so the per-monitor override wins when set and var.filter_str is the
# fallback.
locals {
2+
cpu_limits_low_filter = coalesce(
3+
var.cpu_limits_low_filter_override,
4+
var.filter_str
5+
)
6+
}
7+
8+
# Monitor: remaining CPU headroom for pod limits.
# The query takes, per {host,cluster_name}, kubernetes.cpu.capacity minus
# kubernetes.cpu.limits and triggers when the max over the evaluation period
# is below var.cpu_limits_low_critical.
#
# The module source was rendered as "[email protected]:..." (e-mail obfuscation
# of the SSH user@host part); restored to the standard GitHub SSH form.
#
# NOTE(review): cpu_limits_low_no_data_timeframe, _notify_no_data,
# _ok_threshold, _name_prefix and _name_suffix are declared as variables but
# are not passed to the module here - confirm whether that is intended.
module "cpu_limits_low" {
9+
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.1"
10+
11+
name = "Available CPU for Limits Low"
12+
query = "max(${var.cpu_limits_low_evaluation_period}):sum:kubernetes.cpu.capacity{${local.cpu_limits_low_filter}} by {host,cluster_name} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {host,cluster_name} < ${var.cpu_limits_low_critical}"
13+
alert_message = "Kubernetes cluster cpu room for limits is too low"
14+
recovery_message = "Kubernetes cluster cpu limits is ok again."
15+
16+
# monitor level vars
17+
enabled = var.cpu_limits_low_enabled
18+
alerting_enabled = var.cpu_limits_low_alerting_enabled
19+
critical_threshold = var.cpu_limits_low_critical
20+
warning_threshold = var.cpu_limits_low_warning
21+
priority = var.cpu_limits_low_priority
22+
severity = var.cpu_limits_low_severity
23+
docs = var.cpu_limits_low_docs
24+
note = var.cpu_limits_low_note
25+
26+
# module level vars (shared by all monitors in this module)
27+
env = var.alert_env
28+
service = var.service
29+
notification_channel = var.notification_channel
30+
additional_tags = var.additional_tags
31+
locked = var.locked
32+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Tag filter used inside the deploy_desired_vs_status monitor query.
# coalesce() returns its first argument that is neither null nor an empty
# string, so the per-monitor override wins when set and var.filter_str is the
# fallback.
locals {
2+
deploy_desired_vs_status_filter = coalesce(
3+
var.deploy_desired_vs_status_filter_override,
4+
var.filter_str
5+
)
6+
}
7+
8+
# Monitor: desired vs. current replica count for Deployments.
# The query takes, per {cluster_name,host},
# kubernetes_state.deployment.replicas_desired minus
# kubernetes_state.deployment.replicas and triggers when the average over the
# evaluation period is above var.deploy_desired_vs_status_critical.
#
# The module source was rendered as "[email protected]:..." (e-mail obfuscation
# of the SSH user@host part); restored to the standard GitHub SSH form.
#
# NOTE(review): deploy_desired_vs_status_no_data_timeframe, _notify_no_data,
# _ok_threshold, _name_prefix and _name_suffix are declared as variables but
# are not passed to the module here - confirm whether that is intended.
module "deploy_desired_vs_status" {
9+
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.1"
10+
11+
name = "Desired pods vs current pods (Deployments)"
12+
query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} - max:kubernetes_state.deployment.replicas{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} > ${var.deploy_desired_vs_status_critical}"
13+
alert_message = "Kubernetes is having trouble getting all the pods to start. (Based on replicas number in all the deployments)"
14+
recovery_message = "All pods described in deployments have started"
15+
16+
# monitor level vars
17+
enabled = var.deploy_desired_vs_status_enabled
18+
alerting_enabled = var.deploy_desired_vs_status_alerting_enabled
19+
critical_threshold = var.deploy_desired_vs_status_critical
20+
warning_threshold = var.deploy_desired_vs_status_warning
21+
priority = var.deploy_desired_vs_status_priority
22+
severity = var.deploy_desired_vs_status_severity
23+
docs = var.deploy_desired_vs_status_docs
24+
note = var.deploy_desired_vs_status_note
25+
26+
# module level vars (shared by all monitors in this module)
27+
env = var.alert_env
28+
service = var.service
29+
notification_channel = var.notification_channel
30+
additional_tags = var.additional_tags
31+
locked = var.locked
32+
}

deploy_desired_vs_status.tf

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Passed as `enabled` to the "deploy_desired_vs_status" monitor module.
# Defaults to true.
variable "deploy_desired_vs_status_enabled" {
2+
type = bool
3+
default = true
4+
}
5+
6+
# Passed as `warning_threshold` to the "deploy_desired_vs_status" monitor
# module. The monitored value is desired replicas minus current replicas.
variable "deploy_desired_vs_status_warning" {
7+
type = number
8+
default = 1
9+
# Warning threshold: a difference of 1 replica (desired minus current).
10+
}
11+
12+
# Critical threshold: interpolated into the monitor query as the right-hand
# side of the ">" comparison and also passed as `critical_threshold`.
variable "deploy_desired_vs_status_critical" {
13+
type = number
14+
default = 10
15+
# Critical threshold: a difference of 10 replicas (desired minus current).
16+
}
17+
18+
# Evaluation window interpolated into the query's `avg(...)` wrapper,
# e.g. "last_15m".
variable "deploy_desired_vs_status_evaluation_period" {
19+
type = string
20+
default = "last_15m"
21+
}
22+
23+
# Passed as `severity` to the "deploy_desired_vs_status" monitor module.
variable "deploy_desired_vs_status_severity" {
24+
type = string
25+
default = "minor"
26+
}
27+
28+
# Passed as `note` to the "deploy_desired_vs_status" monitor module.
# Empty by default.
variable "deploy_desired_vs_status_note" {
29+
type = string
30+
default = ""
31+
}
32+
33+
# Passed as `docs` to the "deploy_desired_vs_status" monitor module.
# The default explains what the monitored value represents.
variable "deploy_desired_vs_status_docs" {
34+
type = string
35+
default = "The amount of expected pods to run minus the actual number"
36+
}
37+
38+
# Optional tag filter for this monitor only. It is the first argument of the
# coalesce() in local.deploy_desired_vs_status_filter, so a non-empty value
# takes precedence over var.filter_str; the default "" falls through.
variable "deploy_desired_vs_status_filter_override" {
39+
type = string
40+
default = ""
41+
}
42+
43+
# Passed as `alerting_enabled` to the "deploy_desired_vs_status" monitor
# module. Defaults to false.
variable "deploy_desired_vs_status_alerting_enabled" {
44+
type = bool
45+
default = false
46+
}
47+
48+
# NOTE(review): declared but not passed to the "deploy_desired_vs_status"
# module block shown in this commit - confirm whether it should be wired in.
variable "deploy_desired_vs_status_no_data_timeframe" {
49+
type = number
50+
default = null
51+
}
52+
53+
# NOTE(review): declared but not passed to the "deploy_desired_vs_status"
# module block shown in this commit - confirm whether it should be wired in.
variable "deploy_desired_vs_status_notify_no_data" {
54+
type = bool
55+
default = false
56+
}
57+
58+
# NOTE(review): declared but not passed to the "deploy_desired_vs_status"
# module block shown in this commit - confirm whether it should be wired in.
variable "deploy_desired_vs_status_ok_threshold" {
59+
type = number
60+
default = null
61+
}
62+
63+
# NOTE(review): declared but not passed to the "deploy_desired_vs_status"
# module block shown in this commit - confirm whether it should be wired in.
variable "deploy_desired_vs_status_name_prefix" {
64+
type = string
65+
default = ""
66+
}
67+
68+
# NOTE(review): declared but not passed to the "deploy_desired_vs_status"
# module block shown in this commit - confirm whether it should be wired in.
variable "deploy_desired_vs_status_name_suffix" {
69+
type = string
70+
default = ""
71+
}
72+
73+
# Passed as `priority` to the "deploy_desired_vs_status" monitor module.
# Defaults to null.
variable "deploy_desired_vs_status_priority" {
74+
description = "Number from 1 (high) to 5 (low)."
75+
76+
type = number
77+
default = null
78+
}

memory_limits_low-variables.tf

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Passed as `enabled` to the "memory_limits_low" monitor module.
# Defaults to true.
variable "memory_limits_low_enabled" {
2+
type = bool
3+
default = true
4+
}
5+
6+
# Passed as `warning_threshold` to the "memory_limits_low" monitor module.
# The monitor query alerts when (memory capacity - memory limits) drops BELOW
# the threshold, so the default sits above the critical default.
# NOTE(review): unit is not shown here (presumably bytes, ~4 GB) - confirm.
variable "memory_limits_low_warning" {
7+
type = number
8+
default = 4000000000
9+
}
10+
11+
# Critical threshold: interpolated into the monitor query as the right-hand
# side of the "<" comparison and also passed as `critical_threshold`.
# Compared against kubernetes.memory.capacity minus kubernetes.memory.limits.
# NOTE(review): unit is not shown here (presumably bytes, ~3 GB) - confirm.
variable "memory_limits_low_critical" {
12+
type = number
13+
default = 3000000000
14+
}
15+
16+
# Evaluation window interpolated into the query's `avg(...)` wrapper,
# e.g. "last_5m".
variable "memory_limits_low_evaluation_period" {
17+
type = string
18+
default = "last_5m"
19+
}
20+
21+
# Passed as `severity` to the "memory_limits_low" monitor module.
variable "memory_limits_low_severity" {
22+
type = string
23+
default = "minor"
24+
}
25+
26+
# Passed as `note` to the "memory_limits_low" monitor module. Empty by default.
variable "memory_limits_low_note" {
27+
type = string
28+
default = ""
29+
}
30+
31+
# Passed as `docs` to the "memory_limits_low" monitor module. Empty by default.
variable "memory_limits_low_docs" {
32+
type = string
33+
default = ""
34+
}
35+
36+
# Optional tag filter for this monitor only. It is the first argument of the
# coalesce() in local.memory_limits_low_filter, so a non-empty value takes
# precedence over var.filter_str; the default "" falls through to filter_str.
variable "memory_limits_low_filter_override" {
37+
type = string
38+
default = ""
39+
}
40+
41+
# Passed as `alerting_enabled` to the "memory_limits_low" monitor module.
# Defaults to false.
variable "memory_limits_low_alerting_enabled" {
42+
type = bool
43+
default = false
44+
}
45+
46+
# NOTE(review): declared but not passed to the "memory_limits_low" module
# block shown in this commit - confirm whether it should be wired through.
variable "memory_limits_low_no_data_timeframe" {
47+
type = number
48+
default = null
49+
}
50+
51+
# NOTE(review): declared but not passed to the "memory_limits_low" module
# block shown in this commit - confirm whether it should be wired through.
variable "memory_limits_low_notify_no_data" {
52+
type = bool
53+
default = false
54+
}
55+
56+
# NOTE(review): declared but not passed to the "memory_limits_low" module
# block shown in this commit - confirm whether it should be wired through.
variable "memory_limits_low_ok_threshold" {
57+
type = number
58+
default = null
59+
}
60+
61+
# NOTE(review): declared but not passed to the "memory_limits_low" module
# block shown in this commit - confirm whether it should be wired through.
variable "memory_limits_low_name_prefix" {
62+
type = string
63+
default = ""
64+
}
65+
66+
# NOTE(review): declared but not passed to the "memory_limits_low" module
# block shown in this commit - confirm whether it should be wired through.
variable "memory_limits_low_name_suffix" {
67+
type = string
68+
default = ""
69+
}
70+
71+
# Passed as `priority` to the "memory_limits_low" monitor module.
# Defaults to null.
variable "memory_limits_low_priority" {
72+
description = "Number from 1 (high) to 5 (low)."
73+
74+
type = number
75+
default = null
76+
}

memory_limits_low.tf

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Tag filter used inside the memory_limits_low monitor query.
# coalesce() returns its first argument that is neither null nor an empty
# string, so the per-monitor override wins when set and var.filter_str is the
# fallback.
locals {
2+
memory_limits_low_filter = coalesce(
3+
var.memory_limits_low_filter_override,
4+
var.filter_str
5+
)
6+
}
7+
8+
# Monitor: remaining memory headroom for pod limits.
# The query takes, per {host,cluster_name}, kubernetes.memory.capacity minus
# kubernetes.memory.limits and triggers when the average over the evaluation
# period is below var.memory_limits_low_critical.
#
# The module source was rendered as "[email protected]:..." (e-mail obfuscation
# of the SSH user@host part); restored to the standard GitHub SSH form.
#
# NOTE(review): the sibling cpu_limits_low monitor uses `max(...)` with `sum:`
# aggregation while this one uses `avg(...)` with `max:` - confirm the
# difference is intentional.
# NOTE(review): memory_limits_low_no_data_timeframe, _notify_no_data,
# _ok_threshold, _name_prefix and _name_suffix are declared as variables but
# are not passed to the module here - confirm whether that is intended.
module "memory_limits_low" {
9+
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.1"
10+
11+
name = "Available Memory for Limits Low"
12+
query = "avg(${var.memory_limits_low_evaluation_period}):max:kubernetes.memory.capacity{${local.memory_limits_low_filter}} by {host,cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,cluster_name} < ${var.memory_limits_low_critical}"
13+
alert_message = "Kubernetes cluster memory room for limits is too low"
14+
recovery_message = "Kubernetes cluster memory limits is ok again."
15+
16+
# monitor level vars
17+
enabled = var.memory_limits_low_enabled
18+
alerting_enabled = var.memory_limits_low_alerting_enabled
19+
critical_threshold = var.memory_limits_low_critical
20+
warning_threshold = var.memory_limits_low_warning
21+
priority = var.memory_limits_low_priority
22+
severity = var.memory_limits_low_severity
23+
docs = var.memory_limits_low_docs
24+
note = var.memory_limits_low_note
25+
26+
# module level vars (shared by all monitors in this module)
27+
env = var.alert_env
28+
service = var.service
29+
notification_channel = var.notification_channel
30+
additional_tags = var.additional_tags
31+
locked = var.locked
32+
}

0 commit comments

Comments
 (0)