Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: PrometheusConfigurationReload
expr: prometheus_config_last_reload_successful
!= 1
for: 5m
labels:
severity: critical
annotations:
description: |-
Prometheus configuration reload error
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Prometheus configuration reload (instance {{ $labels.instance }})
|
ok
|
|
4.05s ago
|
145.2us |
alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered
< 1
for: 5m
labels:
severity: critical
annotations:
description: |-
Prometheus cannot connect to the alertmanager
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
ok
|
|
4.05s ago
|
49.32us |
alert: ExporterDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
description: |-
Prometheus exporter down
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Exporter down (instance {{ $labels.instance }})
|
ok
|
|
4.05s ago
|
10.39ms |
alert: OutOfMemory
expr: (100
- (((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes
- node_memory_Cached_bytes - node_memory_Shmem_bytes) / node_memory_MemTotal_bytes)
* 100)) < 5
for: 5m
labels:
severity: critical
annotations:
description: |-
Node memory is filling up (< 5% left)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Out of memory (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
230.9us |
alert: UnusualNetworkThroughputIn
expr: sum
by(instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual network throughput in (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
118.4us |
alert: UnusualNetworkThroughputOut
expr: sum
by(instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual network throughput out (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
90.36us |
alert: UnusualDiskReadRate
expr: sum
by(instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk is probably reading too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual disk read rate (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
96.05us |
alert: UnusualDiskWriteRate
expr: sum
by(instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk is probably writing too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual disk write rate (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
103.1us |
alert: OutOfDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/"}
* 100) / node_filesystem_size_bytes{mountpoint="/"} < 10
for: 5m
labels:
severity: critical
annotations:
description: |-
Disk is almost full (< 10% left)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Out of disk space (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
98.39us |
alert: OutOfInodes
expr: node_filesystem_files_free{mountpoint="/"}
/ node_filesystem_files{mountpoint="/"} * 100 < 10
for: 5m
labels:
severity: critical
annotations:
description: |-
Disk is almost running out of available inodes (< 10% left)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Out of inodes (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
77.52us |
alert: UnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m])
/ rate(node_disk_reads_completed_total[1m]) > 0.1
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (read operations > 100ms)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual disk read latency (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
131.2us |
alert: UnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m])
/ rate(node_disk_writes_completed_total[1m]) > 0.1
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (write operations > 100ms)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Unusual disk write latency (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
136.4us |
alert: HighCpuLoad
expr: 100
- (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
> 90
for: 5m
labels:
severity: critical
annotations:
description: |-
CPU load is > 90%
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: High CPU load (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
94.09us |
alert: SwapIsFillingUp
expr: (1
- (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: |-
Swap is filling up (> 90%)
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Swap is filling up (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
74.62us |
alert: PhysicalComponentTooHot
expr: node_hwmon_temp_celsius
> 75
for: 5m
labels:
severity: warning
annotations:
description: |-
Physical hardware component too hot
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Physical component too hot (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
440.9us |
alert: NodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm
== 1
for: 5m
labels:
severity: critical
annotations:
description: |-
Physical node temperature alarm triggered
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Node overtemperature alarm (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
33.49us |
alert: SslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry
- time() < 86400 * 7
for: 5m
labels:
severity: warning
annotations:
description: |-
SSL certificate expires in less than 7 days
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: SSL certificate will expire soon (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
53.2us |
alert: SslCertificateExpired
expr: probe_ssl_earliest_cert_expiry
- time() <= 0
for: 5m
labels:
severity: critical
annotations:
description: |-
SSL certificate has expired already
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: SSL certificate expired (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
39.93us |
alert: SlowProbe
expr: avg_over_time(probe_duration_seconds[1m])
> 2
for: 5m
labels:
severity: warning
annotations:
description: |-
Blackbox probe took more than 2s to complete
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Slow probe (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
65.44us |
alert: HttpStatusCode
expr: probe_http_status_code
<= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
description: |-
HTTP status code is not 200-399
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: HTTP Status Code (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
249.9us |
alert: HttpSlowRequests
expr: avg_over_time(probe_http_duration_seconds[1m])
> 1
for: 5m
labels:
severity: warning
annotations:
description: |-
HTTP request took more than 1s
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: HTTP slow requests (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
165.6us |
alert: SlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m])
> 1
for: 5m
labels:
severity: warning
annotations:
description: |-
Blackbox ping took more than 1s
VALUE = {{ $value }}
LABELS: {{ $labels }}
summary: Slow ping (instance {{ $labels.instance }})
|
ok
|
|
4.04s ago
|
40.04us |
|
35.167s ago |
156us |