1.主机及服务存活状态设置报警
2.内存使用率设置报警
3.cpu繁忙百分比设置报警
4.cpu iowait报警
5.disk 使用率百分比报警
6.网卡流量监控
1.主机及服务存活状态设置报警
1.1定义主机标签
# Scrape job for host 47.100.70.42 (cluster "pre"). This entry is one item of
# the `scrape_configs:` list in prometheus.yml.
- job_name: pre-yzfs-node-47.100.70.42
  static_configs:
  - targets: ['172.16.40.153:19100']
    # Extra labels attached to every series scraped from the targets above.
    labels:
      node_name: cloud-pre-47.100.70.42
      cluster_name: pre
1.2设定规则
groups:
# Fires when a scrape target has been unreachable for one minute.
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    # `up` is 0 when the last scrape of a target failed. Matching every job
    # (rather than only job="prometheus" / job="node") also covers jobs with
    # other names, e.g. pre-yzfs-node-47.100.70.42.
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      value: "{{ $value }}"
2.内存使用率设置报警
公式:
(total内存-(free内存+buffer内存+cache内存))/total内存*100
通过公式计算出使用内存的百分比
groups:
# Memory usage alert: used = total - (free + buffers + cached), as a
# percentage of total memory.
- name: 内存告警规则
  rules:
  - alert: "内存使用率告警"
    # PromQL ignores whitespace, so the expression is split for readability.
    expr: |-
      (
        node_memory_MemTotal_bytes
        - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
      ) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # The server is identified by the `instance` label; `alertname` only
      # holds the name of this alert. The text matches the 80% threshold.
      summary: "服务器: {{ $labels.instance }} 内存报警"
      description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
3.cpu繁忙百分比设置报警
公式:
(1-空闲状态cpu时间/所有状态cpu时间)*100
groups:
# CPU busy alert: 100 minus the average idle percentage across all cores of
# an instance.
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    # `*` binds tighter than `-`, and the comparison is applied last, so this
    # reads as (100 - (avg_idle_fraction * 100)) > 70.
    expr: 100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100 > 70
    # Alternative based on increase() over one minute:
    # expr: (1-sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)/sum(increase(node_cpu_seconds_total[1m])) by (instance))*100 > 70
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # `instance` is the only label left after the aggregation and it is the
      # one that identifies the server.
      summary: "服务器: {{ $labels.instance }} CPU报警"
      description: "服务器: {{ $labels.instance }} CPU使用超过70%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
4.cpu iowait报警
公式:
(cpu_iowait_time[1m]/cpu_total[1m])*100
groups:
# CPU iowait alert: share of CPU time spent in the iowait mode over the last
# minute, per instance.
- name: CPU Iowaite 报警规则
  rules:
  - alert: CPU Iowait 报警
    expr: |-
      (
        sum by (instance) (increase(node_cpu_seconds_total{mode="iowait"}[1m]))
        / sum by (instance) (increase(node_cpu_seconds_total[1m]))
      ) * 100 > 60
    for: 30s
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Identify the server by its `instance` label (the only label kept by
      # the aggregation above).
      summary: "服务器: {{ $labels.instance }} CPU Iowait 报警"
      description: "服务器: {{ $labels.instance }} CPU Iowait 超过60%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
5.disk 使用率百分比
公式:
(disk_total_size-disk_avail_size)/disk_total_size *100
groups:
# Filesystem usage alert: (size - available) / size, as a percentage, per
# filesystem series.
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # The binary operators keep the labels of the filesystem series, so
      # instance, device and mountpoint are available to the templates.
      summary: "服务器: {{ $labels.instance }} 磁盘报警"
      description: "服务器:{{ $labels.instance }},磁盘设备: {{ $labels.device }} 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      value: "{{ $value }}"
6.网卡流量监控报警
公式:
(进网流量[1m]+出网流量[1m])/1024(KB)/1024(MB)
groups:
# Network throughput alert: transmit + receive rate per interface, converted
# from bytes/s to MiB/s.
- name: 网卡流量监控
  rules:
  - alert: 网卡流量
    # Both directions use the same 1m window. Label regexes are fully
    # anchored, so virtual bridges need `virbr.*` (plain `virbr*` would not
    # match e.g. virbr0) and the loopback device is matched as `lo`.
    expr: |-
      (
        irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
        + irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
      ) / 1024 / 1024 > 4
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # `instance` identifies the server and `device` the interface.
      summary: "服务器: {{ $labels.instance }} 网卡流量报警"
      description: "服务器:{{ $labels.instance }},网卡 {{ $labels.device }} 流量超过4M!(当前值: {{ $value }}M)"
      value: "{{ $value }}"
总配置文件
[root@iZuf6fzcihc5izn2c1vz9yZ rules]# cat node_status.yml
groups:
# Fires when a scrape target has been unreachable for one minute.
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    # `up` is 0 when the last scrape of a target failed. Matching every job
    # (rather than only job="prometheus" / job="node") also covers jobs with
    # other names, e.g. pre-yzfs-node-47.100.70.42.
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      value: "{{ $value }}"
# Memory usage alert: used = total - (free + buffers + cached), as a
# percentage of total memory.
- name: 内存告警规则
  rules:
  - alert: "内存使用率告警"
    # PromQL ignores whitespace, so the expression is split for readability.
    expr: |-
      (
        node_memory_MemTotal_bytes
        - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
      ) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # The server is identified by the `instance` label; `alertname` only
      # holds the name of this alert. The text matches the 80% threshold.
      summary: "服务器: {{ $labels.instance }} 内存报警"
      description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
# CPU busy alert: 100 minus the average idle percentage across all cores of
# an instance.
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    # `*` binds tighter than `-`, and the comparison is applied last, so this
    # reads as (100 - (avg_idle_fraction * 100)) > 70.
    expr: 100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100 > 70
    # Alternative based on increase() over one minute:
    # expr: (1-sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)/sum(increase(node_cpu_seconds_total[1m])) by (instance))*100 > 70
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # `instance` is the only label left after the aggregation and it is the
      # one that identifies the server.
      summary: "服务器: {{ $labels.instance }} CPU报警"
      description: "服务器: {{ $labels.instance }} CPU使用超过70%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
# CPU iowait alert: share of CPU time spent in the iowait mode over the last
# minute, per instance.
- name: CPU Iowaite 报警规则
  rules:
  - alert: CPU Iowait 报警
    expr: |-
      (
        sum by (instance) (increase(node_cpu_seconds_total{mode="iowait"}[1m]))
        / sum by (instance) (increase(node_cpu_seconds_total[1m]))
      ) * 100 > 60
    for: 30s
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Identify the server by its `instance` label (the only label kept by
      # the aggregation above).
      summary: "服务器: {{ $labels.instance }} CPU Iowait 报警"
      description: "服务器: {{ $labels.instance }} CPU Iowait 超过60%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"
# Filesystem usage alert: (size - available) / size, as a percentage, per
# filesystem series.
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # The binary operators keep the labels of the filesystem series, so
      # instance, device and mountpoint are available to the templates.
      summary: "服务器: {{ $labels.instance }} 磁盘报警"
      description: "服务器:{{ $labels.instance }},磁盘设备: {{ $labels.device }} 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      value: "{{ $value }}"
# Network throughput alert: transmit + receive rate per interface, converted
# from bytes/s to MiB/s.
- name: 网卡流量监控
  rules:
  - alert: 网卡流量
    # Both directions use the same 1m window. Label regexes are fully
    # anchored, so virtual bridges need `virbr.*` (plain `virbr*` would not
    # match e.g. virbr0) and the loopback device is matched as `lo`.
    expr: |-
      (
        irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
        + irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
      ) / 1024 / 1024 > 4
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # `instance` identifies the server and `device` the interface.
      summary: "服务器: {{ $labels.instance }} 网卡流量报警"
      description: "服务器:{{ $labels.instance }},网卡 {{ $labels.device }} 流量超过4M!(当前值: {{ $value }}M)"
      value: "{{ $value }}"
# System load alert on the 15-minute load average.
- name: 系统15分钟负载报警规则
  rules:
  - alert: 系统15分钟负载告警
    # node_load15 is the 15-minute load average, which is what the group and
    # alert names describe (node_load5 would be the 5-minute average).
    # NOTE(review): 5.6 is presumably 70% of an 8-core host - confirm and
    # adjust per machine size.
    expr: node_load15 > 5.6
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # The value is a load average, not a percentage, so no "%" suffix.
      summary: "服务器: {{ $labels.instance }} 系统负载报警"
      description: "服务器:{{ $labels.instance }},系统15分钟负载超过5.6!(当前值: {{ $value }})"
      value: "{{ $value }}"
标签:node,设定,报警,labels,bytes,value,Prometheus,1m
From: https://www.cnblogs.com/yangtao416/p/17066337.html