###alertmanager的目录
[root@test /data/software/alertmanager]# ll
total 62512
-rwxr-xr-x 1 3434 3434 35410965 Aug 24 19:12 alertmanager
-rw-r--r-- 1 3434 3434 727 Nov 30 14:33 alertmanager.yml
-rwxr-xr-x 1 3434 3434 28566971 Aug 24 19:13 amtool
-rw-r--r-- 1 3434 3434 11357 Aug 24 19:14 LICENSE
-rw-r--r-- 1 3434 3434 457 Aug 24 19:14 NOTICE
-rw-r--r-- 1 root root 1305 Nov 30 17:35 wechat.tmpl
###alertmanager的配置文件
[root@test /data/software/alertmanager]# cat alertmanager.yml
# Alertmanager configuration: every alert goes to the single 'wechat'
# receiver (WeCom / WeChat Work) and is rendered with the custom templates
# loaded from the glob under `templates`.
route:
  # Alerts that share an alertname are batched into one notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'wechat'
  # receiver: 'web.hook'

# Previous webhook-based receiver, kept for reference.
#receivers:
#  - name: 'web.hook'
#    webhook_configs:
#      - url: 'http://127.0.0.1:8080/adapter/wx'
#        send_resolved: false

templates:
  - '/data/software/alertmanager/*.tmpl'

receivers:
  - name: 'wechat'
    wechat_configs:
      # The values below are placeholders; fill in the real WeCom
      # application secret, corp ID, agent ID and recipient ID.
      - api_secret: '企微应用的key'
        corp_id: '企微公司id'
        agent_id: '企微应用id'
        #to_party: '1'  # department ID in WeCom
        to_user: '要发送人的id'
        send_resolved: true

# Suppress lower-severity notifications while a higher-severity alert with
# the same alertname and instance is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
  # Rules may also emit severity 'emergency'; without this entry an
  # emergency alert would not silence the critical/warning tiers.
  - source_match:
      severity: 'emergency'
    target_match_re:
      severity: 'warning|critical'
    equal: ['alertname', 'instance']
###消息通知的模板
[root@test /data/software/alertmanager]# cat wechat.tmpl
{{ define "wechat.default.message" }}
{{- /*
  Body of the WeCom notification.
  Renders at most one "firing" section and one "resolved" section per
  notification; each section shows only the first alert of that state
  (index 0). The sections iterate over .Alerts.Firing / .Alerts.Resolved
  rather than the unfiltered .Alerts, so a firing section never shows a
  resolved alert and vice versa.
  Timestamps are shifted by 28800e9 ns (8 hours) before formatting.
*/ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
========= 监控报警 =========
告警状态:{{ $alert.Status }}
告警级别:{{ $alert.Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ $alert.Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
========= 异常恢复 =========
告警状态:{{ $alert.Status }}
告警类型:{{ $alert.Labels.alertname }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- /* The instance line is emitted only when the label is non-empty. */ -}}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}
###prometheus配置文件
[root@test /data/software/prometheus]# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # host:port on which Alertmanager is listening

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/data/software/prometheus/rules/*.yml" # directory where the alerting rule files are kept
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus-server"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["prometheus.com:9090"]
###cpu告警规则
[root@test /data/software/prometheus/rules]# cat cpu_over.yml
# CPU usage alerting rules in three tiers (50% / 70% / 90%), all sharing one
# alert name and differing only in threshold and severity label.
#
# Expression: the per-second rate of idle-mode CPU time is averaged per
# instance, scaled to a percentage and subtracted from 100 to give the busy
# percentage. `rate` is used instead of `irate` because `irate` looks only at
# the last two samples, so a brief dip can reset the `for:` timer.
groups:
  - name: CPU报警规则50
    rules:
      - alert: CPU使用率告警
        expr: 100 - (avg by (instance)(rate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率正在飙升。注意!!!"
          description: "CPU使用率超过50%(当前值:{{ $value }}%)"
  - name: CPU报警规则70
    rules:
      - alert: CPU使用率告警
        expr: 100 - (avg by (instance)(rate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 70
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "CPU使用率正在飙升。关注!!!"
          description: "CPU使用率超过70%(当前值:{{ $value }}%)"
  - name: CPU报警规则90
    rules:
      - alert: CPU使用率告警
        expr: 100 - (avg by (instance)(rate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
        for: 1m
        labels:
          severity: emergency
        annotations:
          summary: "CPU使用率正在飙升。严重,立即处理!!!"
          description: "CPU使用率超过90%(当前值:{{ $value }}%)"
告警规则可参考:
https://www.modb.pro/db/335991
https://blog.csdn.net/agonie201218/article/details/126243110
https://cloud.tencent.com/developer/article/2216582?areaSource=102001.2&traceId=pWGCiwZuYxp0FamqoV8-w
标签:alertmanager,end,3434,alert,prometheus,告警,企微,CPU
From: https://www.cnblogs.com/world-of-yuan/p/17958825