标签：node Alertmanager -- Grafana sample prometheus 监控 k8s root

Prometheus

相关命令

docker network create monitoring

mkdir -p /etc/prometheus
vim /etc/prometheus/prometheus.yml

docker run -itd --name prometheus \
--net=monitoring \
-p 9090:9090 \
--restart always \
-v /etc/prometheus:/etc/prometheus \
-v prometheus-data:/prometheus \
prom/prometheus:v2.53.2

配置文件
/etc/prometheus/prometheus.yml

global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
    # - alertmanager:9093
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

运行实例记录

[root@k8s-sample ~]# docker network create monitoring
d622c0cbdd342bb819aa896c057782ac44ec359bcd3b7f9b30bd1cd0064dfc1d
[root@k8s-sample ~]# 
[root@k8s-sample ~]# mkdir -p /etc/prometheus
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
    # - alertmanager:9093
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker run -itd --name prometheus --net=monitoring -p 9090:9090 --restart always -v /etc/prometheus:/etc/prometheus -v prometheus-data:/prometheus prom/prometheus:v2.53.2
060917136c37c3e5f7c12866e25ab828aecfdc031e1bebb92c153c58e24a9051
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker ps 
CONTAINER ID   IMAGE                     COMMAND                  CREATED         STATUS         PORTS                                       NAMES
060917136c37   prom/prometheus:v2.53.2   "/bin/prometheus --c…"   6 seconds ago   Up 5 seconds   0.0.0.0:9090->9090/tcp, :::9090->9090/tcp   prometheus
[root@k8s-sample ~]#

可直接登录“http://:9090”页面

Grafana

相关命令

docker run -d --name=grafana \
--net=monitoring \
-p 3000:3000 \
--restart always \
-v grafana-data:/var/lib/grafana \
grafana/grafana

运行实例记录

[root@k8s-sample ~]# docker run -d --name=grafana \
--net=monitoring \
-p 3000:3000 \
--restart always \
-v grafana-data:/var/lib/grafana \
grafana/grafana
3e2ed40167581e3c0d836a9b6155a8b0bc37012a7b8e67baa45b2fdd474b0865
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker ps
CONTAINER ID   IMAGE                     COMMAND                  CREATED         STATUS         PORTS                                       NAMES
3e2ed4016758   grafana/grafana           "/run.sh"                5 seconds ago   Up 5 seconds   0.0.0.0:3000->3000/tcp, :::3000->3000/tcp   grafana
b0e8d55c2f2c   prom/prometheus:v2.53.2   "/bin/prometheus --c…"   6 minutes ago   Up 6 minutes   0.0.0.0:9090->9090/tcp, :::9090->9090/tcp   prometheus
[root@k8s-sample ~]# 
[root@k8s-sample ~]#

登录“http://:3000”页面，默认账号admin/admin，登录后更改密码。
添加数据源：Home --> Connections --> Data sources --> Add data source --> prometheus --> Connection 填写 http://:9090 --> Save & test

Exporter

Node Exporter

部署 Node Exporter
部署完成后，浏览器访问 http://:9100/metrics 页面可以查看到采集的指标数据。

[root@k8s-sample ~]# tar -xzvf node_exporter-1.8.2.linux-amd64.tar.gz -C /opt
node_exporter-1.8.2.linux-amd64/
node_exporter-1.8.2.linux-amd64/NOTICE
node_exporter-1.8.2.linux-amd64/node_exporter
node_exporter-1.8.2.linux-amd64/LICENSE
[root@k8s-sample ~]# 
[root@k8s-sample ~]# cd /opt/
[root@k8s-sample opt]# ln -sv node_exporter-1.8.2.linux-amd64 node_exporter
'node_exporter' -> 'node_exporter-1.8.2.linux-amd64'
[root@k8s-sample opt]# 
[root@k8s-sample opt]# useradd prometheus && echo "prometheus:prometheus"|chpasswd && chage -M 99999 prometheus
[root@k8s-sample opt]# 
[root@k8s-sample opt]# chown -R prometheus:prometheus /opt/node_exporter-1.8.2.linux-amd64/
[root@k8s-sample opt]# 
[root@k8s-sample opt]# ll /opt |grep node_exporter
lrwxrwxrwx  1 root       root        31 Oct 18 22:34 node_exporter -> node_exporter-1.8.2.linux-amd64
drwxr-xr-x  2 prometheus prometheus  56 Jul 14 19:58 node_exporter-1.8.2.linux-amd64
[root@k8s-sample opt]# 
[root@k8s-sample opt]# cd
[root@k8s-sample ~]# vim /usr/lib/systemd/system/node_exporter.service
[root@k8s-sample ~]# cat /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network-online.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@k8s-sample ~]# 
[root@k8s-sample ~]# systemctl daemon-reload
[root@k8s-sample ~]# systemctl enable node_exporter.service
Created symlink /etc/systemd/system/multi-user.target.wants/node_exporter.service → /usr/lib/systemd/system/node_exporter.service.
[root@k8s-sample ~]# systemctl start node_exporter.service
[root@k8s-sample ~]# systemctl status node_exporter.service
● node_exporter.service - node_exporter
     Loaded: loaded (/usr/lib/systemd/system/node_exporter.service; enabled; preset: disable>
     Active: active (running) since Fri 2024-10-18 22:36:19 CST; 7s ago
       Docs: https://prometheus.io/
   Main PID: 8177 (node_exporter)
      Tasks: 5 (limit: 48820)
     Memory: 4.7M
        CPU: 9ms
     CGroup: /system.slice/node_exporter.service
             └─8177 /opt/node_exporter/node_exporter

Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.349Z caller=tls_confi>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.349Z caller=tls_confi>
lines 1-21/21 (END)
[root@k8s-sample ~]#

Prometheus添加监控指标
添加完成后，可以在Web UI页面导航栏的 Status 中选择 Targets 查看监控目标。

[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml 
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml 
global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
    # - alertmanager:9093
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "linux-server"
    metrics_path: "/metrics" # 指标接口路径，默认/metrics
    scheme: http # 连接协议，默认http
    static_configs:
      - targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#

Grafana 导入仪表盘
Grafana页面 --> 左侧菜单栏Dashboards --> New --> New dashboard --> Import a dashboard --> 输入仪表盘ID 12633
--> Load加载 --> 设置仪表盘名称和数据源 --> Import完成导入 --> Dashboards看到对应的仪表盘页面。

cAdvisor Exporter

https://github.com/google/cadvisor

[root@k8s-sample ~]# docker pull swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
v0.49.1: Pulling from ddn-k8s/gcr.io/cadvisor/cadvisor-amd64
619be1103602: Pull complete 
3b8469b194b8: Pull complete 
6361eeb1639c: Pull complete 
4f4fb700ef54: Pull complete 
902eccca70f3: Pull complete 
Digest: sha256:00ff3424f13db8d6d62778253e26241c45a8d53343ee09944a474bf88d3511ac
Status: Downloaded newer image for swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker tag  swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1  gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]# docker rmi swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]# docker images |grep cadvisor
gcr.io/cadvisor/cadvisor-amd64   v0.49.1   c02cf39d3dba   7 months ago    80.8MB
[root@k8s-sample ~]# 

[root@k8s-sample ~]# docker images |grep cadvisor
gcr.io/cadvisor/cadvisor-amd64   v0.49.1   c02cf39d3dba   7 months ago    80.8MB
[root@k8s-sample ~]# docker run -d --name=cadvisor \
--publish=8080:8080 \
--restart always \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:ro \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--detach=true \
--privileged \
--device=/dev/kmsg \
gcr.io/cadvisor/cadvisor-amd64:v0.49.1
56e4af8073bc960dfeffadb9e962c4107ae482d88cb3e29a651ba4c443962ba0
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker ps 
CONTAINER ID   IMAGE                                    COMMAND                  CREATED         STATUS                            PORTS                                       NAMES
56e4af8073bc   gcr.io/cadvisor/cadvisor-amd64:v0.49.1   "/usr/bin/cadvisor -…"   7 seconds ago   Up 5 seconds (health: starting)   0.0.0.0:8080->8080/tcp, :::8080->8080/tcp   cadvisor
279f91ec6f9f   prom/alertmanager                        "/bin/alertmanager -…"   47 hours ago    Up 22 minutes                     0.0.0.0:9093->9093/tcp, :::9093->9093/tcp   alertmanager
3e2ed4016758   grafana/grafana                          "/run.sh"                2 days ago      Up 22 minutes                     0.0.0.0:3000->3000/tcp, :::3000->3000/tcp   grafana
b0e8d55c2f2c   prom/prometheus:v2.53.2                  "/bin/prometheus --c…"   2 days ago      Up 22 minutes                     0.0.0.0:9090->9090/tcp, :::9090->9090/tcp   prometheus
[root@k8s-sample ~]#

直接访问如下页面

http://:8080 查看cadvisor的相关信息
http://:8080/metrics 查看采集的指标数据

在Prometheus添加监控指标
添加完成后，可以在Web UI页面导航栏的 Status 中选择 Targets 查看监控目标。

[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
        - 192.168.16.170:9093
rule_files:
  - "./rules/linux-server.yml"
  - "./rules/general.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "linux-server"
    metrics_path: "/metrics" # 指标接口路径，默认/metrics
    scheme: http # 连接协议，默认http
    static_configs:
      - targets: ["192.168.16.170:9100"]
  - job_name: "docker-server"
    static_configs:
      - targets: ["192.168.16.170:8080"]
[root@k8s-sample ~]#  
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#

Grafana导入仪表盘
Grafana页面 --> 左侧菜单栏Dashboards --> New --> New dashboard --> Import a dashboard --> 输入仪表盘ID 14282
--> Load加载 --> 设置仪表盘名称和数据源 --> Import完成导入 --> Dashboards看到对应的仪表盘页面。

Alertmanager

相关命令

mkdir -p /etc/alertmanager

vim /etc/alertmanager/alertmanager.yml

docker run -d --name=alertmanager \
--net=monitoring \
-v /etc/alertmanager:/etc/alertmanager \
-p 9093:9093 \
--restart always \
prom/alertmanager

编写配置文件
/etc/alertmanager/alertmanager.yml

global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'XXXXXX'
  smtp_require_tls: false
route:
  receiver: 'default-receiver'
  group_by: [alertname]
  group_wait: 1m
  group_interval: 5m
  repeat_interval: 30m
receivers:
- name: 'default-receiver'
  email_configs:
  - to: '[email protected]'
    send_resolved: true

[root@k8s-sample ~]# vim /etc/alertmanager/alertmanager.yml
[root@k8s-sample ~]# cat /etc/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'XXXXXX'
  smtp_require_tls: false
route:
  receiver: 'default-receiver'
  group_by: [alertname]
  group_wait: 1m
  group_interval: 5m
  repeat_interval: 30m
receivers:
- name: 'default-receiver'
  email_configs:
  - to: '[email protected]'
    send_resolved: true
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker run -d --name=alertmanager --net=monitoring -v /etc/alertmanager:/etc/alertmanager -p 9093:9093 --restart always prom/alertmanager
279f91ec6f9fe6f154e99b1d110e754361ad7f2c20066967b290990d72b395a0
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker ps
CONTAINER ID   IMAGE                     COMMAND                  CREATED         STATUS         PORTS                                       NAMES
279f91ec6f9f   prom/alertmanager         "/bin/alertmanager -…"   5 seconds ago   Up 4 seconds   0.0.0.0:9093->9093/tcp, :::9093->9093/tcp   alertmanager
3e2ed4016758   grafana/grafana           "/run.sh"                2 hours ago     Up 2 hours     0.0.0.0:3000->3000/tcp, :::3000->3000/tcp   grafana
b0e8d55c2f2c   prom/prometheus:v2.53.2   "/bin/prometheus --c…"   2 hours ago     Up 2 hours     0.0.0.0:9090->9090/tcp, :::9090->9090/tcp   prometheus
[root@k8s-sample ~]#

更新Prometheus配置文件，指定alertmanager的访问地址

[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml 
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml 
global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
        - 192.168.16.170:9093
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "linux-server"
    metrics_path: "/metrics" # 指标接口路径，默认/metrics
    scheme: http # 连接协议，默认http
    static_configs:
      - targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#

可以直接登录“http://:9093”页面

Alertmanager 告警规则

相关命令

mkdir -p /etc/prometheus/rules

vim /etc/prometheus/rules/linux-server.yml
vim /etc/prometheus/rules/general.yml

vim /etc/prometheus/prometheus.yml
docker exec -it prometheus kill -HUP 1

创建告警规则文件（主机资源使用率）

groups: # 告警规则组
- name: Linux-Server  # 告警规则组名称
  rules: # 规则
  - alert: HighCPUUsage # 告警名称
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[2m])) by (instance) * 100) > 80 # 触发告警的表达式
    for: 2m # 定义触发告警的持续时间
    labels: # 告警事件的标签
      severity: warning # 定义告警级别
    annotations:
      summary: "{{ $labels.instance }} CPU使用率超过80%"
      description: "{{ $labels.instance }} CPU使用率超过80%，当前值: {{ $value }}"
  - alert: HighMemoryUsage
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} 内存使用率超过80%"
      description: "{{ $labels.instance }} 内存使用率超过80%，当前值: {{ $value }}"
  - alert: HighDiskSpaceUsage
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} {{ $labels.mountpoint }} 分区使用率超过80%"
      description: "{{ $labels.instance }} {{ $labels.mountpoint }} 分区使用率超过80%，当前值: {{ $value }}"

创建告警规则文件（监控目标无法连接）

groups:
- name: General
  rules:
  - alert: InstanceDown
    expr: up == 0 # "up"是内置指标，0表示存活状态，1表示无法连接
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} 连接失败"
      description: "{{ $labels.instance }} 连接失败，可能是服务器故障！"

更新Prometheus的配置文件

[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
    scrape_interval: 15s
    evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
      - targets:
        - 192.168.16.170:9093
rule_files:
  - "./rules/linux-server.yml"  # 相对路径
  - "./rules/general.yml" # 相对路径
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "linux-server"
    metrics_path: "/metrics" # 指标接口路径，默认/metrics
    scheme: http # 连接协议，默认http
    static_configs:
      - targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]# 
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#

查看告警规则信息，在如下页面均可以查看到已定义告警规则的相关信息

告警页面 http://:9090/alerts
规则页面 http://:9090/rules
配置页面 http://:9090/config

测试与验证
通过压力测试工具stress模拟cpu使用率过载告警。
告警触发后，可以在 http://:9090/alerts 和 http://:9093/#/alerts 页面看到告警的相关信息。

[root@k8s-sample ~]# dnf install -y epel-release && dnf install stress -y
[root@k8s-sample ~]# stress --version
stress 1.0.4
[root@k8s-sample ~]# 
[root@k8s-sample ~]# stress --cpu 8
stress: info: [41378] dispatching hogs: 8 cpu, 0 io, 0 vm, 0 hdd
^C
[root@k8s-sample ~]#

自定义告警内容模版

自定义告警内容可以更直观显示关键信息，提高可读性。

在/etc/alertmanager目录下创建.tmpl结尾的模版文件
Alertmanager配置文件中，通过templates字段指定告警模版文件的路径，在接收者配置中指定模版名称
Alertmanager重新加载配置文件

标签：node,Alertmanager,--,Grafana,sample,prometheus,监控,k8s,root
From： https://www.cnblogs.com/anliven/p/18491599

监控工具 - 快速创建Prometheus-Grafana-Alertmanager监控系统

Prometheus

Grafana

Exporter

Node Exporter

cAdvisor Exporter

Alertmanager

Alertmanager 告警规则

自定义告警内容模版

相关文章

赞助商

阅读排行