1. 拉取所需的镜像
docker pull prom/node-exporter # 用于机器系统数据收集
docker pull prom/mysqld-exporter # 用于MySQL数据库数据收集
docker pull google/cadvisor # 用于收集宿主机上的docker容器数据
docker pull prom/prometheus # 监控&报警&时间序列数据库的组合
docker pull grafana/grafana # 数据可视化平台
docker pull prom/alertmanager # 处理Prometheus监控系统生成的警报
docker pull ncabatoff/process-exporter # 监控进程
2. 创建子网
docker network create --subnet=172.30.1.0/24 prom
3. 启动node_exporter
docker run -d --name node-exporter --restart=always -p 9100:9100 --network prom --ip 172.30.1.2 \
-v "/proc:/host/proc:ro" -v "/sys:/host/sys:ro" -v "/:/rootfs:ro" prom/node-exporter
创建完容器后访问 http://本机ip:9100/metrics 能看到数据即可
4. 启动cadvisor
docker run -d --name=cadvisor --restart=always --network prom --ip 172.30.1.3 \
-v /:/rootfs:ro -v /var/run:/var/run:rw -v /sys:/sys:ro -v /var/lib/docker/:/var/lib/docker:ro -v /dev/disk/:/dev/disk:ro \
-p 8080:8080 google/cadvisor:latest
创建完容器后访问 http://本机ip:8080/metrics 能看到数据即可
5. 启动mysqld_exporter
进入数据库创建监控用户并授权
create user 'exporter'@'%' identified by 'exporter';
grant process, replication client, select on *.* to 'exporter'@'%';
flush privileges;
编辑.my.cnf (mysqld_exporter容器读取此配置文件连接数据库)
vim /opt/.my.cnf
# 文件配置如下
[client]
host=172.25.8.101 # 数据库ip
port=3306
user=exporter
password=exporter
创建运行mysqld_exporter容器
docker run -d --name mysqld_exporter --restart=always --network prom --ip 172.30.1.4 \
-p 9104:9104 -v /opt/.my.cnf:/.my.cnf prom/mysqld-exporter
创建完容器后访问 http://本机ip:9104/metrics 能看到数据即可
6. 启动grafana
mkdir /opt/grafana-storage
chmod 777 -R /opt/grafana-storage
docker run -d --name grafana --restart=always --network prom --ip 172.30.1.5 \
-p 3000:3000 -v /opt/grafana-storage:/var/lib/grafana grafana/grafana
grafana web页面: http://本机ip:3000 # 账号admin 密码admin
7. 启动alertmanager
mkdir /opt/alertmanager
vim /opt/alertmanager/alertmanager.yml
# 设置报警服务
global:
  resolve_timeout: 1m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: 'xxxxxxxxx@qq.com' # 发件邮箱(占位符,请替换为实际邮箱地址)
  smtp_auth_username: 'xxxxxxxxx@qq.com' # 发件邮箱账号(占位符,请替换为实际邮箱地址)
  smtp_auth_password: 'xxxxxxxxx' # 邮箱smtp认证密码
  smtp_require_tls: false
route:
  group_by: ['alertname'] #根据告警规则组名进行分组,默认这里就是用alertname就可以了,可以精确到每一个告警规则,alertname的取值就是Prometheus中rules中自定义的告警规则的名称,根据触发情况取值会有所变动
  group_wait: 10s #分组内第一个告警等待时间,10s内如有第二个告警会合并一个告警
  group_interval: 10s #发送新告警间隔时间
  repeat_interval: 10s #重复告警间隔发送时间
  receiver: 'mail' #发送给哪个接收人,定义一个名字,具体接收人是谁,可以在下面的该名字下定
receivers:
- name: 'mail'
  email_configs:
  - to: 'xxxxxxxxx@qq.com' # 收件邮箱(占位符,请替换为实际邮箱地址)
    send_resolved: true #设置恢复时候也提醒恢复信息,默认没有配置,则恢复时候不会发送提示信息
docker run -d --name alertmanager --network prom --ip 172.30.1.6 -p 9093:9093 \
-v /opt/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml prom/alertmanager
8. 启动Prometheus
mkdir /opt/prometheus
vim /opt/prometheus/prometheus.yml
global:
  scrape_interval: 60s
  evaluation_interval: 60s
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 172.30.1.6:9093 #当安装了alertmanager,需要告警时可以指定alertmanager的ip和端口,若不用告警则可注释该行.
rule_files:
  - "/usr/local/prometheus/rules/*.yml" #告警相关规则配置,不用可注释,rules是在Prometheus安装目录中创建一个rules目录
  # - "second_rules.yml" #告警相关规则配置,不用可注释
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ['127.0.0.1:9090']
        labels:
          instance: prometheus
  - job_name: linux
    static_configs:
      - targets: ['172.30.1.2:9100']
        labels:
          instance: localhost
      # - targets: ['192.168.1.22:9100']
      #这里添加targets,可以使用Prometheus监控其他装有node_exporter的节点,单节点则不需要
      #   labels:
      #     instance: 192.168.1.22
  - job_name: cadvisor
    static_configs:
      - targets: ['172.30.1.3:8080']
        labels:
          instance: cAdvisor
  - job_name: mysqld
    static_configs:
      - targets: ['172.30.1.4:9104']
  - job_name: process
    static_configs:
      - targets: ['172.30.1.8:9256']
        labels:
          instance: process
docker run -d --name prometheus --restart=always -p 9090:9090 --network prom --ip 172.30.1.7 \
-v /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /opt/prometheus/rules/:/usr/local/prometheus/rules prom/prometheus
启动完成登录 本机ip:9090 可查看到监控项目详情
grafana:访问 本机ip:3000 配置好数据源可看到类似下图的dashboard界面
9. process-exporter监控icvs进程
配置监控项
mkdir -p /opt/process-exporter
vim /opt/process-exporter/process-exporter.yml
# 使用spid监控icvs各项服务,内容如下
process_names:
  - name: "{{.Matches}}"
    cmdline:
    - '000000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '001000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '002000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '003000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '004000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '005000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '006000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '008000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '010000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '013000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '014000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '017000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '022000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '027000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '033000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '034000100000000001'
  - name: "{{.Matches}}"
    cmdline:
    - '021000100000000001'
配置报警规则
mkdir -p /opt/prometheus/rules
vim /opt/prometheus/rules/process.yml
# yml文件内容如下
groups:
- name: icvs_process_rule
  rules:
  - alert: CMS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:000000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "CMS {{ $labels.instance }} has been down for more than 15s"
      description: "CMS has been down, This requires immediate action!"
  - alert: DBS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:001000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "DBS {{ $labels.instance }} has been down for more than 15s"
      description: "DBS has been down, This requires immediate action!"
  - alert: PAS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:002000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "PAS {{ $labels.instance }} has been down for more than 15s"
      description: "PAS has been down, This requires immediate action!"
  - alert: CAS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:003000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "CAS {{ $labels.instance }} has been down for more than 15s"
      description: "CAS has been down, This requires immediate action!"
  - alert: VTDU ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:004000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "VTDU {{ $labels.instance }} has been down for more than 15s"
      description: "VTDU has been down, This requires immediate action!"
  - alert: VTDU_Scheduler ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:005000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "VTDU_Scheduler {{ $labels.instance }} has been down for more than 15s"
      description: "VTDU_Scheduler has been down, This requires immediate action!"
  - alert: AES ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:006000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "AES_7 {{ $labels.instance }} has been down for more than 15s"
      description: "AES has been down, This requires immediate action!"
  - alert: LogDBS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:008000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "LogDBS {{ $labels.instance }} has been down for more than 15s"
      description: "LogDBS has been down, This requires immediate action!"
  - alert: LAS_Scheduler ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:010000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "LAS_Scheduler {{ $labels.instance }} has been down for more than 15s"
      description: "LAS_Scheduler has been down, This requires immediate action!"
  - alert: CSS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:013000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "CSS {{ $labels.instance }} has been down for more than 15s"
      description: "CSS has been down, This requires immediate action!"
  - alert: CSS_Scheduler ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:014000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "CSS_Scheduler {{ $labels.instance }} has been down for more than 15s"
      description: "CSS_Scheduler has been down, This requires immediate action!"
  - alert: CMSDeamon ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:017000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "CMS Deamon {{ $labels.instance }} has been down for more than 15s"
      description: "CMS Deamon has been down, This requires immediate action!"
  - alert: IVSS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:022000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "IVSS {{ $labels.instance }} has been down for more than 15s"
      description: "IVSS has been down, This requires immediate action!"
  - alert: StorageDBS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:027000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "StorageDBS {{ $labels.instance }} has been down for more than 15s"
      description: "StorageDBS has been down, This requires immediate action!"
  - alert: MPS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:033000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "MPS {{ $labels.instance }} has been down for more than 15s"
      description: "MPS has been down, This requires immediate action!"
  - alert: LocationDBS ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:034000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "LocationDBS {{ $labels.instance }} has been down for more than 15s"
      description: "LocationDBS has been down, This requires immediate action!"
  - alert: SMSAuth ProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:021000100000000001]"}) == 0
    for: 15s
    labels:
      severity: error
    annotations:
      summary: "SMSAuth {{ $labels.instance }} has been down for more than 15s"
      description: "SMSAuth has been down, This requires immediate action!"
启动process-exporter
docker run -itd --name process_exporter --network prom --ip 172.30.1.8 -p 9256:9256 \
--privileged -v /proc:/host/proc:ro -v /opt/process-exporter:/config ncabatoff/process-exporter \
--procfs /host/proc -config.path /config/process-exporter.yml
此时再查看Prometheus页面,会出现icvs各进程的状态项目
10. 使用docker-compose管理上述容器
注: 若使用docker-compose管理容器,上述步骤中的所有docker run命令都不需要操作
# 安装docker-compose
curl -L https://github.com/docker/compose/releases/download/v2.24.0-birthday.10/docker-compose-linux-x86_64 > /usr/local/bin/docker-compose # 网络不好可在本机下载好再上传
chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
docker-compose --version # 测试docker-compose是否安装成功
编辑docker-compose的yml文件
vim /mydata/docker-composes/prometheus.yml
# yml文件内容如下
version: "3"
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    networks:
      prom:
        ipv4_address: 172.30.1.2
    ports:
      - 9100:9100
    restart: always
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
  cadvisor:
    image: google/cadvisor
    container_name: cadvisor
    networks:
      prom:
        ipv4_address: 172.30.1.3
    ports:
      - 8080:8080
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
  mysqld_exporter:
    image: prom/mysqld-exporter
    container_name: mysqld-exporter
    networks:
      prom:
        ipv4_address: 172.30.1.4
    ports:
      - 9104:9104
    restart: always
    volumes:
      - /opt/.my.cnf:/.my.cnf
  grafana:
    image: grafana/grafana
    container_name: grafana
    networks:
      prom:
        ipv4_address: 172.30.1.5
    ports:
      - 3000:3000
    restart: always
    volumes:
      - /opt/grafana-storage:/var/lib/grafana
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    networks:
      prom:
        ipv4_address: 172.30.1.6
    ports:
      - 9093:9093
    restart: always
    volumes:
      - /opt/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
  process-exporter:
    image: ncabatoff/process-exporter
    container_name: process-exporter
    networks:
      prom:
        ipv4_address: 172.30.1.8
    ports:
      - 9256:9256
    restart: always
    volumes:
      - /proc:/host/proc:ro
      - /opt/process-exporter:/config
    command:
      --procfs /host/proc
      -config.path /config/process-exporter.yml
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    networks:
      prom:
        ipv4_address: 172.30.1.7
    ports:
      - 9090:9090
    restart: always
    volumes:
      - /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - /opt/prometheus/rules/:/usr/local/prometheus/rules
# 声明外部网络
networks:
  prom:
    external: true
docker-compose启动监控容器
docker-compose -f /mydata/docker-composes/prometheus.yml up -d # 启动文件中容器,-f 指定文件,-d 后台运行
docker-compose -f /mydata/docker-composes/prometheus.yml ps
docker-compose -f /mydata/docker-composes/prometheus.yml stop
配置完成,查看邮箱告警
标签:exporter,name,labels,been,down,Prometheus,监控,docker,Docker From: https://blog.51cto.com/u_16539916/9395529