Docker部署Prometheus监控

标签：exporter name labels been down Prometheus 监控 docker Docker

1. 拉取所需的镜像

docker pull prom/node-exporter 					# 用于机器系统数据收集
docker pull prom/mysqld-exporter				# 用于MySQL数据库数据收集
docker pull google/cadvisor						# 用于收集宿主机上的docker容器数据
docker pull prom/prometheus						# 监控&报警&时间序列数据库的组合
docker pull grafana/grafana						# 数据可视化平台
docker pull prom/alertmanager					# 处理Prometheus监控系统生成的警报
docker pull ncabatoff/process-exporter			# 监控进程

2. 创建子网

docker network create --subnet=172.30.1.0/24 prom

3. 启动node_exporter

docker run -d --name node-exporter --restart=always -p 9100:9100 --network prom --ip 172.30.1.2 \
-v "/proc:/host/proc:ro" -v "/sys:/host/sys:ro" -v "/:/rootfs:ro" prom/node-exporter

创建完容器后访问 http://本机ip:9100/metrics 能看到数据即可

4. 启动cadvisor

docker run -d --name=cadvisor --restart=always --network prom --ip 172.30.1.3 \
-v /:/rootfs:ro -v /var/run:/var/run:rw -v /sys:/sys:ro -v /var/lib/docker/:/var/lib/docker:ro -v /dev/disk/:/dev/disk:ro \
-p 8080:8080  google/cadvisor:latest

创建完容器后访问 http://本机ip:8080/metrics 能看到数据即可

5. 启动mysqld_exporter

进入数据库创建监控用户并授权

create user 'exporter'@'%' identified by 'exporter';
grant process, replication client, select on *.* to 'exporter'@'%'; 
flush privileges;

编辑.my.cnf (mysqld_exporter容器读取此配置文件连接数据库)

vim /opt/.my.cnf
# 文件配置如下
[client]
host=172.25.8.101 # 数据库ip
port=3306
user=exporter
password=exporter

创建运行mysqld_exporter容器

docker run -d --name mysqld_exporter --restart=always --network prom --ip 172.30.1.4 \
-p 9104:9104 -v /opt/.my.cnf:/.my.cnf  prom/mysqld-exporter

创建完容器后访问 http://本机ip:9104/metrics 能看到数据即可

6. 启动grafana

mkdir /opt/grafana-storage				
chmod 777 -R /opt/grafana-storage

docker run -d --name grafana --restart=always --network prom --ip 172.30.1.5 \
 -p 3000:3000 -v /opt/grafana-storage:/var/lib/grafana grafana/grafana

grafana web页面： http://本机ip:3000 # 账号admin 密码admin

7. 启动alertmanager

mkdir /opt/alertmanager
vim /opt/alertmanager/alertmanager.yml

# 设置报警服务
global:
  resolve_timeout: 1m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: 'xxxxxxxxx@qq.com'
  smtp_auth_username: 'xxxxxxxxx@qq.com'
  smtp_auth_password: 'xxxxxxxxx'						# 邮箱smtp认证密码
  smtp_require_tls: false
route:
  group_by: ['alertname']   #根据告警规则组名进行分组,默认这里就是用alertname就可以了,可以精确到每一个告警规则,alertname的取值就是Prometheus中rules中自定义的告警规则的名称,根据触发情况取值会有所变动
  group_wait: 10s           #分组内第一个告警等待时间,10s内如有第二个告警会合并一个告警
  group_interval: 10s       #发送新告警间隔时间
  repeat_interval: 10s      #重复告警间隔发送时间
  receiver: 'mail'          #发送给哪个接收人,定义一个名字,具体接收人是谁,可以在下面的该名字下定义
receivers:
  - name: 'mail'
    email_configs:
      - to: '2795298748@qq.com'
        send_resolved: true   #设置恢复时候也提醒恢复信息，默认没有配置,则恢复时候不会发送提示信息

docker run -d --name alertmanager --network prom --ip 172.30.1.6 -p 9093:9093 \
 -v /opt/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml prom/alertmanager

8. 启动Prometheus

mkdir /opt/prometheus
vim /opt/prometheus/prometheus.yml

global:
  scrape_interval:     60s
  evaluation_interval: 60s

alerting:
  alertmanagers:
    - static_configs:
      - targets:
           - 172.30.1.6:9093  #当安装了alertmanager,需要告警时可以指定alertmanager的ip和端口,若不用告警则可注释该行.
rule_files:
   - "/usr/local/prometheus/rules/*.yml"     #告警相关规则配置,不用可注释；rules是在Prometheus安装目录中创建的rules目录（此处为容器内路径,由宿主机/opt/prometheus/rules挂载而来）
     # - "second_rules.yml"  #告警相关规则配置,不用可注释

scrape_configs:

  - job_name: prometheus
    static_configs:
      - targets: ['127.0.0.1:9090']
        labels:
          instance: prometheus

  - job_name: linux
    static_configs:
      - targets: ['172.30.1.2:9100']
        labels:
          instance: localhost
   #  - targets: ['192.168.1.22:9100']  
   #这里添加targets，可以使用Prometheus监控其他装有node_exporter的节点，单节点则不需要
   #    labels:
   #      instance: 192.168.1.22

  - job_name: cadvisor
    static_configs:
      - targets: ['172.30.1.3:8080']
        labels:
          instance: cAdvisor

  - job_name: mysqld
    static_configs:
      - targets: ['172.30.1.4:9104']
      
  - job_name: process
    static_configs:
      - targets: ['172.30.1.8:9256']
        labels:
          instance: process

docker run  -d --name prometheus --restart=always -p 9090:9090 --network prom --ip 172.30.1.7 \
-v /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml  \
-v /opt/prometheus/rules/:/usr/local/prometheus/rules  prom/prometheus

启动完成后访问 http://本机ip:9090 可查看到监控项目详情

grafana：访问本机ip:3000 配置好数据源可看到类似下图的dashboard界面

Docker部署Prometheus监控_Prometheus

9. process-exporter监控icvs进程

配置监控项

mkdir -p /opt/process-exporter
vim /opt/process-exporter/process-exporter.yml
# 使用spid监控icvs各项服务，内容如下

process_names:
  - name: "{{.Matches}}"
    cmdline:
    - '000000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '001000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '002000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '003000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '004000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '005000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '006000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '008000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '010000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '013000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '014000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '017000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '022000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '027000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '033000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '034000100000000001'

  - name: "{{.Matches}}"
    cmdline:
    - '021000100000000001'

配置报警规则

vim /opt/prometheus/rules/process.yml
# yml文件内容如下

groups:
 - name: icvs_process_rule
   rules:
   - alert: CMS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:000000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "CMS {{ $labels.instance }}  has been down for more than 15s"
       description: "CMS has been down, This requires immediate action!"

   - alert: DBS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:001000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "DBS {{ $labels.instance }}  has been down for more than 15s"
       description: "DBS has been down, This requires immediate action!"

   - alert: PAS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:002000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "PAS {{ $labels.instance }}  has been down for more than 15s"
       description: "PAS has been down, This requires immediate action!"


   - alert: CAS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:003000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "CAS {{ $labels.instance }}  has been down for more than 15s"
       description: "CAS has been down, This requires immediate action!"

   - alert: VTDU ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:004000100000000001]"}) == 0
     for: 15s
     labels: 
       severity: error
     annotations:
       summary: "VTDU {{ $labels.instance }}  has been down for more than 15s"
       description: "VTDU has been down, This requires immediate action!"

   - alert: VTDU_Scheduler ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:005000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "VTDU_Scheduler {{ $labels.instance }}  has been down for more than 15s"
       description: "VTDU_Scheduler has been down, This requires immediate action!"

   - alert: AES ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:006000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "AES_7 {{ $labels.instance }}  has been down for more than 15s"
       description: "AES has been down, This requires immediate action!"


   - alert: LogDBS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:008000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "LogDBS {{ $labels.instance }}  has been down for more than 15s"
       description: "LogDBS has been down, This requires immediate action!"

   - alert: LAS_Scheduler ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:010000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "LAS_Scheduler {{ $labels.instance }}  has been down for more than 15s"
       description: "LAS_Scheduler has been down, This requires immediate action!"

   - alert: CSS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:013000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "CSS {{ $labels.instance }}  has been down for more than 15s"
       description: "CSS has been down, This requires immediate action!"

   - alert: CSS_Scheduler ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:014000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "CSS_Scheduler {{ $labels.instance }}  has been down for more than 15s"
       description: "CSS_Scheduler has been down, This requires immediate action!"


   - alert: CMSDeamon ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:017000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "CMS Deamon {{ $labels.instance }}  has been down for more than 15s"
       description: "CMS Deamon has been down, This requires immediate action!"

   - alert: IVSS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:022000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "IVSS {{ $labels.instance }}  has been down for more than 15s"
       description: "IVSS has been down, This requires immediate action!"

   - alert: StorageDBS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:027000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "StorageDBS {{ $labels.instance }}  has been down for more than 15s"
       description: "StorageDBS has been down, This requires immediate action!"

   - alert: MPS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:033000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "MPS {{ $labels.instance }}  has been down for more than 15s"
       description: "MPS has been down, This requires immediate action!"


   - alert: LocationDBS ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:034000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "LocationDBS {{ $labels.instance }}  has been down for more than 15s"
       description: "LocationDBS has been down, This requires immediate action!"

   - alert: SMSAuth ProcessDown
     expr: (namedprocess_namegroup_num_procs{groupname="map[:021000100000000001]"}) == 0
     for: 15s
     labels:
       severity: error
     annotations:
       summary: "SMSAuth {{ $labels.instance }}  has been down for more than 15s"
       description: "SMSAuth has been down, This requires immediate action!"

启动process-exporter

docker run -itd --name process_exporter  --network prom --ip 172.30.1.8 -p 9256:9256 \
--privileged -v /proc:/host/proc:ro -v /opt/process-exporter:/config  ncabatoff/process-exporter  \
--procfs /host/proc -config.path /config/process-exporter.yml

此时再查看Prometheus页面，会出现icvs各进程的状态项目

Docker部署Prometheus监控_Prometheus_02

10. 使用docker-compose管理上述容器

注：若使用docker-compose管理容器，上述步骤中的所有docker run命令都不需要执行；但各配置文件以及步骤2中的prom子网仍需按上文创建（compose文件将prom声明为外部网络，不会自动创建）

# 安装docker-compose
curl -L https://github.com/docker/compose/releases/download/v2.24.0-birthday.10/docker-compose-linux-x86_64 > /usr/local/bin/docker-compose				# 网络不好可在本机下载好再上传
chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
docker-compose --version				# 测试docker-compose是否安装成功

编辑docker-compose的yml文件

vim /mydata/docker-composes/prometheus.yml
# yml文件内容如下

version: "3"
services: 

  node-exporter: 
    image: prom/node-exporter
    container_name: node-exporter
    networks: 
      prom: 
        ipv4_address: 172.30.1.2
    ports: 
      - 9100:9100
    restart: always
    volumes: 
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
          
  cadvisor: 
    image: google/cadvisor
    container_name: cadvisor
    networks: 
      prom: 
        ipv4_address: 172.30.1.3
    ports: 
      - 8080:8080
    restart: always
    volumes: 
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
          
  mysqld_exporter: 
    image: prom/mysqld-exporter
    container_name: mysqld-exporter
    networks: 
      prom: 
        ipv4_address: 172.30.1.4
    ports: 
      - 9104:9104
    restart: always
    volumes: 
      - /opt/.my.cnf:/.my.cnf
          
  grafana: 
    image: grafana/grafana
    container_name: grafana
    networks: 
      prom: 
        ipv4_address: 172.30.1.5
    ports: 
      - 3000:3000
    restart: always
    volumes: 
      - /opt/grafana-storage:/var/lib/grafana
          
  alertmanager: 
    image: prom/alertmanager
    container_name: alertmanager
    networks: 
      prom: 
        ipv4_address: 172.30.1.6
    ports: 
      - 9093:9093
    restart: always
    volumes: 
      - /opt/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
          
  process-exporter: 
    image: ncabatoff/process-exporter
    container_name: process-exporter
    networks: 
      prom: 
        ipv4_address: 172.30.1.8
    ports: 
      - 9256:9256
    restart: always
    volumes: 
      - /proc:/host/proc:ro
      - /opt/process-exporter:/config
    command: 
      --procfs /host/proc
      -config.path /config/process-exporter.yml

  prometheus: 
    image: prom/prometheus
    container_name: prometheus
    networks: 
      prom: 
        ipv4_address: 172.30.1.7
    ports: 
      - 9090:9090
    restart: always
    volumes: 
      - /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - /opt/prometheus/rules/:/usr/local/prometheus/rules

# 声明外部网络
networks: 
  prom: 
    external: true

docker-compose启动监控容器

docker-compose -f /mydata/docker-composes/prometheus.yml up -d						# 启动文件中容器，-f 指定文件，-d 后台运行
docker-compose -f /mydata/docker-composes/prometheus.yml ps
docker-compose -f /mydata/docker-composes/prometheus.yml stop

Docker部署Prometheus监控_docker_03

配置完成，查看邮箱告警

Docker部署Prometheus监控_Prometheus_04

标签：exporter,name,labels,been,down,Prometheus,监控,docker,Docker
From： https://blog.51cto.com/u_16539916/9395529