采集+监控
1.LB
LB
配置文件:nginx 自带的 ngx_http_stub_status_module 提供的 /nginx_status(路径可自定义)端点,输出的是 nginx 自身的简单状态信息
vim InforSuiteLB/conf/InforSuiteLB.conf
location /nginx_status {
stub_status on;
# access_log off;
# allow 127.0.0.1;
# deny all;
}
配置好启动LB
即可
2.metrics
1.Nginx Prometheus Exporter
解压 tar 包,创建 systemd 服务文件并用 systemctl 管理启动
# 解压
tar -zxvf nginx-prometheus-exporter_1.1.0_linux_amd64.tar.gz
vim /etc/systemd/system/nginx-prometheus-exporter.service
[Unit]
Description=nginx-prometheus-exporter
Documentation=https://github.com/nginxinc/nginx-prometheus-exporter
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/local/nginx-prometheus-exporter \
--web.listen-address=:9113 \
--nginx.scrape-uri=http://192.168.209.132:80/nginx_status
Restart=on-failure
[Install]
WantedBy=multi-user.target
启动Nginx Prometheus Exporter
# 重新加载服务文件
systemctl daemon-reload
# 设置开机自启
systemctl enable nginx-prometheus-exporter.service
# 启动exporter
systemctl start nginx-prometheus-exporter.service
# 查看exporter状态
systemctl status nginx-prometheus-exporter.service
Nginx Prometheus Exporter
Nginx Prometheus Exporter 的作用:将 nginx 暴露出来的状态指标转为 Prometheus 可接收的 metrics 格式,供其他组件收集或拉取
2.prometheus 数据源
# 1 进入安装目录
cd /usr/local
# 2 下载安装包
wget https://github.com/prometheus/prometheus/releases/download/v2.42.0/prometheus-2.42.0.linux-amd64.tar.gz
# 3 解压
tar -zxvf prometheus-2.42.0.linux-amd64.tar.gz
# 4 重命名
mv prometheus-2.42.0.linux-amd64 prometheus
配置开机自启动
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus
After=network.target
Documentation=https://prometheus.io/
[Service]
Type=simple
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=:9090 --web.enable-lifecycle
Restart=on-failure
[Install]
WantedBy=multi-user.target
配置文件
vi prometheus/prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# NOTE(review): 4317 是下文 otelcol 的 OTLP gRPC 端口,按本文 otelcol 配置并未提供
# Prometheus remote_write/read 的 HTTP API —— 如需 remote_write,请确认 collector
# 配置了相应的 receiver,或改为指向真正支持 remote_write 的后端
remote_write:
- url: "http://192.168.209.132:4317/api/v1/write"
remote_read:
- url: "http://192.168.209.132:4317/api/v1/read"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ["192.168.209.132:9090"]
- job_name: 'nginx-stub-status'
static_configs:
- targets: ['192.168.209.132:1234'] # otelcol的地址和端口
启动
# 重新加载服务文件
systemctl daemon-reload
# 设置开机自启
systemctl enable prometheus
# 启动prometheus
systemctl start prometheus
# 查看prometheus状态
systemctl status prometheus
# 查看服务是否启动
lsof -i:9090
3.logs
1.loki
下载
mkdir /usr/local/loki
###下载二进制包
wget "https://github.com/grafana/loki/releases/download/v2.7.4/loki-linux-amd64.zip"
###解压二进制包
unzip "loki-linux-amd64.zip"
### make sure it is executable
chmod a+x "loki-linux-amd64"
配置
vim loki-local-config.yml
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
ingester:
lifecycler:
address: 127.0.0.1
ring:
kvstore:
store: inmemory
replication_factor: 1
final_sleep: 0s
chunk_idle_period: 10m
chunk_retain_period: 30s
schema_config:
configs:
- from: 2020-05-15
store: boltdb
object_store: filesystem
schema: v11
index:
prefix: index_
period: 168h
storage_config:
boltdb:
directory: /usr/local/loki/index
filesystem:
directory: /usr/local/loki/chunks # 块存储路径
limits_config:
enforce_metric_name: false
reject_old_samples: true # 是否拒绝老样本
reject_old_samples_max_age: 168h # 168小时之前的样本将会被删除
ingestion_rate_mb: 200
ingestion_burst_size_mb: 300
per_stream_rate_limit: 1000MB
max_entries_limit_per_query: 10000
chunk_store_config:
max_look_back_period: 168h # 为避免查询超过保留期的数据,必须小于或等于下方的时间值
table_manager:
retention_deletes_enabled: true # 保留删除开启
retention_period: 168h # 超过168h的块数据将被删除
ruler:
storage:
type: local
local:
directory: /usr/local/loki/rules
rule_path: /usr/local/loki/rules-temp
alertmanager_url: http://192.168.209.132:9093 # alertmanager地址
ring:
kvstore:
store: inmemory
enable_api: true
enable_alertmanager_v2: true
启动文件
vim restart-loki.sh
配置
#!/bin/bash
# Restart the local Loki binary: stop any running instance, then start a
# fresh one in the background using loki-local-config.yml.
# Must be run from the directory containing ./loki-linux-amd64.
set -u

echo "stop loki"
# Use pgrep/pkill instead of ps|grep|awk|xargs (the old pipeline ran
# `kill -9` with no arguments when nothing matched, printing an error).
# Try SIGTERM first so Loki can shut down cleanly; escalate to SIGKILL
# only if it is still alive after a grace period.
if pgrep -f loki-linux-amd64 >/dev/null; then
  pkill -f loki-linux-amd64 || true
  sleep 1
  pgrep -f loki-linux-amd64 >/dev/null && pkill -9 -f loki-linux-amd64
fi

echo "Begin start loki"
sleep 1
# Redirect output explicitly so nohup does not drop a nohup.out in the cwd.
nohup ./loki-linux-amd64 --config.file=loki-local-config.yml > ./loki-3100.log 2>&1 &
echo "loki started (pid $!)"
### 增加执行权限
chmod +x restart-loki.sh
### 启动
cd /usr/local/loki
./restart-loki.sh
2.日志代理 Promtail
下载
mkdir /usr/local/promtail
###下载二进制包
wget "https://github.com/grafana/loki/releases/download/v2.7.4/promtail-linux-amd64.zip"
###解压二进制包
unzip promtail-linux-amd64
### make sure it is executable
chmod a+x "promtail-linux-amd64"
配置
vim promtail-local-config.yml
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /usr/local/promtail/positions.yaml
clients:
- url: http://192.168.209.132:3100/loki/api/v1/push # 填写好Loki地址
scrape_configs:
- job_name: nginx
pipeline_stages:
- replace:
expression: '(?:[0-9]{1,3}\.){3}([0-9]{1,3})'
replace: '***'
static_configs:
- targets:
- localhost
labels:
job: nginx_access_log
host: appfelstrudel
agent: promtail
__path__: /usr/local/InforSuiteLB/logs/json_access.log
启动文件
vi restart-promtail.sh
配置
#!/bin/bash
# Restart the local Promtail binary: stop any running instance, then start
# a fresh one in the background, logging to ./promtail-9080.log.
# Must be run from the directory containing ./promtail-linux-amd64.
set -u

echo "Begin stop promtail"
# Use pgrep/pkill instead of ps|grep|awk|xargs (the old pipeline ran
# `kill -9` with no arguments when nothing matched, printing an error).
# Try SIGTERM first so Promtail can flush positions.yaml; escalate to
# SIGKILL only if it is still alive after a grace period.
if pgrep -f promtail-linux-amd64 >/dev/null; then
  pkill -f promtail-linux-amd64 || true
  sleep 1
  pgrep -f promtail-linux-amd64 >/dev/null && pkill -9 -f promtail-linux-amd64
fi

echo "Begin start promtail...."
nohup ./promtail-linux-amd64 --config.file=promtail-local-config.yml > ./promtail-9080.log 2>&1 &
echo "promtail started (pid $!)"
### 增加执行权限
chmod +x restart-promtail.sh
### 启动
cd /usr/local/promtail
./restart-promtail.sh
4.OpenTelemetry Collector
rpm
安装
rpm -ivh otelcol_0.94.0_linux_amd64.rpm
配置文件
vim /etc/otelcol/config.yaml
# To limit exposure to denial of service attacks, change the host in endpoints below from 0.0.0.0 to a specific network interface.
# See https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/security-best-practices.md#safeguards-against-denial-of-service-attacks
extensions:
health_check:
pprof:
endpoint: 0.0.0.0:1777
zpages:
endpoint: 0.0.0.0:55679
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
opencensus:
endpoint: 0.0.0.0:55678
# Collect own metrics
prometheus:
config:
scrape_configs:
- job_name: 'nginx-stub-status'
scrape_interval: 10s
static_configs:
- targets: ['192.168.209.132:9113']
labels:
job: 'nginx-stub-status'
jaeger:
protocols:
grpc:
endpoint: 0.0.0.0:14250
thrift_binary:
endpoint: 0.0.0.0:6832
thrift_compact:
endpoint: 0.0.0.0:6831
thrift_http:
endpoint: 0.0.0.0:14268
zipkin:
endpoint: 0.0.0.0:9411
processors:
batch:
exporters:
debug:
verbosity: detailed
prometheus/metrics:
endpoint: "192.168.209.132:1234"
service:
pipelines:
traces:
receivers: [otlp, opencensus, jaeger, zipkin]
processors: [batch]
exporters: [debug]
metrics:
receivers: [otlp, opencensus, prometheus]
processors: [batch]
exporters: [debug, prometheus/metrics]
logs:
receivers: [otlp]
processors: [batch]
exporters: [debug]
extensions: [health_check, pprof, zpages]
启动服务
# 重新加载服务文件
systemctl daemon-reload
# 设置开机自启
systemctl enable otelcol
# 启动otelcol
systemctl start otelcol
# 查看otelcol状态
systemctl status otelcol
5.grafana 安装
# 1 进入安装目录
cd /usr/local
# 2 下载安装包
wget https://dl.grafana.com/oss/release/grafana-9.4.3.linux-amd64.tar.gz
# 3 解压
tar -zxvf grafana-9.4.3.linux-amd64.tar.gz
# 4 重命名
mv grafana-9.4.3 grafana
配置开机自启动
# 创建grafana.service文件
vim /usr/lib/systemd/system/grafana.service
[Unit]
Description=Grafana
After=network.target
[Service]
Type=notify
ExecStart=/usr/local/grafana/bin/grafana-server -homepath /usr/local/grafana
Restart=on-failure
[Install]
WantedBy=multi-user.target
启动
# 重新加载服务文件
systemctl daemon-reload
# 设置开机自启
systemctl enable grafana
# 启动grafana
systemctl start grafana
# 查看grafana状态
systemctl status grafana
# 查看服务是否启动
lsof -i:3000
配置中文
default_language = zh-Hans
启动后,打开前端(3000端口)配置数据源、仪表盘(nginx
指标导入模板ID: 11199,nginx
日志导入模板ID: 12559)