Preparation
Official site:
Create the namespace:
mkdir -p /root/prometheus && cd /root/prometheus
cat > monitor-sa.yaml <<"EOF"
kind: Namespace
apiVersion: v1
metadata:
  name: monitor-sa
EOF
Create and verify the namespace:
kubectl apply -f monitor-sa.yaml
kubectl get namespace monitor-sa
NAME STATUS AGE
monitor-sa Active 37s
1. Installing node-exporter
Image tags: https://hub.docker.com/r/prom/node-exporter/tags
GitHub: https://github.com/prometheus/node_exporter
mkdir -p /root/prometheus && cd /root/prometheus
cat > node-export.yaml <<"EOF"
apiVersion: apps/v1
kind: DaemonSet        # a DaemonSet guarantees that every node in the cluster runs an identical pod
metadata:
  name: node-exporter
  namespace: monitor-sa
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true      # use the host's PID namespace
      hostIPC: true      # use the host's IPC namespace
      hostNetwork: true  # use the host's network namespace
      containers:
      - name: node-exporter
        image: prom/node-exporter:v1.3.0
        ports:
        - containerPort: 9100
        resources:
          requests:
            cpu: 0.15
        securityContext:   # run the container in privileged mode (as root)
          privileged: true
        args:
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host
        - --collector.filesystem.ignored-mount-points
        - '"^/(sys|proc|dev|host|etc)($|/)"'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      tolerations:   # tolerate the master node's taint so that metrics are also collected from the master
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:       # host directories defined below are mounted into the container
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
EOF
Create the node-exporter DaemonSet:
kubectl apply -f node-export.yaml
kubectl get pod -n monitor-sa
NAME READY STATUS RESTARTS AGE
node-exporter-4f568 1/1 Running 0 17s
node-exporter-l2nw7 1/1 Running 0 17s
node-exporter-m58ps 1/1 Running 0 17s
# Check the listening port
netstat -lntup | grep 9100
tcp6 0 0 :::9100 :::* LISTEN 1407112/node_export
# View the collected metrics
curl http://localhost:9100/metrics
# Show CPU usage of the local host
curl http://localhost:9100/metrics | grep node_cpu_seconds
# Check the load average
curl http://localhost:9100/metrics | grep node_load
Note: node-exporter listens on port 9100 by default; the /metrics endpoint exposes all of the monitoring data collected from the current host.
Troubleshooting: the errors below appeared while using prom/node-exporter:v0.16.0. The host directories were already mounted, yet the diskstats collector kept failing, so the image was simply upgraded to prom/node-exporter:v1.3.0 and Prometheus was restarted, after which the problem went away. To confirm, check the node-exporter container logs and verify that the disk read/write rate panels in the monitoring dashboard show data.
kubectl logs -f -n monitor-sa node-exporter-4f568
time="2023-03-11T04:54:35Z" level=error msg="ERROR: diskstats collector failed after 0.000281s: invalid line for /host/proc/diskstats for sda" source="collector.go:132"
time="2023-03-11T04:54:50Z" level=error msg="ERROR: diskstats collector failed after 0.001121s: invalid line for /host/proc/diskstats for sr0" source="collector.go:132"
time="2023-03-11T04:55:05Z" level=error msg="ERROR: diskstats collector failed after 0.000325s: invalid line for /host/proc/diskstats for sr0" source="collector.go:132"
time="2023-03-11T04:55:20Z" level=error msg="ERROR: diskstats collector failed after 0.000349s: invalid line for /host/proc/diskstats for sr1" source="collector.go:132"
time="2023-03-11T04:55:35Z" level=error msg="ERROR: diskstats collector failed after 0.001031s: invalid line for /host/proc/diskstats for sr1" source="collector.go:132"
2. Deploying the Prometheus server
2.1 Create the ServiceAccount
Contents of the RBAC file:
mkdir -p /root/prometheus && cd /root/prometheus
cat > rbac.yaml <<"EOF"
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor-sa
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor-sa
EOF
Create the RBAC objects:
kubectl apply -f rbac.yaml
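Optionally, verify that the ServiceAccount really received the permissions granted by the ClusterRole. A quick sanity check using kubectl's impersonation support (the account name follows the rbac.yaml above):
# both commands should print "yes" once the ClusterRoleBinding is in place
kubectl auth can-i list nodes --as=system:serviceaccount:monitor-sa:prometheus
kubectl auth can-i list pods --as=system:serviceaccount:monitor-sa:prometheus -n kube-system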
2.2 Create the data directory
Create the directory on whichever node Prometheus will be deployed to:
mkdir -p /data
chmod 777 /data/
2.3 Deploy Prometheus
2.3.1 prometheus.yml configuration
Contents of the ConfigMap:
mkdir -p /root/prometheus && cd /root/prometheus
cat > prometheus-cfg.yaml <<"EOF"
---
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: monitor-sa
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 1m
    scrape_configs:
    - job_name: 'kubernetes-node'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-node-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiserver'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
EOF
Create the ConfigMap:
# kubectl apply -f prometheus-cfg.yaml
configmap/prometheus-config created
# kubectl get configmaps -n monitor-sa
NAME DATA AGE
prometheus-config 1 29s
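The kubernetes-service-endpoints job above only keeps Services that carry the prometheus.io/scrape annotation, and it rewrites the scrape address, path and scheme from the other prometheus.io/* annotations. As an illustration of how a workload opts in, here is a minimal sketch of such a Service; the name, port and path are hypothetical placeholders, not part of this setup:
apiVersion: v1
kind: Service
metadata:
  name: my-app                      # hypothetical example service
  namespace: default
  annotations:
    prometheus.io/scrape: "true"    # matched by the keep rule on ..._prometheus_io_scrape
    prometheus.io/port: "8080"      # rewrites __address__ to <endpoint-ip>:8080
    prometheus.io/path: "/metrics"  # optional, overrides __metrics_path__
spec:
  selector:
    app: my-app
  ports:
  - port: 8080
    targetPort: 8080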
2.3.2 Create the Prometheus Deployment
Image tags: https://hub.docker.com/r/prom/prometheus/tags
Check the node names; here the server is scheduled onto the k8s-node01 node (if you want to run it on a master instead, mind the master's taint).
kubectl get node | awk '{print $1}'
NAME
k8s-master01
k8s-node01
k8s-node02
Contents of the Deployment file:
mkdir -p /root/prometheus && cd /root/prometheus
cat > prometheus-deploy.yaml <<"EOF"
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      component: server
    #matchExpressions:
    #- {key: app, operator: In, values: [prometheus]}
    #- {key: component, operator: In, values: [server]}
  template:
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'
    spec:
      nodeName: k8s-node01   # adjust to a node name in your cluster; the pod is pinned to this node
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.37.6
        imagePullPolicy: IfNotPresent
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        - --web.enable-lifecycle
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: /etc/prometheus/prometheus.yml
          name: prometheus-config
          subPath: prometheus.yml
        - mountPath: /prometheus/
          name: prometheus-storage-volume
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
          items:
          - key: prometheus.yml
            path: prometheus.yml
            mode: 0644
      - name: prometheus-storage-volume
        hostPath:
          path: /data
          type: Directory
EOF
Create the Deployment:
# kubectl apply -f prometheus-deploy.yaml
deployment.apps/prometheus-server created
# kubectl get pod -n monitor-sa
NAME READY STATUS RESTARTS AGE
node-exporter-4f568 1/1 Running 0 5h41m
node-exporter-l2nw7 1/1 Running 0 5h41m
node-exporter-m58ps 1/1 Running 0 5h41m
prometheus-server-858c97464f-b4czn 1/1 Running 0 3m57s
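Because the container is started with --web.enable-lifecycle, the running Prometheus can reload its configuration without a restart. A minimal sketch, assuming you look up the pod IP first (the pod name and IP will differ in your cluster):
# find the pod IP of the Prometheus server
kubectl get pod -n monitor-sa -o wide | grep prometheus-server
# after re-applying the prometheus-config ConfigMap, trigger a hot reload
curl -X POST http://<pod-ip>:9090/-/reload
Note that because prometheus.yml is mounted here via subPath, an updated ConfigMap is generally not propagated into the running container automatically; in that case delete the pod so the Deployment recreates it, instead of relying on the reload endpoint.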
2.3.3 Create the Prometheus Service
Contents of the Service file:
mkdir -p /root/prometheus && cd /root/prometheus
cat > prometheus-svc.yaml <<"EOF"
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    protocol: TCP
  selector:
    app: prometheus
    component: server
EOF
Create the Service (we access Prometheus through the node's NodePort):
# kubectl apply -f prometheus-svc.yaml
service/prometheus created
# kubectl get svc -n monitor-sa
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
prometheus NodePort 10.101.117.42 <none> 9090:30170/TCP 11h
Access URL: http://{any node IP}:30170
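Once the targets on the Status -> Targets page are up, you can try a couple of queries on the Graph page. A small sketch using the node-exporter series shown earlier (PromQL expressions, not shell commands):
# 1-minute load average per node
node_load1
# approximate CPU utilisation (%) per instance over the last 5 minutes
100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])))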
Note: if the page shows no data, start by checking the logs of the prometheus-server container.
level=error ts=2023-03-09T13:09:18.515651655Z caller=main.go:216 component=k8s_client_runtime err="github.com/prometheus/prometheus/discovery/kubernetes/kubernetes.go:269: Failed to list *v1.Service: services is forbidden: User \"system:serviceaccount:monitor-sa:monitor\" cannot list resource \"services\" in API group \"\" at the cluster scope: RBAC: clusterrole.rbac.authorization.k8s.io \"cluster-admin\\u00a0\" not found"
An error like this means the ClusterRole authorization is misconfigured.
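In that case, a quick way to tell whether the binding or the ServiceAccount is at fault is to inspect the binding and impersonate the account (the names follow the rbac.yaml above; adjust if yours differ):
# confirm the binding ties the ClusterRole "prometheus" to the ServiceAccount monitor-sa/prometheus
kubectl describe clusterrolebinding prometheus
# should answer "yes"; a "no" means the ClusterRole or the binding needs fixing
kubectl auth can-i list services --as=system:serviceaccount:monitor-sa:prometheus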
3. Installing and configuring Grafana
3.1 Deploy Grafana
Image: https://hub.docker.com/r/grafana/grafana
mkdir -p /root/prometheus && cd /root/prometheus
cat > Grafana.yaml <<"EOF"
apiVersion: apps/v1
kind: Deployment
metadata:
  name: monitoring-grafana
  namespace: monitor-sa
spec:
  replicas: 1
  selector:
    matchLabels:
      task: monitoring
      k8s-app: grafana
  template:
    metadata:
      labels:
        task: monitoring
        k8s-app: grafana
    spec:
      securityContext:
        fsGroup: 472
        supplementalGroups:
        - 0
      containers:
      - name: grafana
        image: grafana/grafana:9.4.3
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          protocol: TCP
        volumeMounts:
        - mountPath: /var/lib/grafana
          name: grafana-storage
        - mountPath: /etc/ssl/certs
          name: ca-certificates
          readOnly: true
        env:
        - name: INFLUXDB_HOST
          value: monitoring-influxdb
        - name: GF_SERVER_HTTP_PORT
          value: "3000"
        # The following env variables are required to make Grafana accessible via
        # the kubernetes api-server proxy. On production clusters, we recommend
        # removing these env variables, setup auth for grafana, and expose the grafana
        # service using a LoadBalancer or a public IP.
        - name: GF_AUTH_BASIC_ENABLED
          value: "false"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ORG_ROLE
          value: Admin
        - name: GF_SERVER_ROOT_URL
          # If you're only using the API Server proxy, set this value instead:
          # value: /api/v1/namespaces/kube-system/services/monitoring-grafana/proxy
          value: /
      volumes:
      - name: ca-certificates
        hostPath:
          path: /etc/ssl/certs
      - name: grafana-storage
        emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  labels:
    # For use as a Cluster add-on (https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
    # If you are NOT using this as an addon, you should comment out this line.
    kubernetes.io/cluster-service: 'true'
    kubernetes.io/name: monitoring-grafana
  name: monitoring-grafana
  namespace: monitor-sa
spec:
  # In a production setup, we recommend accessing Grafana through an external Loadbalancer
  # or through a public IP.
  # type: LoadBalancer
  # You could also use NodePort to expose the service at a randomly-generated port
  # type: NodePort
  ports:
  - port: 80
    targetPort: 3000
  selector:
    k8s-app: grafana
  type: NodePort
EOF
Create Grafana:
# kubectl apply -f Grafana.yaml
deployment.apps/monitoring-grafana created
service/monitoring-grafana created
# Check that the pod started (the manifest puts it in the monitor-sa namespace)
kubectl get pods -n monitor-sa | grep grafana
monitoring-grafana-5b4bb4c64b-5qwxj 1/1 Running 0 41s
# Check the Service
kubectl get svc -n monitor-sa | grep grafana
monitoring-grafana NodePort 10.101.25.22 <none> 80:31478/TCP 17m
3.2 Log in to Grafana
Open http://{any node IP}:31478 in a browser.
Both the username and the password are admin.
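If the login page does not come up, a quick reachability check against Grafana's built-in health endpoint can help; a sketch, with the node IP and NodePort taken from the kubectl get svc output above:
# should return a small JSON document containing "database": "ok"
curl http://<node-ip>:31478/api/health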
3.3 Add Prometheus as a Grafana data source
In Grafana, open the data sources page and add a Prometheus data source:
- Name: Prometheus
- URL: the address of the Prometheus Service created above, e.g. http://prometheus.monitor-sa.svc:9090
Then click Save & test.
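The same data source can also be created through Grafana's HTTP API instead of the UI. A minimal sketch, assuming the anonymous-admin settings from the Deployment above are still in effect (otherwise add credentials) and using the in-cluster Service address of Prometheus:
curl -X POST http://<node-ip>:31478/api/datasources \
  -H 'Content-Type: application/json' \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.monitor-sa.svc:9090","access":"proxy","isDefault":true}'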
3.4 Import a Node Exporter dashboard
Dashboard search: https://grafana.com/grafana/dashboards/?search=node_exporter&collector=nodeexporter
Template used here: https://github.com/ming-ddtechcg/Prometheus-1/blob/master/Node_Exporter.json