Environment initialization
# Rename the host
[root@localhost ~]# hostnamectl set-hostname master1
[root@localhost ~]# su
su
# Configure a static IP
[root@master1 ~]# cd /etc/sysconfig/network-scripts/
[root@master1 network-scripts]# vim ifcfg-ens33
BOOTPROTO="none"
NAME="ens33"
DEVICE="ens33"
ONBOOT="yes"
IPADDR=192.168.136.161
PREFIX=24
GATEWAY=192.168.136.2
DNS1=114.114.114.114
DNS2=101.226.4.6
[root@master1 network-scripts]# service network restart
Restarting network (via systemctl): [ OK ]
# Check the IP address
[root@master1 network-scripts]# ip add
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: ens33: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:0c:29:2f:4d:2c brd ff:ff:ff:ff:ff:ff
inet 192.168.136.161/24 brd 192.168.136.255 scope global noprefixroute ens33
valid_lft forever preferred_lft forever
inet6 fe80::20c:29ff:fe2f:4d2c/64 scope link
valid_lft forever preferred_lft forever
# Check the routing table
[root@master1 network-scripts]# ip route
default via 192.168.136.2 dev ens33 proto static metric 100
192.168.136.0/24 dev ens33 proto kernel scope link src 192.168.136.153 metric 100
# Check the local DNS servers
[root@master1 network-scripts]# cat /etc/resolv.conf
# Generated by NetworkManager
nameserver 114.114.114.114
nameserver 101.226.4.6
Configure the hosts file
vim /etc/hosts
cat /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.136.161 master1
192.168.136.162 master2
192.168.136.164 node1
192.168.136.165 node2
192.168.136.166 node3
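The same hosts entries are needed on every node. A minimal sketch for pushing the file from master1 to the other machines, assuming root SSH access to each host:
for h in master2 node1 node2 node3; do
  scp /etc/hosts root@$h:/etc/hosts   # prompts for the root password unless SSH keys are set up
done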
Update and configure the yum repositories
[root@master1 network-scripts]# cd /etc/yum.repos.d/
[root@master1 yum.repos.d]# rm -f *
[root@master1 yum.repos.d]# curl -O http://mirrors.aliyun.com/repo/Centos-7.repo
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 2523 100 2523 0 0 30244 0 --:--:-- --:--:-- --:--:-- 30768
[root@master1 yum.repos.d]# ls
Centos-7.repo
[root@master1 yum.repos.d]# yum makecache fast
Loaded plugins: fastestmirror
Determining fastest mirrors
* base: mirrors.aliyun.com
* extras: mirrors.aliyun.com
* updates: mirrors.aliyun.com
Disable firewalld and SELinux
[root@master1 yum.repos.d]# systemctl stop firewalld
[root@master1 yum.repos.d]# systemctl disable firewalld
Removed symlink /etc/systemd/system/multi-user.target.wants/firewalld.service.
Removed symlink /etc/systemd/system/dbus-org.fedoraproject.FirewallD1.service.
# Temporarily disable SELinux
[root@master1 yum.repos.d]# setenforce 0
# Permanently disable it
[root@master1 yum.repos.d]# sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
[root@master1 yum.repos.d]# reboot
[root@master1 ~]# getenforce
Disabled
Disable swap
# Temporarily disable
[root@master1 ~]# swapoff -a
# Permanently disable: comment out the swap entry in /etc/fstab
[root@master1 ~]# vim /etc/fstab
#/dev/mapper/centos-swap swap swap defaults 0 0
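Instead of editing /etc/fstab by hand, the swap entry can be commented out with sed; a small sketch, assuming the swap mount is the only fstab line containing the word "swap":
swapoff -a
sed -ri 's/.*swap.*/#&/' /etc/fstab
# verify: the Swap row should show 0 and the fstab line should be commented
free -m
grep swap /etc/fstab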
Adjust kernel parameters
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
# Load the bridge module first so the bridge-nf sysctls exist, then apply every sysctl config file (sysctl -p only reads /etc/sysctl.conf, not /etc/sysctl.d/k8s.conf)
modprobe br_netfilter
sysctl --system
lsmod | grep br_netfilter
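modprobe only loads br_netfilter for the current boot. To have it loaded again automatically after the later reboots, one option (not part of the original steps) is a modules-load.d entry:
cat <<EOF > /etc/modules-load.d/br_netfilter.conf
br_netfilter
EOF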
Configure IPVS support
# Install ipset and ipvsadm (the package is named ipvsadm, not ipvsadmin)
yum install -y ipset ipvsadm
# Write the modules that need to be loaded into a script
cat <<EOF > /etc/sysconfig/modules/ipvs.modules
#!/bin/bash
modprobe -- ip_vs
modprobe -- ip_vs_rr
modprobe -- ip_vs_wrr
modprobe -- ip_vs_sh
modprobe -- nf_conntrack_ipv4
EOF
# Make the script executable
chmod +x /etc/sysconfig/modules/ipvs.modules
# Run the script
/bin/bash /etc/sysconfig/modules/ipvs.modules
# Check whether the modules loaded successfully
lsmod | grep -e ip_vs -e nf_conntrack_ipv4
# Reboot
reboot
Configure time synchronization
# Sync with a network time server
ntpdate ntp.cloud.aliyuncs.com
# Add a cron job
crontab -e
0 */1 * * * /usr/sbin/ntpdate ntp.cloud.aliyuncs.com
# Restart the crond service
service crond restart
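ntpdate is deprecated on CentOS 7; chrony, the distribution default, can be used instead if preferred. A sketch, assuming outbound NTP access:
yum install -y chrony
systemctl enable --now chronyd
chronyc sources -v   # at least one source should be reachable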
Configure the Docker environment
Download the Docker repository file
cd /etc/yum.repos.d/
curl -O https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
ls
Centos-7.repo docker-ce.repo
Install Docker
yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
# Check the Docker version
[root@master1 yum.repos.d]# docker -v
Docker version 26.1.4, build 5650f9b
# Start Docker
[root@master1 yum.repos.d]# systemctl start docker
# Enable Docker on boot
[root@master1 yum.repos.d]# systemctl enable docker
Created symlink from /etc/systemd/system/multi-user.target.wants/docker.service to /usr/lib/systemd/system/docker.service.
# Check Docker status
[root@master1 yum.repos.d]# systemctl status docker
Configure a Docker registry mirror
[root@master1 yum.repos.d]# vim /etc/docker/daemon.json
{
  "registry-mirrors": ["https://hub.docker-alhk.dkdun.com/"],
  "exec-opts": ["native.cgroupdriver=systemd"]
}
# exec-opts sets the cgroup driver to systemd; note that daemon.json is strict JSON and must not contain inline comments
# Reload the Docker configuration and restart the Docker service
[root@master1 yum.repos.d]# systemctl daemon-reload
[root@master1 yum.repos.d]# systemctl restart docker
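It is worth confirming that Docker actually picked up the systemd cgroup driver and the registry mirror:
docker info | grep -i 'cgroup driver'      # expect: Cgroup Driver: systemd
docker info | grep -iA1 'registry mirrors'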
Configure cri-docker
Kubernetes removed dockershim in 1.24 and uses containerd as the default runtime, so cri-dockerd is needed as a shim layer between the kubelet and Docker.
mkdir /cri-docker
cd /cri-docker/
# Download
wget https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.8/cri-dockerd-0.3.8-3.el7.x86_64.rpm
# Install
rpm -ivh cri-dockerd-0.3.8-3.el7.x86_64.rpm
# Reload systemd units
systemctl daemon-reload
# Edit the unit file
vim /usr/lib/systemd/system/cri-docker.service
# Change the ExecStart line (line 10) to:
ExecStart=/usr/bin/cri-dockerd --pod-infra-container-image=registry.aliyuncs.com/google_containers/pause:3.9 --container-runtime-endpoint fd://
Enable the cri-docker service on boot
# Reload systemd units
systemctl daemon-reload
# Start cri-dockerd
systemctl start cri-docker.socket cri-docker
# Enable cri-dockerd on boot
systemctl enable cri-docker.socket cri-docker
# Check the status of the Docker components
systemctl status docker cri-docker.socket cri-docker
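A quick way to confirm that cri-dockerd is running and listening on the socket kubeadm will use later:
systemctl is-active cri-docker
ls -l /var/run/cri-dockerd.sock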
Set up the Kubernetes cluster
Install kubectl
# Download
curl -LO "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
# Verify the checksum (optional)
curl -LO "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
# Install
install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
# Test
kubectl version --client
#Client Version: v1.28.2
#Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
Configure the Kubernetes package repository
cat <<EOF | tee /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=0
repo_gpgcheck=0
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF
yum makecache
Install kubeadm, kubelet and kubectl
# Install
yum install -y kubeadm-1.28.2-0 kubelet-1.28.2-0 kubectl-1.28.2-0 --disableexcludes=kubernetes
# If the pinned versions cannot be found, try again without specifying a version
yum install -y kubeadm kubelet kubectl --disableexcludes=kubernetes
# Enable kubelet on boot
systemctl enable --now kubelet
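To confirm the installed versions, a quick check (kubelet will keep restarting until kubeadm init has run; that is expected):
kubeadm version -o short
kubelet --version
kubectl version --client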
Initialize the cluster
Run on the master node:
kubeadm init --kubernetes-version=v1.28.2 \
--pod-network-cidr=10.224.0.0/16 \
--apiserver-advertise-address=192.168.136.161 \
--image-repository=registry.aliyuncs.com/google_containers \
--cri-socket=unix:///var/run/cri-dockerd.sock
# 192.168.136.161 is master1's IP
kubeadm init fails its preflight checks if swap is still enabled or the kubelet service is not enabled.
Rerunning the command after fixing that can still fail: ports 6443, 10259, 10257, 10250, 2379 and 2380 were already in use, some manifest yaml files already existed, and the etcd data directory was not empty.
ps aux |grep 6443
kill -9 3058
......
Delete the existing manifest files
rm /etc/kubernetes/manifests/kube-apiserver.yaml
rm /etc/kubernetes/manifests/kube-controller-manager.yaml
rm /etc/kubernetes/manifests/kube-scheduler.yaml
rm /etc/kubernetes/manifests/etcd.yaml
Clear the etcd data directory
rm -rf /var/lib/etcd/*
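Instead of killing processes and deleting manifests and etcd data by hand, a failed init can usually be rolled back in one step with kubeadm reset; a sketch (the --cri-socket flag is needed here for the same reason as with init):
kubeadm reset -f --cri-socket unix:///var/run/cri-dockerd.sock
rm -rf /etc/cni/net.d $HOME/.kube/config   # leftover CNI config and kubeconfig, if any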
On success, output like the following is printed:
[addons] Applied essential addon: CoreDNS
[addons] Applied essential addon: kube-proxy
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
Alternatively, if you are the root user, you can run:
export KUBECONFIG=/etc/kubernetes/admin.conf
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
https://kubernetes.io/docs/concepts/cluster-administration/addons/
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e
Note down the kubeadm join command printed above and append --cri-socket unix:///var/run/cri-dockerd.sock to it.
The full command should look like: kubeadm join 192.168.136.161:6443 --token xxx --discovery-token-ca-cert-hash sha256:xxx --cri-socket unix:///var/run/cri-dockerd.sock
mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config
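With the kubeconfig in place, kubectl should now reach the API server:
kubectl cluster-info
kubectl get nodes   # master1 shows NotReady until a network plugin is installed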
Look up the token and certificate key
[root@master1 .kube]# kubeadm init phase upload-certs --upload-certs
I0816 16:16:42.129855 8453 version.go:256] remote version is much newer: v1.31.0; falling back to: stable-1.28
[upload-certs] Storing the certificates in Secret "kubeadm-certs" in the "kube-system" Namespace
[upload-certs] Using certificate key:
b5ecc6f24110ac0e31a1026a74fe95ad4e2fdab1c6dba919b7f651c9d7dd265f
[root@master1 .kube]# kubeadm token create --print-join-command
kubeadm join 192.168.136.161:6443 --token btyzsw.3o1zswwdm0v904pr --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e
Join the worker nodes to the cluster
# The command obtained above
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
--cri-socket unix:///var/run/cri-dockerd.sock
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...
This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.
Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
Join another master node to the cluster
# Reference: https://kubernetes.io/zh-cn/docs/reference/setup-tools/kubeadm/kubeadm-join/
# Add the --control-plane flag to the worker-node join command
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
--control-plane \
--cri-socket unix:///var/run/cri-dockerd.sock
This fails with the error below: the cluster has no stable controlPlaneEndpoint address configured. That address is normally set during cluster initialization via the --control-plane-endpoint flag of kubeadm init.
[root@master2 ~]# kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
> --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
> --control-plane \
> --cri-socket unix:///var/run/cri-dockerd.sock
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
error execution phase preflight:
One or more conditions for hosting a new control plane instance is not satisfied.
unable to add a new control plane instance to a cluster that doesn't have a stable controlPlaneEndpoint address
Please ensure that:
* The cluster has a stable controlPlaneEndpoint address.
* The certificates that must be shared among control plane instances are provided.
To see the stack trace of this error execute with --v=5 or higher
Solution
[root@master1 ~]# kubectl get cm kubeadm-config -n kube-system
NAME DATA AGE
kubeadm-config 1 33m
[root@master1 ~]# kubectl describe cm kubeadm-config -n kube-system
Name: kubeadm-config
Namespace: kube-system
Labels: <none>
Annotations: <none>
Data
====
ClusterConfiguration:
----
apiServer:
  extraArgs:
    authorization-mode: Node,RBAC
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns: {}
etcd:
  local:
    dataDir: /var/lib/etcd
imageRepository: registry.aliyuncs.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: v1.28.2
networking:
  dnsDomain: cluster.local
  podSubnet: 10.224.0.0/16
  serviceSubnet: 10.96.0.0/12
scheduler: {}
BinaryData
====
Events: <none>
[root@master1 ~]# kubectl edit cm kubeadm-config -n kube-system
configmap/kubeadm-config edited
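The edit that resolves the error is adding a controlPlaneEndpoint field to the ClusterConfiguration data of that ConfigMap. A minimal sketch of the added line, using master1's address as the endpoint (for real HA a load balancer or VIP in front of the API servers would be the safer choice):
# inside kubectl edit cm kubeadm-config -n kube-system, under ClusterConfiguration
clusterName: kubernetes
controlPlaneEndpoint: 192.168.136.161:6443   # added line
controllerManager: {}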
Run the join command on master2
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e --control-plane --certificate-key b5ecc6f24110ac0e31a1026a74fe95ad4e2fdab1c6dba919b7f651c9d7dd265f --cri-socket unix:///var/run/cri-dockerd.sock
After it completes, check the nodes
[root@master1 ~]# kubectl get node
NAME STATUS ROLES AGE VERSION
master1 NotReady control-plane 119m v1.28.2
master2 NotReady control-plane 7m49s v1.28.2
node1 NotReady <none> 107m v1.28.2
node2 NotReady <none> 107m v1.28.2
node3 NotReady <none> 107m v1.28.2
Assign the worker role
# Run on the master
kubectl label node node1 node-role.kubernetes.io/worker=worker
kubectl label node node2 node-role.kubernetes.io/worker=worker
kubectl label node node3 node-role.kubernetes.io/worker=worker
Install the Calico network plugin
# Run on the master
wget https://docs.projectcalico.org/manifests/calico.yaml
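Before applying the manifest, it may be worth checking the CALICO_IPV4POOL_CIDR setting: with kubeadm, Calico can normally detect the pod CIDR on its own, but if the variable is uncommented in calico.yaml it should match the --pod-network-cidr used at init (10.224.0.0/16). A quick check:
grep -nA1 CALICO_IPV4POOL_CIDR calico.yaml
# if setting it explicitly:
#   - name: CALICO_IPV4POOL_CIDR
#     value: "10.224.0.0/16"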
kubectl apply -f calico.yaml
# Verify
kubectl get nodes
NAME STATUS ROLES AGE VERSION
master1 Ready control-plane 126m v1.28.2
master2 Ready control-plane 14m v1.28.2
node1 Ready worker 114m v1.28.2
node2 Ready worker 114m v1.28.2
node3 Ready worker 114m v1.28.2
# Check the pods
[root@master1 ~]# kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-658d97c59c-prwkv 1/1 Running 0 5m25s
kube-system calico-node-2kdfk 1/1 Running 0 5m25s
kube-system calico-node-47hcn 1/1 Running 0 5m25s
kube-system calico-node-4pc5c 1/1 Running 0 5m25s
kube-system calico-node-nsqfv 1/1 Running 0 5m25s
kube-system calico-node-vltbx 1/1 Running 0 5m25s
kube-system coredns-66f779496c-k2hf8 1/1 Running 0 127m
kube-system coredns-66f779496c-sr9rc 1/1 Running 0 127m
kube-system etcd-master1 1/1 Running 1 (129m ago) 127m
kube-system etcd-master2 1/1 Running 0 15m
kube-system kube-apiserver-master1 1/1 Running 1 (130m ago) 127m
kube-system kube-apiserver-master2 1/1 Running 0 15m
kube-system kube-controller-manager-master1 1/1 Running 2 (15m ago) 127m
kube-system kube-controller-manager-master2 1/1 Running 0 15m
kube-system kube-proxy-7w9qw 1/1 Running 0 15m
kube-system kube-proxy-8bb5g 1/1 Running 0 115m
kube-system kube-proxy-b8r8z 1/1 Running 0 115m
kube-system kube-proxy-cbhx4 1/1 Running 0 127m
kube-system kube-proxy-dg65j 1/1 Running 0 115m
kube-system kube-scheduler-master1 1/1 Running 2 (15m ago) 127m
kube-system kube-scheduler-master2 1/1 Running 0 15m
Install the Dashboard
Run all of the following commands on the master node only.
Download
wget https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
vim recommended.yaml
Modify the Service section to type NodePort so the dashboard is exposed externally.
The range of valid NodePort values is 30000-32767.
# around lines 39-46 of recommended.yaml
spec:
  type: NodePort
  ports:
    - port: 443
      targetPort: 8443
      nodePort: 30081
  selector:
    k8s-app: kubernetes-dashboard
Apply
kubectl apply -f recommended.yaml
Check
[root@master1 ~]# kubectl get pods,svc -n kubernetes-dashboard
NAME READY STATUS RESTARTS AGE
pod/dashboard-metrics-scraper-5657497c4c-zlfz6 1/1 Running 0 88s
pod/kubernetes-dashboard-78f87ddfc-9zrss 1/1 Running 0 88s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 88s
service/kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 12s
Create an account
Create a dashboard-access-token.yaml file:
# Creating a Service Account
apiVersion: v1
kind: ServiceAccount
metadata:
  name: admin-user
  namespace: kubernetes-dashboard
---
# Creating a ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: admin-user
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: admin-user
  namespace: kubernetes-dashboard
---
# Getting a long-lived Bearer Token for ServiceAccount
apiVersion: v1
kind: Secret
metadata:
  name: admin-user
  namespace: kubernetes-dashboard
  annotations:
    kubernetes.io/service-account.name: "admin-user"
type: kubernetes.io/service-account-token
# Clean up and next steps
# kubectl -n kubernetes-dashboard delete serviceaccount admin-user
# kubectl -n kubernetes-dashboard delete clusterrolebinding admin-user
Apply it
kubectl apply -f dashboard-access-token.yaml
# Get the token
kubectl get secret admin-user -n kubernetes-dashboard -o jsonpath={".data.token"} | base64 -d
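On Kubernetes 1.24 and later, a short-lived token can also be issued on demand instead of reading the Secret; a sketch:
kubectl -n kubernetes-dashboard create token admin-user --duration=12h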
Access the dashboard
[root@master1 ~]# kubectl get secret -n kubernetes-dashboard
NAME TYPE DATA AGE
admin-user kubernetes.io/service-account-token 3 49s
kubernetes-dashboard-certs Opaque 0 4m58s
kubernetes-dashboard-csrf Opaque 1 4m58s
kubernetes-dashboard-key-holder Opaque 2 4m58s
[root@master1 ~]# kubectl get svc -n kubernetes-dashboard
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 5m20s
kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 4m4s
In a browser, open the cluster IP and NodePort (https://192.168.136.161:30081/); note that it must be https.
Enter the token obtained in the previous step to log in.
Fix the token expiring after the default 15 minutes
[root@k8s-master-1 ~]# vim recommended.yaml
# around lines 193-203 of recommended.yaml
containers:
  - name: kubernetes-dashboard
    image: kubernetesui/dashboard:v2.7.0
    imagePullPolicy: Always
    ports:
      - containerPort: 8443
        protocol: TCP
    args:
      - --auto-generate-certificates
      - --namespace=kubernetes-dashboard
      - --token-ttl=43200   # add this line; raises the timeout to 12 hours
Re-apply
[root@k8s-master-1 ~]# kubectl apply -f recommended.yaml
Install kubectl command auto-completion
yum install bash-completion -y
# Enable completion for the current shell
source <(kubectl completion bash)
# Enable completion permanently
echo "source <(kubectl completion bash)" >> ~/.bashrc && bash
Deploy metrics-server
Download
Run on the master
wget https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.6.2/components.yaml
Edit
vim components.yaml
Around line 140, the original reads:
containers:
  - args:
    ...
    image: k8s.gcr.io/metrics-server/metrics-server:v0.6.2
Change it to:
containers:
  - args:
    ...
    - --kubelet-insecure-tls               # add this line
    image: admin4j/metrics-server:v0.6.2   # switch the image to a reachable mirror
Apply
kubectl apply -f components.yaml
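The metrics API takes a minute or so to become available; a quick check before running kubectl top (the k8s-app=metrics-server label is the one used in the upstream manifest):
kubectl -n kube-system get pods -l k8s-app=metrics-server
kubectl get apiservices v1beta1.metrics.k8s.io   # the AVAILABLE column should show True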
Check
[root@master1 ~]# kubectl top nodes
NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
master1 180m 9% 1143Mi 66%
master2 147m 7% 1050Mi 61%
node1 78m 3% 866Mi 50%
node2 79m 3% 894Mi 52%
node3 81m 4% 914Mi 53%
Known issue: both master1 and master2 must be running, otherwise errors occur (with only two stacked etcd members, losing either one costs etcd its quorum).
# Error on master1
[root@master1 ~]# kubectl get nodes
Error from server: etcdserver: request timed out
# Error on master2
[root@master2 ~]# kubectl get pod -o wide -A
Unable to connect to the server: dial tcp 192.168.136.161:6443: i/o timeout
Monitoring Kubernetes with Prometheus
Monitoring stack
cAdvisor + node-exporter + Prometheus + Grafana
- cAdvisor: container metrics collection
- node-exporter: node (host) metrics collection
- Prometheus: processing and storage
- Grafana: visualization
Monitoring flow
- Container monitoring: Prometheus scrapes container metrics from cAdvisor; cAdvisor is built into the kubelet, so nothing extra needs to be deployed. Prometheus stores the data and Grafana displays it.
- Node monitoring: node_exporter on each node collects host resource metrics; Prometheus stores them and Grafana displays them.
- Master monitoring: the kube-state-metrics plugin pulls apiserver-related data from Kubernetes and exposes it over HTTP; Prometheus stores it and Grafana displays it.
# Upload the yaml files
[root@master1 prometheus-k8s]# ls
configmap.yaml csdn—prometheus监控k8s.txt grafana-deploy.yaml grafana-ing.yaml grafana-svc.yaml node-exporter.yaml prometheus.deploy.yml prometheus.svc.yml rbac-setup.yaml
# In prometheus.deploy.yml, change the image line to: - image: prom/prometheus
# Images are used by node-exporter.yaml, prometheus.deploy.yml and grafana-deploy.yaml
# Pull the images on the nodes ahead of time
docker pull prom/node-exporter
docker pull prom/prometheus
docker pull grafana/grafana:6.1.4
Deploy node-exporter as a DaemonSet
[root@master1 prometheus-k8s]# kubectl apply -f node-exporter.yaml
daemonset.apps/node-exporter created
service/node-exporter created
[root@master1 prometheus-k8s]# kubectl get pods -A -o wide |grep exporter
kube-system node-exporter-5kfl9 1/1 Running 0 60s 10.224.135.7 node3 <none> <none>
kube-system node-exporter-ckg9f 1/1 Running 0 60s 10.224.104.3 node2 <none> <none>
kube-system node-exporter-s2frk 1/1 Running 0 60s 10.224.166.131 node1 <none> <none>
[root@master1 prometheus-k8s]# kubectl get daemonset -A
NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
kube-system calico-node 5 5 5 5 5 kubernetes.io/os=linux 18h
kube-system kube-proxy 5 5 5 5 5 kubernetes.io/os=linux 20h
kube-system node-exporter 3 3 3 3 3 <none> 2m41s
[root@master1 prometheus-k8s]# kubectl get svc -A
NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
default kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 20h
kube-system kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 20h
kube-system metrics-server ClusterIP 10.103.190.182 <none> 443/TCP 17h
kube-system node-exporter NodePort 10.99.110.5 <none> 9100:31672/TCP 2m48s
kubernetes-dashboard dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 18h
kubernetes-dashboard kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 18h
Deploy Prometheus
[root@master1 prometheus-k8s]# kubectl apply -f rbac-setup.yaml
clusterrole.rbac.authorization.k8s.io/prometheus created
serviceaccount/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
[root@master1 prometheus-k8s]# kubectl apply -f configmap.yaml
configmap/prometheus-config created
[root@master1 prometheus-k8s]# kubectl apply -f prometheus.deploy.yml
deployment.apps/prometheus created
[root@master1 prometheus-k8s]# kubectl apply -f prometheus.svc.yml
service/prometheus created
Deploy Grafana
[root@master1 prometheus-k8s]# kubectl apply -f grafana-deploy.yaml
deployment.apps/grafana-core created
[root@master1 prometheus-k8s]# kubectl apply -f grafana-svc.yaml
service/grafana created
[root@master1 prometheus-k8s]# kubectl apply -f grafana-ing.yaml
[root@master1 prometheus-k8s]# cd ..
[root@master1 k8s-prometheus-grafana-master]# kubectl apply -f grafana-ing.yaml
ingress.networking.k8s.io/grafana created
Verification
Check pod/svc information
[root@master1 prometheus-k8s]# kubectl get pods -A -o wide
[root@master1 prometheus-k8s]# kubectl get svc -A
Check Prometheus
Open http://192.168.136.165:30003, which is the Prometheus UI. Click Status > Targets;
you should see that it has successfully connected to the Kubernetes apiserver.
Check node-exporter
Open http://192.168.136.161:31672/metrics to see the raw metrics collected by node-exporter.
Check Grafana
Open http://192.168.136.165:30950, which is the Grafana UI; the default username and password are both admin.
Create a dashboard
Add a data source: choose Prometheus and point it at the Prometheus address above (e.g. http://192.168.136.165:30003), then import or build a Kubernetes dashboard.