前言:
利用k8s CronJob 来实现etcd集群的自动备份,并通过sftp传输到本k8s集群外的服务器上,进行存储。
实验步骤:
基本环境情况:
服务器角色 | IP | 系统 | ETCD版本 |
K8S集群操作服务器 | 192.168.1.136 | Centos7.9 | 3.4.9 |
存储服务器 | 192.168.1.105 | Centos7.9 | - |
创建Dockerfile镜像:
[root@k8s-master1 ~]# mkdir /software/k8s-yaml/etcd-backup/
[root@k8s-master1 ~]# cd /software/k8s-yaml/etcd-backup/
[root@k8s-master1 etcd-backup]# vim Dockerfile
# Base image: minimal Python 3 on Alpine Linux.
FROM python:3-alpine

# Pre-seed an SSH client config so unattended sftp/lftp transfers work:
# host-key checking is disabled and legacy kex/host-key algorithms are
# re-enabled for older OpenSSH servers.
# NOTE(review): StrictHostKeyChecking=no trades security for convenience.
RUN mkdir /root/.ssh \
    && printf 'Host *\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null\n\tKexAlgorithms +diffie-hellman-group1-sha1\n\tPubkeyAcceptedKeyTypes +ssh-rsa\n\tHostkeyAlgorithms +ssh-rsa\n' > /root/.ssh/config

# Install the runtime tools and fetch a prebuilt etcdctl (v3.4.9) binary.
RUN apk add -U --no-cache curl lftp ca-certificates openssh \
    && curl -L https://yunwei-software.oss-cn-zhangjiakou.aliyuncs.com/etcdctl -o /usr/local/bin/etcdctl \
    && chmod +x /usr/local/bin/etcdctl
PS:镜像中的etcdctl版本为3.4.9,如集群的ETCD版本不是3.4.9,可以使用ADD将自己集群中的etcdctl打入镜像中,或调整下面的Dockerfile,从GitHub上拉取对应版本。
GitHub拉取使用的Dockerfile:
# Base image: minimal Python 3 on Alpine Linux.
FROM python:3-alpine

# Use the Aliyun mirror for apk to speed up package installs in-country.
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories

# Set this to the etcd version running in your cluster.
ARG ETCD_VERSION=v3.4.9

# unzip is required below for s3cmd-master.zip; the busybox unzip applet is
# limited, so install the real package explicitly.
RUN apk add -U --no-cache curl lftp ca-certificates openssh unzip

# SSH client config: skip host-key checking and allow legacy algorithms so
# unattended sftp transfers to older servers succeed.
# NOTE(review): StrictHostKeyChecking=no trades security for convenience.
RUN mkdir /root/.ssh \
    && touch /root/.ssh/config \
    && echo -e "Host *\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null\n\tKexAlgorithms +diffie-hellman-group1-sha1\n\tPubkeyAcceptedKeyTypes +ssh-rsa\n\tHostkeyAlgorithms +ssh-rsa" > /root/.ssh/config

# Install s3cmd from a local source archive, then drop both the archive and
# the unpacked sources to keep the layer small.
ADD s3cmd-master.zip /s3cmd-master.zip
RUN unzip /s3cmd-master.zip -d /opt \
    && cd /opt/s3cmd-master \
    && python setup.py install \
    && rm -rf /s3cmd-master.zip /opt/s3cmd-master

# Download the etcdctl matching ETCD_VERSION from the official GitHub
# release. curl -f makes HTTP errors (404 etc.) fail the build immediately
# instead of saving an HTML error page as the "tarball".
RUN curl -fL https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-amd64.tar.gz -o /opt/etcd-${ETCD_VERSION}-linux-amd64.tar.gz \
    && cd /opt && tar xzf etcd-${ETCD_VERSION}-linux-amd64.tar.gz \
    && mv etcd-${ETCD_VERSION}-linux-amd64/etcdctl /usr/local/bin/etcdctl \
    && rm -rf etcd-${ETCD_VERSION}-linux-amd64*
镜像创建并上传至镜像仓库(本地和云上都可,方便其他节点拉取该镜像)
[root@k8s-master1 etcd-backup]# docker build -t lws_etcd_backups:v1 .
[root@k8s-master1 etcd-backup]# docker tag lws_etcd_backups:v1 registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
[root@k8s-master1 etcd-backup]# docker push registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
ConfigMap创建:
[root@k8s-master1 etcd-backup]# vim etcd-backup-cm.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: cron-sftp
  namespace: backups
data:
  # Backup entrypoint: snapshot etcd, upload the snapshot over sftp, then
  # prune remote snapshots down to $BACKUP_COUNTS.
  # Required env vars (set by the CronJob): ENDPOINTS, ETCDCTL_API,
  # BACKUP_COUNTS, CLUSTER_NAME.
  entrypoint.sh: |
    #!/bin/sh
    # sh, not bash: python:3-alpine ships busybox ash only.

    # --- sftp connection settings (adjust to your environment) ---
    sftp_user="ftp01"
    sftp_passwd="Nisec123456"
    sftp_url="sftp://192.168.1.105:22"
    backup_dir=/home/ftp/etcd-backup/$CLUSTER_NAME

    # --- take an etcd snapshot ---
    mkdir -p /snapshot
    chmod +x /usr/local/bin/etcdctl
    file=etcd-snapshot-$(date +%Y%m%d-%H%M%S).db
    etcdctl --endpoints $ENDPOINTS \
      --cert=/opt/etcd/ssl/server.pem \
      --key=/opt/etcd/ssl/server-key.pem \
      --cacert=/opt/etcd/ssl/ca.pem \
      snapshot save /snapshot/$file

    # --- upload the snapshot ---
    lftp -u $sftp_user,$sftp_passwd $sftp_url <<EOF
    mkdir -p $backup_dir
    cd $backup_dir
    lcd /snapshot
    put $file
    by
    EOF

    # --- prune old snapshots, keeping the newest $BACKUP_COUNTS ---
    # Filter on the snapshot filename prefix so "." and ".." (and any
    # unrelated files) are never counted or deleted; the timestamp in the
    # name makes a lexical sort chronological (oldest first).
    snapshots=$(lftp -u $sftp_user,$sftp_passwd $sftp_url -e "ls $backup_dir;by" | awk '{print $NF}' | grep '^etcd-snapshot-' | sort)
    total_num=$(echo "$snapshots" | grep -c '^etcd-snapshot-')
    if [ $total_num -gt $BACKUP_COUNTS ]; then
      expired_num=$(expr $total_num - $BACKUP_COUNTS)
      expired_files=$(echo "$snapshots" | head -n $expired_num)
      for f in $expired_files; do
        to_remove=${backup_dir}/${f}
        echo "start to remove $to_remove"
        lftp -u $sftp_user,$sftp_passwd $sftp_url -e "rm -f $to_remove;by"
      done
    fi

    # --- remove the local snapshot file ---
    rm -f /snapshot/$file
PS:按实际情况修改SFTP段落的配置。
#创建cm类型的cron-sftp
[root@k8s-master1 etcd-backup]# kubectl create ns backups
[root@k8s-master1 etcd-backup]# kubectl apply -f etcd-backup-cm.yaml
[root@k8s-master1 etcd-backup]# kubectl get cm -n backups
NAME DATA AGE
cron-sftp 1 6s
kube-root-ca.crt 1 11s
CronJob创建:
[root@k8s-master1 etcd-backup]# vim etcd-backup-cronjob.yaml
# NOTE(review): batch/v1beta1 CronJob was removed in Kubernetes 1.25;
# switch to batch/v1 when the cluster is upgraded.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: etcd-backup-sftp
  namespace: backups
spec:
  schedule: "*/5 * * * *"
  # Never start a new backup while the previous one is still uploading.
  concurrencyPolicy: Forbid
  # Keep only a few finished pods around instead of accumulating forever.
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: etcd-backup
        spec:
          containers:
          - name: etcd-backup
            image: registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
            imagePullPolicy: IfNotPresent
            workingDir: /
            # python:3-alpine has no bash; run the script with busybox sh.
            command: ["sh", "./entrypoint.sh"]
            env:
            - name: ENDPOINTS
              value: "192.168.1.136:2379"
            - name: ETCDCTL_API
              value: "3"
            - name: BACKUP_COUNTS
              value: "5"
            - name: CLUSTER_NAME
              value: "cluster1"
            volumeMounts:
            # Mount only entrypoint.sh (subPath) so / is not shadowed.
            - mountPath: /entrypoint.sh
              name: configmap-volume
              readOnly: true
              subPath: entrypoint.sh
            # etcd TLS certs from the host (must exist on every schedulable node).
            - mountPath: /opt/etcd/ssl
              name: etcd-certs
              readOnly: true
            # Host timezone, so snapshot filenames use local time.
            - mountPath: /etc/localtime
              name: lt-config
            - mountPath: /etc/timezone
              name: tz-config
          volumes:
          - name: configmap-volume
            configMap:
              # executable bit so the script can also be exec'd directly
              defaultMode: 0777
              name: cron-sftp
          - name: etcd-certs
            hostPath:
              path: /opt/etcd/ssl
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: tz-config
            hostPath:
              path: /etc/timezone
          hostNetwork: true
          restartPolicy: OnFailure
PS:可以通过nodeAffinity将执行etcd备份的CronJob调度到任意etcd节点上运行。示例如下:
# Example fragment (goes under the pod template's spec): hard requirement
# that the backup pod lands only on nodes labeled with the etcd role.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
# matches any node that carries the label key, regardless of value
- key: node-role.kubernetes.io/etcd
operator: Exists
我这边共有4个节点,是将ETCD的SSL证书放到了每个节点中,所以没有设置nodeAffinity。
#把SSL证书放到所有节点中:
[root@k8s-master1 etcd-backup]# scp -rp /opt/etcd/ssl/ 192.168.1.139:/opt/etcd/ssl
运行etcd-backup-cronjob.yaml:
[root@k8s-master1 etcd-backup]# kubectl apply -f etcd-backup-cronjob.yaml
[root@k8s-master1 etcd-backup]# kubectl get cj -n backups
NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
etcd-backup-sftp */5 * * * * False 0 <none> 7s
#5分钟后查询pods创建情况:
[root@k8s-master1 etcd-backup]# kubectl get pods -n backups
NAME READY STATUS RESTARTS AGE
etcd-backup-sftp-1677308100-cw4b8 0/1 Completed 0 1m51s
[root@k8s-master1 etcd-backup]# kubectl logs etcd-backup-sftp-1677308100-cw4b8 -n backups
{"level":"info","ts":1677308105.1600003,"caller":"snapshot/v3_snapshot.go:119","msg":"created temporary db file","path":"/snapshot/etcd-snapshot-20230225-145505.db.part"}
{"level":"info","ts":"2023-02-25T14:55:05.191+0800","caller":"clientv3/maintenance.go:200","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":1677308105.1914499,"caller":"snapshot/v3_snapshot.go:127","msg":"fetching snapshot","endpoint":"192.168.1.136:2379"}
{"level":"info","ts":"2023-02-25T14:55:05.872+0800","caller":"clientv3/maintenance.go:208","msg":"completed snapshot read; closing"}
{"level":"info","ts":1677308106.153034,"caller":"snapshot/v3_snapshot.go:142","msg":"fetched snapshot","endpoint":"192.168.1.136:2379","size":"18 MB","took":0.992465311}
{"level":"info","ts":1677308106.1532946,"caller":"snapshot/v3_snapshot.go:152","msg":"saved","path":"/snapshot/etcd-snapshot-20230225-145505.db"}
Snapshot saved at /snapshot/etcd-snapshot-20230225-145505.db
mkdir: Access failed: Failure (/home/ftp/etcd-backup/cluster1)
start to remove /home/ftp/etcd-backup/cluster1/.
start to remove /home/ftp/etcd-backup/cluster1/..
start to remove /home/ftp/etcd-backup/cluster1/etcd-snapshot-20230225-143011.db
查看etcd备份情况:
因为机房的K8S集群目前没有出现过问题,自己目前也没有时间去测试使用snapshot.db文件恢复,等有时间了再去做实验吧。
恢复以及参考的链接如下: