一、这套脚本的组成方式

1.1 变量定义层

export start_time=$(date -d "1 day ago" "+%Y-%m-%d 00:00:00")
export end_time=$(date -d "0 day ago" "+%Y-%m-%d 00:00:00")
export etcd_host1=<etcd-host-1>
export etcd_host2=<etcd-host-2>
export etcd_host3=<etcd-host-3>
export ansible_hosts=/path/to/hosts

1.2 命令模板层

etcd.cmd.tmpl、service.cmd.tmpl 这类模板文件通过 envsubst 渲染成最终命令文件,使同一套巡检逻辑可以在不同环境复用。

二、批量执行框架的核心逻辑

2.1 healthcheck.sh 的主流程

cd $(dirname $0)
source ../env.sh
envsubst <service.cmd.tmpl >service.cmd
envsubst <etcd.cmd.tmpl >etcd.cmd
. ../run.sh
copy_shdir
run etcd.cmd $DATE
run k8s.cmd $DATE
run os.cmd $DATE
run middleware.cmd $DATE
run service.cmd $DATE

2.2 run.sh 的执行方式

[kubectl]
kubectl get node
kubectl get pod -A

[master]
systemctl status kube-apiserver | grep Active:
ansible -i $ansible_hosts $NODE -m shell -a "$line" | tee -a $LOG

三、巡检内容可以拆成哪些主题

3.1 基础组件巡检

  • etcd.cmd:ETCD 成员状态、健康状态和 member 列表。
  • k8s.cmd:控制面组件、Node、DNS、Calico 等状态检查。
  • os.cmd:CPU、内存、磁盘和 NTP 状态。

3.2 深度巡检

  • middleware.cmd:Redis、MinIO、ZooKeeper、Kafka、RabbitMQ、MySQL 等中间件状态。
  • service.cmd:systemd 服务状态与 journalctl 时间窗口内错误日志。
  • sh/process.sh:补充主机进程统计信息。

四、完整脚本

以下为本文对应的完整脚本,便于直接复制复用。

4.1 env.sh

# NTP server addresses — empty here; presumably consumed by a command
# template or os-level check not visible in this file. TODO confirm.
export NTP_SERVER1=
export NTP_SERVER2=
# Log-inspection window rendered into service.cmd via envsubst:
# from 00:00:00 yesterday through 00:00:00 today.
export start_time=$(date -d "1 day ago"  "+%Y-%m-%d 00:00:00")
export end_time=$(date -d "0 day ago"  "+%Y-%m-%d 00:00:00")
# etcd member addresses substituted into etcd.cmd.tmpl / service.cmd.tmpl.
export etcd_host1=10.232.22.71
export etcd_host2=10.232.22.72
export etcd_host3=10.232.22.73
# Ansible inventory file used by run.sh (run / copy_shdir).
export ansible_hosts=/home/cjs/paas_kz_k8s_v1.23.6_harbor_2.5.4_x86_64_20230412/hosts

4.2 run.sh

#!/bin/bash
# Batch-execution framework: copy_shdir / fmt / run. Meant to be sourced
# by healthcheck.sh (". ../run.sh"), so the shebang is informational —
# but it was previously the typo "#bin/bash" (missing "!"), fixed here.
export ansible_hosts=/home/cjs/paas_kz_k8s_v1.23.6_harbor_2.5.4_x86_64_20230412/hosts
copy_shdir() {
    # Push the local ./sh helper directory to every host in the "node"
    # group, so commands like "sh /home/ecip/sh/process.sh" in the .cmd
    # files can run remotely.
    # Globals: reads $ansible_hosts (inventory path).
    local WORK_DIR            # was a leaked global; now function-local
    WORK_DIR=$(dirname "$0")
    ansible -i "$ansible_hosts" node -m copy -a "src=${WORK_DIR}/sh dest=/home/ecip/"
}

fmt() {
    cat |
        awk ' 
            /^([0-9]{1,3}\.){3}[0-9]{1,3}[[:space:]]*\|/ { $1=sprintf("%-15s",$1);prefix=$0} 
            !/^([0-9]{1,3}\.){3}[0-9]{1,3}[[:space:]]*\|/ { print prefix"\t"$0 }
        '
}

run() {
    # Execute every command in a command file against ansible groups.
    #
    # Arguments:
    #   $1 - command file: "[group]" header lines select the ansible group;
    #        following lines are shell commands run on that group.
    #   $2 - timestamp used in the log file name (defaults to "now").
    #        Bug fix: $2 was previously overwritten unconditionally, so the
    #        caller's shared timestamp was silently ignored.
    # Globals: reads $ansible_hosts; calls fmt (defined in this file).
    # Exits the whole script (exit 1) if a command appears before any group.
    local FILE=$1
    local time=${2:-$(date "+%Y%m%d_%H%M%S")}
    local CMDS1
    CMDS1=$(cat "$FILE") || return 1
    local today
    today=$(date "+%Y%m%d")
    local WORK_DIR
    WORK_DIR=$(dirname "$0")
    mkdir -p "$WORK_DIR/logs/$today"
    local LOG=${WORK_DIR}/logs/$today/${FILE}.${time}.log

    local NODE="" line
    # NOTE: feed the loop with a here-string, NOT "echo | while": a pipe
    # would run the loop in a subshell, so "exit 1" below would only kill
    # the subshell and the main script would keep running after an error.
    while IFS= read -r line
    do
        # Skip blank lines and comment lines (previously, comment lines
        # with text after "#" were shipped to ansible as no-op commands).
        [[ "$line" =~ ^[[:space:]]*(#.*)?$ ]] && continue

        # "[group]" header: remember the ansible group for what follows.
        if [[ "$line" =~ ^[[:space:]]*\[([^]]*)\][[:space:]]*$ ]]; then
            echo -e "\n\n$line"
            NODE=${BASH_REMATCH[1]}
            continue
        fi

        if [[ -z "$NODE" ]]; then
            echo "ansible组名为空"
            exit 1
        fi

        printf '\n\n' | tee -a "$LOG"
        echo "$line" | tee -a "$LOG"
        ansible -i "$ansible_hosts" "$NODE" -m shell -a "$line" |
            fmt |
            tee -a "$LOG"
    done <<< "$CMDS1"
}

4.3 healthcheck.sh

#!/bin/bash
# Daily health-check driver: render command templates, sync helper scripts
# to all hosts, then run each inspection command file via run.sh.
cd "$(dirname "$0")" || exit 1

# Shared settings: log time window, etcd hosts, ansible inventory.
# NOTE: env.sh already exports start_time/end_time, so the duplicate
# exports that used to precede this line (and were immediately
# overwritten) have been removed.
source ../env.sh

# Render templates with the current environment for envsubst.
envsubst <service.cmd.tmpl >service.cmd
envsubst <etcd.cmd.tmpl >etcd.cmd

#clear log dir
#rm -f logs/*
. ../run.sh

# Sync the ./sh helper directory to every host (needed by service.cmd).
copy_shdir

# One timestamp shared by all log files of this run.
DATE=$(date "+%Y%m%d_%H%M%S")

# Basic inspection: etcd, k8s control plane, OS resources.
run etcd.cmd "$DATE"
run k8s.cmd "$DATE"
run os.cmd "$DATE"

# Deep inspection: middleware and systemd service/journal checks.
run middleware.cmd "$DATE"
run service.cmd "$DATE"

4.4 etcd.cmd

[kubectl]
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 endpoint status
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 member list
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 endpoint health

[master]
systemctl status etcd -l

4.5 etcd.cmd.tmpl

[kubectl]
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 endpoint status
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 member list
etcdctl  --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 endpoint health

[master]
systemctl status etcd -l

4.6 k8s.cmd

[master]
systemctl status etcd | grep Active:
systemctl status kube-scheduler | grep Active:
systemctl status kube-apiserver | grep Active:
systemctl status kube-controller-manager | grep Active:
systemctl status kubelet | grep Active:

[kubectl]
kubectl get node
kubectl get cs
kubectl get pod -A|grep calico
kubectl get po -A|grep dns

[node]
systemctl status kubelet | grep Active:
systemctl status kube-proxy | grep Active:
systemctl status docker | grep Active:

[harbor]
cd /apps/harbor_setup/harbor;docker-compose ps
docker ps -a| grep keepalive

4.7 middleware.cmd

[kubectl]
#检查redis
kubectl get po -A | grep redis | grep -vi running
kubectl -n paas-middleware get pods | grep devops-redis-master |  awk '{print $1}' | xargs -i kubectl -n paas-middleware  exec {} -- redis-cli -a \!8nKVyV3tU05dtfb info Replication
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-1 | awk '{print $1}' |xargs -i kubectl -n paas-middleware  exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-2 | awk '{print $1}' |xargs -i kubectl -n paas-middleware  exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-3 | awk '{print $1}' |xargs -i kubectl -n paas-middleware  exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel

#检查minio
#kubectl -n paas-middleware exec minio-cluster-0 -- mc admin info mini0
kubectl get po -A | grep minio | grep -vi running

#检查zookeeper
kubectl get po -A | grep zk | grep -vi running
kubectl -n paas-middleware  get pods | grep zk | awk '{print $1}' | xargs -i kubectl -n paas-middleware  exec {}  -- zkServer.sh status

#检查kafka
#kubectl -n paas-middleware  get pods | grep zk | awk 'NR==1 {print $1}' | xargs -i kubectl -n paas-middleware  exec {}  -- zkCli.sh ls /brokers/ids/1
kubectl get po -A | grep kafka | grep -vi running
kubectl -n paas-middleware  get pods | grep zk | awk '{print $1}' | xargs -i kubectl -n paas-middleware  exec {}  -- zkCli.sh ls /brokers/ids/1

#检查rabbitmq
kubectl -n paas-middleware get pods| grep "mq-" | grep -vi running
kubectl -n paas-middleware get pods | grep "mq-"  | awk '{print $1}'  | xargs -i kubectl -n paas-middleware exec {}  -- /opt/rabbitmq/sbin/rabbitmqctl cluster_status -s

#检查mysql
kubectl -n paas-middleware get pods | grep mysql | grep -vi running
kubectl -n paas-middleware get pods | grep mysql-master| awk '{print $1}' |xargs -i kubectl -n paas-middleware exec {}  -- mysql -uadmin -p59mkkVCrMj7T0lnS -e "show slave status\G;show status like 'Threads%';"
kubectl -n paas-middleware get pods | grep mysql-master |awk '{print $1}' | xargs -i kubectl -n paas-middleware logs  --tail 10 {}
kubectl -n paas-middleware get pods | grep mysql-slave | awk '{print $1}' | xargs -i kubectl -n paas-middleware logs  --tail 10 {}

4.8 os.cmd

[node]
top -bn1 | grep load | awk '{printf "CPU Load --- %.2f\n", $(NF-2)}'
free -g | awk 'NR==2{printf "Memory Usage --- %4s/%4s GB (%.2f%%)\n", $3,$2,$3*100/$2 }'
df -h | egrep '[5-9][0-9]%|100%' |awk '{printf "Disk Usage --- %s  \t Disk --- %s\n", $5,$6}'
timedatectl | grep 'NTP synchronized'

[mysql]
mysql -uroot -p'59mkkVCrMj7T0lnS' -e "show slave status;"

4.9 service.cmd

[master]
systemctl status etcd | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u etcd  | grep Error
ETCDCTL_API=3 etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints="10.232.22.71:1159,10.232.22.72:1159,10.232.22.73:1159" endpoint health

systemctl status kube-scheduler | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u kube-scheduler  | grep Error
systemctl status kube-apiserver | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u kube-apiserver  | grep Error
systemctl status kube-controller-manager | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u kube-controller-manager  | grep Error

[node]
systemctl status kubelet | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u kubelet  | grep Error
systemctl status kube-proxy | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u kube-proxy  | grep Error
systemctl status docker | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00"  -u docker  | grep Error
sh /home/ecip/sh/process.sh

4.10 service.cmd.tmpl

[master]
systemctl status etcd | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u etcd  | grep Error
ETCDCTL_API=3 etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints="${etcd_host1}:1159,${etcd_host2}:1159,${etcd_host3}:1159" endpoint health

systemctl status kube-scheduler | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u kube-scheduler  | grep Error
systemctl status kube-apiserver | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u kube-apiserver  | grep Error
systemctl status kube-controller-manager | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u kube-controller-manager  | grep Error

[node]
systemctl status kubelet | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u kubelet  | grep Error
systemctl status kube-proxy | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u kube-proxy  | grep Error
systemctl status docker | grep Active:
journalctl --since="${start_time}" --until="${end_time}"  -u docker  | grep Error
sh /home/ecip/sh/process.sh

4.11 process.sh

#!/bin/bash
#Author.nhx
# Count processes by state by scanning /proc/<pid>/stat.
# Every numeric directory under /proc is a live PID; field 3 of its stat
# file (the token right after the "(comm)" field) is the state letter.
procs=0        # bug fix: was never initialized before
running=0
sleeping=0
stoped=0
zombie=0

for pid in /proc/[1-9]*
do
    # Process may exit between the glob and the read — just skip it.
    # (Bug fix: previously a missing stat file left $stat holding the
    # PREVIOUS process's state, which was then counted again.)
    [ -f "$pid/stat" ] || continue
    procs=$((procs + 1))
    # Strip everything through the closing ") " of the comm field before
    # taking the first remaining token: comm may contain spaces (e.g.
    # "(tmux: server)"), so a plain "print $3" is not reliable.
    stat=$(awk '{ sub(/.*\) /, ""); print $1 }' "$pid/stat" 2>/dev/null)
    case $stat in
        R) running=$((running + 1)) ;;
        T) stoped=$((stoped + 1)) ;;
        S) sleeping=$((sleeping + 1)) ;;
        Z) zombie=$((zombie + 1)) ;;
    esac
done

echo "进程统计信息如下:进程数量为:$procs, Running 进程数为:$running,Stoped 进程为:$stoped,Sleeping 进程数为:$sleeping,Zombie 进程数为:$zombie"
#echo "进程统计信息如下"
#echo "总进程数量为:$procs"
#echo "Running 进程数为:$running"
#echo "Stoped 进程为:$stoped"
#echo "Sleeping 进程数为:$sleeping"
#echo "Zombie 进程数为:$zombie"