一、K8s集群部署

1.1 Runtime安装

1、卸载已经安装的 docker

# Remove any previously installed Docker/containerd packages.
# Fixed: docker-client / docker-common / docker-latest* are RHEL/CentOS (yum)
# package names that do not exist in Ubuntu's apt repositories; apt-get remove
# aborts on the first unknown package name, so loop over the Debian/Ubuntu
# package set instead and tolerate packages that are simply not installed.
for pkg in docker.io docker-doc docker-compose docker-compose-v2 \
           podman-docker docker-ce docker-ce-cli docker-engine \
           containerd containerd.io runc; do
  apt-get remove -y "$pkg" 2>/dev/null || true  # best-effort cleanup, intentional
done

2、配置源

# Refresh the package index
sudo apt-get update

# Install prerequisites for fetching and verifying the repository key
sudo apt-get install -y ca-certificates curl gnupg lsb-release

# Directory that holds third-party apt signing keys
sudo mkdir -p /etc/apt/keyrings

# Import the Docker CE GPG key (Aliyun mirror) in dearmored binary form
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg

# Register the Docker CE apt repository for this Ubuntu release/architecture
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Refresh again so the new repository becomes visible
sudo apt-get update

3、安装 Containerd

# Install containerd from the Docker CE repository configured above
# (sudo added for consistency with the surrounding non-root steps)
sudo apt-get install containerd.io -y

4、配置 Containerd 的内核

# Load the kernel modules containerd/Kubernetes need, now and on every boot:
# overlay (overlayfs snapshotter) and br_netfilter (bridged traffic visible
# to iptables)
cat <<EOF | sudo tee /etc/modules-load.d/containerd.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter

# Sysctl settings required for Kubernetes networking: route bridged traffic
# through iptables/ip6tables and enable IPv4 forwarding
cat <<EOF | sudo tee /etc/sysctl.d/99-kubernetes-cri.conf
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-ip6tables = 1
EOF

# Apply the sysctl settings immediately
sudo sysctl --system

5、创建 Containerd 的配置文件

# Generate the default containerd configuration
sudo mkdir -p /etc/containerd
containerd config default | sudo tee /etc/containerd/config.toml

# Adjust the configuration:
#  - use the systemd cgroup driver (matches kubelet's cgroup driver)
#  - point the pause (sandbox) image at the Aliyun mirror, covering every
#    registry prefix different containerd versions emit
# Fixed: sudo added to each sed — /etc/containerd/config.toml was written via
# "sudo tee" above and is root-owned, so an unprivileged "sed -i" would fail
# with "Permission denied".
sudo sed -i 's#SystemdCgroup = false#SystemdCgroup = true#g' /etc/containerd/config.toml
sudo sed -i 's#k8s.gcr.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml
sudo sed -i 's#registry.gcr.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml
sudo sed -i 's#registry.k8s.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml

6、启动Containerd

# Reload unit files and (re)start containerd with the new configuration
sudo systemctl daemon-reload
sudo systemctl restart containerd

# Verify containerd is healthy: every plugin should report STATUS "ok".
# Fixed: the original line included a pasted shell prompt
# ("root@VM-0-10-ubuntu:/home/ubuntu#"), which breaks copy/paste.
sudo ctr plugins ls

1.2 Kubernetes 部署

1、关闭swap

# Disable swap for the current boot (kubelet refuses to run with swap enabled)
swapoff -a

# Disable permanently: comment out the swap entry in /etc/fstab so it is not
# remounted after a reboot
vim /etc/fstab 

2、添加 Kubernetes 源

国内:

# Install HTTPS transport support for apt
# (sudo added for consistency with the surrounding non-root steps)
sudo apt-get update && sudo apt-get install -y apt-transport-https

# Download and dearmor the Kubernetes GPG key (Aliyun mirror).
# Fixed: the leading sudo on curl was redundant — only the gpg write into
# /etc/apt/keyrings (created in step 1.1) needs elevated privileges.
curl -fsSL \
https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.31/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg

# Register the Kubernetes apt repository
echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.31/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list

国外:

# Install required dependencies
sudo apt install -y apt-transport-https ca-certificates curl gpg

# Download and dearmor the upstream Kubernetes GPG key
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.31/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg 

# Register the Kubernetes apt repository (tee output suppressed)
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.31/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list > /dev/null

3、安装 Kubernetes 组件

# Install kubelet/kubeadm/kubectl and pin them so routine apt upgrades cannot
# introduce version skew against the control plane.
# (sudo added to the first two commands for consistency with the third)
sudo apt-get update
sudo apt-get install -y kubelet kubeadm kubectl
sudo apt-mark hold kubelet kubeadm kubectl

4、集群初始化

国内环境:

# Pre-pull the control-plane images from the Aliyun mirror
sudo kubeadm config images pull \
--image-repository registry.cn-hangzhou.aliyuncs.com/google_containers --kubernetes-version 1.31.6

# Run init on the master node only; do NOT run it on worker nodes.
# --apiserver-advertise-address must be this master's own IP address.
sudo kubeadm init --apiserver-advertise-address 192.168.0.104 \
--image-repository registry.cn-hangzhou.aliyuncs.com/google_containers \
--cri-socket "unix:///var/run/containerd/containerd.sock" \
--kubernetes-version 1.31.6

国外环境:

# Pre-pull the control-plane images for the pinned Kubernetes version
sudo kubeadm config images pull --kubernetes-version 1.31.6

# Run init on the master node only; do NOT run this on worker nodes.
# --apiserver-advertise-address must be this master's own IP address.
sudo kubeadm init --apiserver-advertise-address 10.224.0.2 --cri-socket "unix:///var/run/containerd/containerd.sock" --kubernetes-version 1.31.6

5、配置 kubeconfig

# Copy the admin kubeconfig into the current user's home so kubectl works
# without sudo. Fixed: expansions quoted so the commands survive a $HOME or
# id output containing spaces.
mkdir -p "$HOME/.kube"
sudo cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"

6、安装 Addons

# Install git
apt-get install git -y

# Fetch the add-on installation manifests
git clone https://gitee.com/dukuan/k8s-ha-install.git

# Install from the branch matching this Kubernetes minor version (1.31.x)
cd k8s-ha-install/
git checkout manual-installation-v1.31.x
cd single/
kubectl create -f .

7、解除污点

这里使用master节点进行测试,所以解除Master节点的污点,让Master节点做node节点。

# Allow regular workloads on the control plane: the trailing '-' removes the
# node-role.kubernetes.io/control-plane taint; --all targets every node
kubectl taint node node-role.kubernetes.io/control-plane- --all

8、查看集群状态

# Show node resource usage (requires the metrics-server add-on to be running)
kubectl top node

# List nodes; STATUS should be Ready
kubectl get node

二、K8s GPU Operator部署

2.1 Helm 安装

官方安装文档:https://helm.sh/docs/intro/install/

Helm 安装包:https://github.com/helm/helm/releases

1、安装Helm

# Download the Helm release tarball
mkdir helm && cd helm
wget https://get.helm.sh/helm-v3.16.2-linux-amd64.tar.gz

# Unpack and move the binary onto PATH
tar xf helm-v3.16.2-linux-amd64.tar.gz
mv linux-amd64/helm /usr/local/bin/

# Verify the installed version
helm version

2、创建Namespace

# Create the target namespace (optional: the helm install in step 2.2 also
# passes --create-namespace)
kubectl create ns gpu-operator

2.2 部署 GPU Operator

1、添加仓库

# Register NVIDIA's Helm repository and refresh the local chart index
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
&& helm repo update

2、下载安装包

# Download the pinned gpu-operator chart version
helm pull nvidia/gpu-operator --version v24.9.2

# Unpack the chart
tar xf gpu-operator-v24.9.2.tgz

# Enter the chart directory
cd gpu-operator/

3、开始安装

# Install the chart from the local directory into the gpu-operator namespace
helm install gpu-operator -n gpu-operator --create-namespace .

说明:国内机器需要编辑 charts/node-feature-discovery/values.yaml,将其中的镜像仓库地址修改为国内可访问的镜像源

4、查看 Pod 状态

# All gpu-operator pods should reach Running/Completed
kubectl get po -n gpu-operator

5、查看 GPU 资源

# Inspect allocatable node resources; an nvidia.com/gpu entry should appear
# once the operator has set up the GPU node
kubectl describe node | grep Allocatable: -A 10

Allocatable:
  cpu:                8
  ephemeral-storage:  95001823485
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             31973804Ki
  pods:               110
System Info:
  Machine ID:                 aa0de61855c940efb7546ad537e45332
  System UUID:                aa0de618-55c9-40ef-b754-6ad537e45332
  Boot ID:                    d3b4740f-86b1-49ec-8f74-5ef6500f1cbf

6、创建 GPU 测试服务

# Write the test manifest
vim test.yaml

# Pod that requests one GPU and runs the CUDA vectorAdd sample once
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vectoradd
spec:
  restartPolicy: OnFailure
  containers:
  - name: cuda-vectoradd
    image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
    resources:
      limits:
        nvidia.com/gpu: 1

# Create the pod
kubectl create -f test.yaml

7、查看日志

# The sample prints "Test PASSED" when the GPU is usable from the pod
kubectl logs cuda-vectoradd

#回显
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done

三、动态存储配置

1、Local Path Storage 部署

# Create the host directory that will back the provisioner's volumes
mkdir -p /data/local-path-provisioner

# Write the Local Path Storage manifest
vim localpath.yaml

# Dedicated namespace for the provisioner
apiVersion: v1
kind: Namespace
metadata:
  name: local-path-storage

---
# ServiceAccount the provisioner Deployment runs as
apiVersion: v1
kind: ServiceAccount
metadata:
  name: local-path-provisioner-service-account
  namespace: local-path-storage

---
# Namespaced permissions: manage helper pods inside local-path-storage
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: local-path-provisioner-role
  namespace: local-path-storage
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]

---
# Cluster-wide permissions: watch nodes/PVCs/StorageClasses, manage PVs,
# emit events
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: local-path-provisioner-role
rules:
  - apiGroups: [""]
    resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "patch"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]

---
# Bind the namespaced Role to the provisioner's service account
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: local-path-provisioner-bind
  namespace: local-path-storage
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: local-path-provisioner-role
subjects:
  - kind: ServiceAccount
    name: local-path-provisioner-service-account
    namespace: local-path-storage

---
# Bind the ClusterRole to the same service account
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: local-path-provisioner-bind
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: local-path-provisioner-role
subjects:
  - kind: ServiceAccount
    name: local-path-provisioner-service-account
    namespace: local-path-storage

---
# The provisioner controller itself
apiVersion: apps/v1
kind: Deployment
metadata:
  name: local-path-provisioner
  namespace: local-path-storage
spec:
  replicas: 1
  selector:
    matchLabels:
      app: local-path-provisioner
  template:
    metadata:
      labels:
        app: local-path-provisioner
    spec:
      serviceAccountName: local-path-provisioner-service-account
      containers:
        - name: local-path-provisioner
          # mirror of rancher/local-path-provisioner for faster pulls in CN
          image: registry.cn-beijing.aliyuncs.com/dotbalo/local-path-provisioner:v0.0.31
          imagePullPolicy: IfNotPresent
          command:
            - local-path-provisioner
            - --debug
            - start
            - --config
            - /etc/config/config.json
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config/
          env:
            # the pod's own namespace, injected via the downward API
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: CONFIG_MOUNT_PATH
              value: /etc/config/
      volumes:
        # mounts the local-path-config ConfigMap at /etc/config/
        - name: config-volume
          configMap:
            name: local-path-config

---
# StorageClass consumed by PVCs: volumes bind lazily (WaitForFirstConsumer)
# and are removed together with the claim (reclaimPolicy: Delete)
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-path
provisioner: rancher.io/local-path
volumeBindingMode: WaitForFirstConsumer
reclaimPolicy: Delete

---
# Provisioner configuration: host-path mapping plus the helper-pod scripts
# that create/remove the per-volume directories on the node
kind: ConfigMap
apiVersion: v1
metadata:
  name: local-path-config
  namespace: local-path-storage
data:
  config.json: |-
    {
            "nodePathMap":[
            {
                    "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
                    "paths":["/data/local-path-provisioner"]
            }
            ]
    }
  setup: |-
    #!/bin/sh
    set -eu
    mkdir -m 0777 -p "$VOL_DIR"
  teardown: |-
    #!/bin/sh
    set -eu
    rm -rf "$VOL_DIR"
  helperPod.yaml: |-
    apiVersion: v1
    kind: Pod
    metadata:
      name: helper-pod
    spec:
      priorityClassName: system-node-critical
      tolerations:
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
          effect: NoSchedule
      containers:
      - name: helper-pod
        image: registry.cn-beijing.aliyuncs.com/dotbalo/busybox
        imagePullPolicy: IfNotPresent

# Apply the manifest
kubectl create -f localpath.yaml

2、结果验证

# The provisioner pod should reach Running
kubectl get po -n local-path-storage

# Confirm the StorageClass was created
kubectl get sc local-path

3、创建 PVC 测试

vim pvc-test.yaml

# PVC that exercises the local-path StorageClass.
# Fixed: the resources/requests/storage keys used inconsistent 3- and 5-space
# indentation; normalized to the 2-space convention used elsewhere in the file.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: local-path-pvc
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-path
  resources:
    requests:
      storage: 128Mi

# Apply the manifest
kubectl create -f pvc-test.yaml

4、创建 Pod

# Write the manifest
vim pod.yaml

# Redis pod that mounts the PVC at /data
apiVersion: v1
kind: Pod
metadata:
  name: volume-test
spec:
  containers:
  - name: volume-test
    image: registry.cn-beijing.aliyuncs.com/dotbalo/redis:v7.0.15
    imagePullPolicy: IfNotPresent
    volumeMounts:
    - name: volv
      mountPath: /data
    ports:
    # fixed: containerPort 80 did not match the redis image, which serves
    # on 6379 (80 looked like a leftover from a web-server example)
    - containerPort: 6379
  volumes:
  - name: volv
    persistentVolumeClaim:
      claimName: local-path-pvc

# Apply the manifest
kubectl create -f pod.yaml

5、验证结果

# The pod should be Running with the PVC mounted
kubectl get po volume-test
NAME          READY   STATUS    RESTARTS   AGE
volume-test   1/1     Running   0          46s

# The PVC should be Bound to a dynamically provisioned PV
kubectl get pvc local-path-pvc
NAME             STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS   VOLUMEATTRIBUTESCLASS   AGE
local-path-pvc   Bound    pvc-36d0c89e-eab7-4960-adff-9494224f50ef   128Mi      RWO            local-path     <unset>                 64s

四、K8s Ollama Operator部署

官网:https://ollama-operator.ayaka.io/

1、部署Ollama Operator

# Install the Ollama Operator manifests (server-side apply)
# NOTE(review): the operator is pinned at v0.10.1 while the kollama client
# downloaded below is v0.10.5 — confirm the version skew is intentional
kubectl apply \
  --server-side=true \
  -f https://raw.githubusercontent.com/nekomeowww/ollama-operator/v0.10.1/dist/install.yaml

2、查看部署状态

# All ollama-operator pods should reach Running
kubectl get po -n ollama-operator-system

3、下载客户端工具并上传到服务器的/home/ubuntu目录

客户端工具下载:https://github.com/nekomeowww/ollama-operator/releases/

cd /home/ubuntu

# Fetch the kollama CLI release tarball
wget https://github.com/nekomeowww/ollama-operator/releases/download/v0.10.5/kollama_v0.10.5_linux_amd64.tar.gz

4、解压客户端工具

# Unpack the CLI binary
tar xf kollama_v0.10.5_linux_amd64.tar.gz

5、测试客户端工具是否正常使用

# Smoke-test the binary: the usage text should print without errors
./kollama --help