1. Affinity and Anti-Affinity¶
Some benefits of using affinity and anti-affinity:
- The affinity/anti-affinity language is far more expressive. nodeSelector can only select nodes that carry all of the specified labels exactly; see the sketch below for a requirement it cannot express.
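For contrast, a minimal sketch (using a hypothetical disktype node label) of a requirement that nodeSelector cannot express but node affinity can: match nodes whose label value is either of two values.

```yaml
# Sketch only: the disktype label and its values are hypothetical.
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:
        - key: disktype
          operator: In      # nodeSelector can only express exact equality; In accepts a list
          values:
          - ssd
          - nvme
```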
Affinity rule types:
- requiredDuringSchedulingIgnoredDuringExecution: the scheduler places the Pod only when the rule is satisfied. This works like nodeSelector, but with a more expressive syntax.
- preferredDuringSchedulingIgnoredDuringExecution: the scheduler tries to find a node that satisfies the rule. If no matching node is found, the Pod is still scheduled onto some other node.
NodeAffinity (node affinity):
- If the Pod's rule is a "soft requirement" (preferred), the scheduler still places the Pod even when no node matches.
- If the Pod's rule is a "hard requirement" (required) and no node satisfies it, the Pod is not scheduled and stays in Pending.
- Precedence: hard requirements take priority over soft requirements.
PodAffinity (inter-pod affinity) / PodAntiAffinity (inter-pod anti-affinity):
- Pod affinity answers which Pods this Pod may be co-located with in the same topology domain;
- Pod anti-affinity answers which Pods this Pod must not share a topology domain with. The domain itself is defined by topologyKey, as sketched right after this list.
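A short sketch of how topologyKey defines the "topology domain": with kubernetes.io/hostname the domain is a single node, while with the standard topology.kubernetes.io/zone label (assuming your nodes carry it) the domain widens to an availability zone.

```yaml
# Sketch: keep Pods carrying the hypothetical label app=web out of each
# other's zone, rather than merely off each other's node.
affinity:
  podAntiAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
    - labelSelector:
        matchLabels:
          app: web
      topologyKey: topology.kubernetes.io/zone   # domain = zone instead of single node
```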
The three scheduling policies compare as follows:

| Scheduling policy | Matches labels on | Operators | Topology domain support | Scheduling target |
|---|---|---|---|---|
| nodeAffinity | Nodes | In, NotIn, Exists, DoesNotExist, Gt, Lt | No | Run on the specified nodes |
| podAffinity | Pods | In, NotIn, Exists, DoesNotExist | Yes | Same topology domain as the specified Pods |
| podAntiAffinity | Pods | In, NotIn, Exists, DoesNotExist | Yes | Different topology domain from the specified Pods |
The operators Kubernetes provides are:
- In: the label's value is in the given list
- NotIn: the label's value is not in the given list
- Gt: the label's value is greater than the given value (node affinity only; see the sketch after this list)
- Lt: the label's value is less than the given value (node affinity only)
- Exists: the label exists
- DoesNotExist: the label does not exist
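Gt and Lt apply to node affinity only and compare the label value as an integer. A small sketch, assuming a hypothetical numeric node label cpu-count:

```yaml
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:
        - key: cpu-count    # hypothetical label, e.g. kubectl label nodes node01 cpu-count=16
          operator: Gt
          values:
          - "8"             # values are strings but are compared as integers for Gt/Lt
```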
Hands-on verification
NodeAffinity:
# Add a label to node01
[root@master01 ~]# kubectl label nodes node01 apptype=core
# Write the manifest
[root@master01 ~]# vim nodeaffinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/hostname
                operator: In
                values:
                - node02
          preferredDuringSchedulingIgnoredDuringExecution: # soft preference
          - weight: 1
            preference:
              matchExpressions:
              - key: apptype
                operator: In
                values:
                - core
# Create the Deployment
[root@master01 ~]# k apply -f nodeaffinify-demo.yaml
# The Pod is scheduled to node02, showing the hard requirement outranks the soft preference
[root@master01 ~]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-77fd58df9b-9grxt 1/1 Running 0 11s 172.29.55.11 node02 <none> <none>
# Clean up
[root@master01 ~]# k delete -f nodeaffinify-demo.yaml
[root@master01 ~]# kubectl label nodes node01 apptype-
podAffinity:
# Add a label to node01
[root@master01 ~]# kubectl label nodes node01 apptype=core
# Write a manifest used to set up the test environment
[root@master01 ~]# vim busybox.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: busybox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: busybox
  template:
    metadata:
      labels:
        app: busybox
    spec:
      nodeSelector:
        apptype: core
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/abroad_images/busybox
        name: busybox
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - sleep 3000
# Deploy
[root@master01 ~]# k apply -f busybox.yaml
# Verify: the Pod is placed on node01
[root@master01 ~]# k get po -owide | grep busybox
busybox-74b8bf9d79-tbmcr 1/1 Running 0 56s 172.29.55.12 node01 <none> <none>
---------------------------------------------
# Write the manifest
[root@master01 ~]# vim pod-affinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  revisionHistoryLimit: 15 # how many old revisions of this Deployment to keep
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        podAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - busybox
            topologyKey: kubernetes.io/hostname
# Deploy
[root@master01 ~]# k apply -f pod-affinify-demo.yaml
# Verify: the Pod is co-located with the Pod carrying the app=busybox label
[root@master01 ~]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-7f5ccbd5c7-t2tgd 1/1 Running 0 25s 172.29.55.13 node01 <none> <none>
busybox-74b8bf9d79-tbmcr 1/1 Running 0 4m 172.29.55.12 node01 <none> <none>
# Clean up
[root@master01 ~]# k delete -f pod-affinify-demo.yaml
podAntiAffinity:
# Add a label to node01
[root@master01 ~]# kubectl label nodes node01 apptype=core
# Write a manifest for the test
[root@master01 ~]# vim busybox.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: busybox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: busybox
  template:
    metadata:
      labels:
        app: busybox
    spec:
      nodeSelector:
        apptype: core
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/abroad_images/busybox
        name: busybox
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - sleep 3000
# Deploy
[root@master01 ~]# k apply -f busybox.yaml
-------------------------------------------------
# Write the manifest
[root@master01 ~]# vim pod-anti-affinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  revisionHistoryLimit: 15 # how many old revisions of this Deployment to keep
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - busybox
            topologyKey: kubernetes.io/hostname
# Deploy
[root@master01 ~]# k apply -f pod-anti-affinify-demo.yaml
# Verify: the Pod is NOT co-located with the Pod carrying the app=busybox label
[root@master01 ~]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-5f585b979f-prn8c 1/1 Running 0 3s 172.21.231.139 node02 <none> <none>
busybox-74b8bf9d79-tbmcr 1/1 Running 0 11m 172.29.55.12 node01 <none> <none>
# Clean up
[root@master01 ~]# k delete -f pod-anti-affinify-demo.yaml
Extended experiments:
Test 1: what happens when a "hard" node affinity cannot be satisfied?
Conclusion: if the hard node affinity is not satisfied, the Pod stays in Pending.
# Write the manifest
[root@master01 ~]# vim 111nodeaffinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/hostname
                operator: In
                values:
                - aaa
# Create the Deployment
[root@master01 ~]# k apply -f 111nodeaffinify-demo.yaml
# Check: the Pod is Pending because no node has the label kubernetes.io/hostname=aaa
[root@master01 1]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-5b887764c9-bsrw7 0/1 Pending 0 5s <none> <none> <none> <none>
# Inspect further
[root@master01 1]# k describe po affinity-5b887764c9-bsrw7
...
...
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 73s default-scheduler 0/5 nodes are available: 5 node(s) didn't match Pod's node affinity/selector. preemption: 0/5 nodes are available: 5 Preemption is not helpful for scheduling..
# Clean up
[root@master01 ~]# k delete -f 111nodeaffinify-demo.yaml
Test 2: what happens when a "soft" node affinity cannot be satisfied?
Conclusion: if the soft node affinity is not satisfied, the Pod is still scheduled onto another node.
# Label node01 with apptype=core
[root@master01 1]# kubectl label nodes node01 apptype=core
[root@master01 1]# kg node --show-labels | grep apptype
node01 Ready <none> 4d22h v1.26.9 apptype=core,beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,kubernetes.io/arch=amd64,kubernetes.io/hostname=node01,kubernetes.io/os=linux
# Write the manifest
[root@master01 ~]# vim 222nodeaffinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution: # soft preference
          - weight: 1
            preference:
              matchExpressions:
              - key: apptype
                operator: In
                values:
                - aaa
# Create the Deployment
[root@master01 ~]# k apply -f 222nodeaffinify-demo.yaml
# Check: the preference is not matched, but the Pod is still scheduled onto another node
[root@master01 1]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-b69b6f9f7-2pj7s 1/1 Running 0 2m53s 172.21.231.140 node02 <none> <none>
# Clean up
[root@master01 ~]# k delete -f 222nodeaffinify-demo.yaml
[root@master01 1]# kubectl label nodes node01 apptype-
Test 3: what happens when a "hard" pod affinity cannot be satisfied?
Conclusion: if the hard pod affinity is not satisfied, the Pod stays in Pending.
# Label node01 with apptype=core
[root@master01 1]# kubectl label nodes node01 apptype=core
# Write a manifest for the test
[root@master01 ~]# vim busybox.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: busybox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: busybox
  template:
    metadata:
      labels:
        app: busybox
    spec:
      nodeSelector:
        apptype: core
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/abroad_images/busybox
        name: busybox
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - sleep 3000
# Deploy
[root@master01 ~]# k apply -f busybox.yaml
# Verify: the Pod is placed on node01
[root@master01 ~]# k get po -owide | grep busybox
busybox-74b8bf9d79-tbmcr 1/1 Running 0 56s 172.29.55.12 node01 <none> <none>
---------------------------------------------
# Write the manifest
[root@master01 ~]# vim 333pod-affinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  revisionHistoryLimit: 15 # how many old revisions of this Deployment to keep
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        podAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - aaa
            topologyKey: kubernetes.io/hostname
# Deploy
[root@master01 ~]# k apply -f 333pod-affinify-demo.yaml
# Verify: the Pod is Pending because no Pod carries the app=aaa label
[root@master01 1]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-745cd5f9bd-cn7pv 0/1 Pending 0 6s <none> <none> <none> <none>
# Inspect further
[root@master01 1]# k describe po affinity-745cd5f9bd-cn7pv
...
...
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 2m17s default-scheduler 0/5 nodes are available: 5 node(s) didn't match pod affinity rules. preemption: 0/5 nodes are available: 5 Preemption is not helpful for scheduling..
# Clean up
[root@master01 ~]# k delete -f 333pod-affinify-demo.yaml
[root@master01 ~]# k delete -f busybox.yaml
Test 4: what happens when a "hard" pod anti-affinity rule matches no Pods?
Conclusion: if the anti-affinity selector matches no existing Pods, the rule excludes nothing and the Pod can be scheduled onto any node.
# Write a manifest for the test
[root@master01 ~]# vim busybox.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: busybox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: busybox
  template:
    metadata:
      labels:
        app: busybox
    spec:
      nodeSelector:
        apptype: core
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/abroad_images/busybox
        name: busybox
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - sleep 3000
# Deploy
[root@master01 ~]# k apply -f busybox.yaml
-------------------------------------------------
# Write the manifest
[root@master01 ~]# vim 444pod-anti-affinify-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: affinity
  name: affinity
spec:
  replicas: 1
  revisionHistoryLimit: 15 # how many old revisions of this Deployment to keep
  selector:
    matchLabels:
      app: affinity
  template:
    metadata:
      labels:
        app: affinity
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
          name: nginxweb
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution: # hard requirement
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - aaa
            topologyKey: kubernetes.io/hostname
# Deploy
[root@master01 ~]# k apply -f 444pod-anti-affinify-demo.yaml
# Verify: no Pod carries the app=aaa label, so the anti-affinity rule excludes nothing and the Pod is scheduled normally
[root@master01 1]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
affinity-6ddcdbfd8d-4r68s 1/1 Running 0 15s 172.21.231.141 node02 <none> <none>
# Clean up
[root@master01 ~]# k delete -f 444pod-anti-affinify-demo.yaml
[root@master01 ~]# k delete -f busybox.yaml
2. Taints and Tolerations¶
Node affinity, whether expressed as a hard or a soft rule, is about steering Pods towards the nodes you want. Taints are the opposite: once a node is tainted, Pods are kept away from it unless they explicitly declare a matching toleration. In other words, as soon as a node carries a taint, ordinary Pods will no longer be scheduled onto it.
Typical scenarios:
- 1. Many companies operate GPU node pools. Ordinary Pods should not land on those nodes, because that would waste GPU capacity, so the GPU nodes are tainted to keep regular Pods off.
- 2. The master nodes should be reserved for Kubernetes system components. A cluster built with kubeadm, for example, adds a taint to the master nodes by default, which is why everyday Pods do not get scheduled onto the masters:
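As a sketch of what that default control-plane taint typically looks like on a kubeadm cluster (the exact key depends on the Kubernetes version, older releases use node-role.kubernetes.io/master, and on a lab cluster the taint may already have been removed; the output line is illustrative):

```bash
[root@master01 ~]# kubectl describe node master01 | grep Taints
Taints:             node-role.kubernetes.io/control-plane:NoSchedule
```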
Each taint has the form: key=value:effect
A taint carries a key and a value as its label (the value may be empty), plus an effect that describes what the taint does. Three effects are currently supported:
- NoSchedule: Kubernetes will not schedule Pods onto a node carrying this taint.
- PreferNoSchedule: Kubernetes will try to avoid scheduling Pods onto a node carrying this taint.
- NoExecute (less commonly used): Kubernetes will not schedule Pods onto the node, and it also evicts Pods already running there.
Note: effect has no default value; NoSchedule is the most commonly used. NoExecute has one extra toleration option, sketched below.
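For NoExecute taints, a tolerating Pod can additionally set the optional tolerationSeconds field to bound how long it keeps running on the node after the taint appears. A minimal sketch using the built-in node.kubernetes.io/unreachable taint:

```yaml
tolerations:
- key: "node.kubernetes.io/unreachable"   # taint added automatically when a node becomes unreachable
  operator: "Exists"
  effect: "NoExecute"
  tolerationSeconds: 300                  # evicted after 5 minutes instead of immediately
```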
# Set a taint
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule
# View taints
[root@master01 1]# k describe node node01 | grep -P "Name:|Taints"
Name: node01
Taints: devops=cn-shanghai.172.25.36.211:NoSchedule
# Remove the taint
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule-
Experiment:
# Taint node01
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule
# Write the manifest
[root@master01 1]# vim taint-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: taint
  name: taint
spec:
  replicas: 5
  selector:
    matchLabels:
      app: taint
  template:
    metadata:
      labels:
        app: taint
    spec:
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - name: http
          containerPort: 80
      tolerations: # add a toleration
      - key: "devops"
        operator: "Exists"
        effect: "NoSchedule"
# Apply
[root@master01 1]# k apply -f taint-demo.yaml
# Verify: thanks to the toleration, Pods can still be scheduled onto node01 even though it is tainted
[root@master01 1]# kg po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
taint-7898ddb489-2zkxn 1/1 Running 0 15s 172.29.55.20 node01 <none> <none>
taint-7898ddb489-628ch 1/1 Running 0 15s 172.21.231.151 node02 <none> <none>
taint-7898ddb489-6vv55 1/1 Running 0 15s 172.31.112.136 master01 <none> <none>
taint-7898ddb489-cfjgm 1/1 Running 0 15s 172.20.59.202 master02 <none> <none>
taint-7898ddb489-kmpqx 1/1 Running 0 15s 172.18.71.8 master03 <none> <none>
# Clean up
[root@master01 1]# k delete -f taint-demo.yaml
Production scenario:
New Pods must no longer be scheduled onto node01, so node01 gets a taint. At the same time, Pods that were already running on node01 may be restarted after the taint is added, and those restarted Pods must come back up on node01, the same node they ran on before.
# Label node01
[root@master ~]# kubectl label nodes node01 apptype=core
# Taint node01
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule
# Add a toleration and pin the Pod to the node with nodeSelector
[root@master01 1]# vim busybox.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: busybox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: busybox
  template:
    metadata:
      labels:
        app: busybox
    spec:
      nodeSelector: # pin to the labelled node
        apptype: core
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/abroad_images/busybox
        name: busybox
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - sleep 3000
      tolerations: # tolerate the taint so the restarted Pod is not left Pending
      - key: "devops"
        operator: "Exists"
        effect: "NoSchedule"
# Apply
[root@master01 1]# k apply -f busybox.yaml
# Verify: the restarted Pod is still on node01
[root@master01 1]# kg po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
busybox-77c477cf49-k82bh 1/1 Running 0 4m4s 172.29.55.19 node01 <none> <none>
# Clean up
[root@master01 1]# k delete -f busybox.yaml
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule-
[root@master ~]# kubectl label nodes node01 apptype-
Notes:
The key, value, and effect in a tolerations entry must match the taint set on the Node. A few additional rules:
- If operator is Exists, the value field can be omitted.
- If operator is Equal, the toleration's key and value must be equal to the taint's key and value (see the sketch after this list).
- If operator is not specified, it defaults to Equal.
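A minimal sketch of the Equal form, written to match the taint used in the experiments above:

```yaml
tolerations:
- key: "devops"
  operator: "Equal"                      # both key and value must match the taint exactly
  value: "cn-shanghai.172.25.36.211"
  effect: "NoSchedule"
```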
There are also two special cases:
- An empty key combined with operator Exists matches every key and value, i.e. the Pod tolerates all taints on all nodes (see the sketch after this list).
- An empty effect matches every effect.
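Combining the two gives a "tolerate everything" entry, the pattern typically used by system Pods that must run on every node. A sketch:

```yaml
tolerations:
- operator: "Exists"   # no key and no effect: matches every taint on every node
```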
3. Running a Pod on a Fixed Node¶
Pod.spec.nodeName assigns the Pod directly to the named Node. This bypasses the scheduler and its policies entirely, and the assignment is mandatory.
Experiment:
# Taint node01, to be used in the test below
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule
# Generate a template
[root@master01 1]# k create deploy myweb --image=registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2 -r=3 --dry-run=client -oyaml > nodeName-demo.yaml
# Edit the manifest
[root@master01 1]# vim nodeName-demo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: myweb
  name: myweb
spec:
  replicas: 3
  selector:
    matchLabels:
      app: myweb
  template:
    metadata:
      labels:
        app: myweb
    spec:
      nodeName: node01 # assign directly to node01
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zq-demo/nginx:1.14.2
        name: nginx
        ports:
        - containerPort: 80
# Apply
[root@master01 1]# k apply -f nodeName-demo.yaml
# Verify: the Pods land on node01 even though it carries a taint
[root@master01 1]# kg po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
myweb-789d57f9bd-28m2k 1/1 Running 0 7s 172.29.55.22 node01 <none> <none>
myweb-789d57f9bd-ddqpn 1/1 Running 0 7s 172.29.55.21 node01 <none> <none>
myweb-789d57f9bd-klzsf 1/1 Running 0 7s 172.29.55.23 node01 <none> <none>
# Clean up
[root@master01 1]# k delete -f nodeName-demo.yaml
[root@master ~]# kubectl taint nodes node01 devops=cn-shanghai.172.25.36.211:NoSchedule-