因为 containerd 版本差异导致配置不生效的问题

问题概述

记录一个因为 containerd 版本差异导致配置不生效的问题。

在 rke2 v1.31.9 上,按如下方式配置默认的 nvidia runtime 可以生效;但同样的配置在 rke2 v1.31.5 上却不生效:

{{ template "base" . }}

# Appended after the rendered base template: select "nvidia" as the default
# CRI runtime.
# NOTE(review): the surrounding text attributes the plugin ID
# "io.containerd.cri.v1.runtime" to containerd 2.x and reports that this
# section had no effect on rke2 v1.31.5 (containerd 1.x).
[plugins."io.containerd.cri.v1.runtime".containerd]
default_runtime_name = "nvidia"

原因在于:这种写法是 containerd 2.x 的配置方式。rke2 从 v1.31.6 开始使用 containerd 2.x,而 v1.31.5 使用的仍是 containerd 1.x。

版本关键区别

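| 特性 | containerd 1.x | containerd 2.x |
| --- | --- | --- |
| CRI 配置路径 | `[plugins.cri]` | `[plugins."io.containerd.grpc.v1.cri"]` |
| 镜像仓库配置位置 | `[plugins.cri.registry]` | `[plugins."io.containerd.grpc.v1.cri".registry]` |
| 证书目录配置 | 不支持 `config_path` 参数 | 支持 `config_path` 参数 |

注:上表对比的是本文两份示例配置所用的写法。`[plugins.cri]` 属于配置格式 version 1,`io.containerd.grpc.v1.cri` 属于 version 2;containerd 2.x 原生的 version 3 格式使用 `[plugins."io.containerd.cri.v1.runtime"]`(即文首示例的写法),version 2 的配置会被 containerd 2.x 自动迁移。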

containerd 1.x

如果使用的是旧版本(containerd 1.x),要把默认 runtime 配置为 nvidia,可以使用下面的配置:

# containerd configuration in the short "plugins.cri" layout, with "nvidia"
# selected as the default CRI runtime. Keys are sorted within each table.

[plugins.opt]
path = "/var/lib/rancher/rke2/agent/containerd"

[plugins.cri]
enable_selinux = false
sandbox_image = "192.168.2.51/rancher/mirrored-pause:3.6"
stream_server_address = "127.0.0.1"
stream_server_port = "10010"

[plugins.cri.containerd]
# Selects the "nvidia" entry defined under runtimes below.
default_runtime_name = "nvidia"
disable_snapshot_annotations = true
snapshotter = "overlayfs"

[plugins.cri.containerd.runtimes.nvidia]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"

[plugins.cri.containerd.runtimes.nvidia.options]
# Same runtime_type as runc; BinaryName points it at the NVIDIA runtime binary.
BinaryName = "/usr/bin/nvidia-container-runtime"
CriuImagePath = ""
CriuPath = ""
CriuWorkPath = ""
IoGid = 0
IoUid = 0
NoNewKeyring = false
NoPivotRoot = false
Root = ""
ShimCgroup = ""
SystemdCgroup = false

[plugins.cri.containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"

如果需要配置私有镜像仓库,参考如下:

使用 HTTP:

# Route pulls for the "192.168.2.51" registry to its plain-HTTP endpoint.
# The empty parent tables (plugins.cri.registry and .mirrors) are created
# implicitly by this header; writing them out adds nothing and becomes a
# duplicate-table error when several registry snippets share one file.
[plugins.cri.registry.mirrors."192.168.2.51"]
endpoint = ["http://192.168.2.51:80"]

跳过证书校验:

# Disable TLS certificate verification for the "harbor.zerchin.top" registry.
# The empty parent tables (plugins.cri.registry and .configs) are created
# implicitly by this header; writing them out adds nothing and becomes a
# duplicate-table error when several registry snippets share one file.
[plugins.cri.registry.configs."harbor.zerchin.top".tls]
insecure_skip_verify = true

使用自签名证书:

# Trust a self-signed CA for the "harbor.zerchin.top" registry.
# The empty parent tables (plugins.cri.registry and .configs) are created
# implicitly by this header; writing them out adds nothing and becomes a
# duplicate-table error when several registry snippets share one file.
[plugins.cri.registry.configs."harbor.zerchin.top".tls]
ca_file = "/opt/cert/ca.crt"

containerd 2.x

作为对比,下面是 containerd 2.x 下的配置:

# containerd configuration in the config-version-2 layout (plugin IDs of the
# form "io.containerd.grpc.v1.cri"), with "nvidia" as the default CRI runtime.
version = 2
root = "/var/lib/rancher/rke2/agent/containerd"
state = "/run/k3s/containerd"

[grpc]
address = "/run/k3s/containerd/containerd.sock"

[plugins."io.containerd.internal.v1.opt"]
path = "/var/lib/rancher/rke2/agent/containerd"

[plugins."io.containerd.grpc.v1.cri"]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
enable_selinux = false
enable_unprivileged_ports = true
enable_unprivileged_icmp = true
device_ownership_from_security_context = false
sandbox_image = "harbor.zerchin.xyz/rancher/mirrored-pause:3.6"

[plugins."io.containerd.grpc.v1.cri".containerd]
# Kept under the version-2 CRI plugin ID, in the same table tree as the
# runtimes it selects. The version-3 ID "io.containerd.cri.v1.runtime" should
# not be mixed into a file that declares version = 2.
default_runtime_name = "nvidia"
snapshotter = "overlayfs"
disable_snapshot_annotations = true

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runhcs-wcow-process]
runtime_type = "io.containerd.runhcs.v1"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
SystemdCgroup = true

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-cdi]
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-cdi.options]
BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime.cdi"
SystemdCgroup = true

[plugins."io.containerd.grpc.v1.cri".registry]
# Directory of per-registry host configuration.
config_path = "/var/lib/rancher/rke2/agent/etc/containerd/certs.d"