Skip to content

Commit 3c44bba

Browse files
committed
gpu: mount by-path directory
oneCCL requires the /dev/dri/by-path folder to be available to create a mapping between GPUs. Signed-off-by: Tuomas Katila <[email protected]>
1 parent 85b6795 commit 3c44bba

File tree

4 files changed

+223
-7
lines changed

4 files changed

+223
-7
lines changed

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,12 @@ import (
3535
)
3636

3737
const (
38-
sysfsDrmDirectory = "/sys/class/drm"
39-
devfsDriDirectory = "/dev/dri"
40-
gpuDeviceRE = `^card[0-9]+$`
41-
controlDeviceRE = `^controlD[0-9]+$`
42-
vendorString = "0x8086"
38+
sysfsDrmDirectory = "/sys/class/drm"
39+
devfsDriDirectory = "/dev/dri"
40+
devfsBypathDirectory = "/dev/dri/by-path"
41+
gpuDeviceRE = `^card[0-9]+$`
42+
controlDeviceRE = `^controlD[0-9]+$`
43+
vendorString = "0x8086"
4344

4445
// Device plugin settings.
4546
namespace = "gpu.intel.com"
@@ -338,15 +339,18 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
338339
}
339340

340341
if len(nodes) > 0 {
341-
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, nodes, nil, nil, nil)
342+
ueventPath := path.Join(dp.sysfsDir, f.Name(), "device/drm/uevent")
343+
byPathMounts := pluginutils.BypathMountsFromUevent(ueventPath, devfsBypathDirectory)
344+
345+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, nodes, byPathMounts, nil, nil)
342346

343347
for i := 0; i < dp.options.sharedDevNum; i++ {
344348
devID := fmt.Sprintf("%s-%d", f.Name(), i)
345349
// Currently only one device type (i915) is supported.
346350
// TODO: check model ID to differentiate device models.
347351
devTree.AddDevice(deviceType, devID, deviceInfo)
348352

349-
rmDevInfos[devID] = rm.NewDeviceInfo(nodes, nil, nil)
353+
rmDevInfos[devID] = rm.NewDeviceInfo(nodes, byPathMounts, nil)
350354
}
351355
}
352356
}

cmd/gpu_plugin/gpu_plugin_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,16 @@ func TestScan(t *testing.T) {
193193
devfsdirs: []string{"card0"},
194194
expectedDevs: 1,
195195
},
196+
{
197+
name: "one device with by-path links",
198+
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
199+
sysfsfiles: map[string][]byte{
200+
"card0/device/vendor": []byte("0x8086"),
201+
"card0/device/drm/uevent": []byte("PCI_SLOT_NAME=00:11.22.3"),
202+
},
203+
devfsdirs: []string{"card0", "by-path/pci-00:11.22.3-card"},
204+
expectedDevs: 1,
205+
},
196206
{
197207
name: "sriov-1-pf-no-vfs + monitoring",
198208
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},

cmd/internal/pluginutils/bypath.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright 2022 Intel Corporation. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package pluginutils
16+
17+
import (
18+
"os"
19+
"path"
20+
"strings"
21+
22+
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
23+
)
24+
25+
// pciSlotPrefix is the key, including the "=", that starts the uevent line
// carrying a device's PCI slot name.
const pciSlotPrefix = "PCI_SLOT_NAME="
28+
29+
// Returns a slice of by-path Mounts for a pci device that is identified by the
30+
// uevent file contents. by-path files are searched from the given bypathDir.
31+
// In the by-path dir, any files that start with "pci-<pci bus>" will be added to mounts.
32+
func BypathMountsFromUevent(ueventPath, bypathDir string) []pluginapi.Mount {
33+
var mounts []pluginapi.Mount
34+
35+
pciSlot := readPciSlotFromUevent(ueventPath)
36+
37+
if pciSlot == "" {
38+
return nil
39+
}
40+
41+
bypathFiles, err := os.ReadDir(bypathDir)
42+
if err != nil {
43+
return nil
44+
}
45+
46+
linkPrefix := "pci-" + pciSlot
47+
48+
for _, f := range bypathFiles {
49+
if strings.HasPrefix(f.Name(), linkPrefix) {
50+
absPath := path.Join(bypathDir, f.Name())
51+
mounts = append(mounts, pluginapi.Mount{
52+
ContainerPath: absPath,
53+
HostPath: absPath,
54+
ReadOnly: true,
55+
})
56+
}
57+
}
58+
59+
return mounts
60+
}
61+
62+
func readPciSlotFromUevent(ueventPath string) string {
63+
data, err := os.ReadFile(ueventPath)
64+
if err != nil {
65+
return ""
66+
}
67+
68+
dataLines := strings.Split(string(data), "\n")
69+
70+
for _, line := range dataLines {
71+
if !strings.HasPrefix(line, pciSlotPrefix) {
72+
continue
73+
}
74+
75+
slotValue := line[len(pciSlotPrefix):]
76+
77+
return slotValue
78+
}
79+
80+
return ""
81+
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Copyright 2022 Intel Corporation. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package pluginutils
16+
17+
import (
18+
"os"
19+
"path"
20+
"testing"
21+
22+
"k8s.io/utils/strings/slices"
23+
)
24+
25+
// createTestFiles builds the fixture tree for a by-path test under root and
// returns the paths of the "device" and "by-path" directories.
//
// The "device" directory is always created; a "uevent" file holding
// ueventData is written into it only when ueventData is non-empty. The
// "by-path" directory, with one file per name in bypathFiles, is created only
// when bypathFiles is non-empty, so the returned by-path path may not exist.
// Any filesystem error fails the test immediately.
func createTestFiles(t *testing.T, root, ueventData string, bypathFiles []string) (string, string) {
	// Attribute failures to the calling test instead of this helper.
	t.Helper()

	devPath := path.Join(root, "device")
	byPath := path.Join(root, "by-path")

	if err := os.Mkdir(devPath, os.ModePerm); err != nil {
		t.Fatalf("Mkdir failed: %s: %v", devPath, err)
	}

	if len(ueventData) > 0 {
		ueventFile := path.Join(devPath, "uevent")
		if err := os.WriteFile(ueventFile, []byte(ueventData), os.ModePerm); err != nil {
			t.Fatalf("WriteFile failed: %s: %v", ueventFile, err)
		}
	}

	if len(bypathFiles) == 0 {
		return devPath, byPath
	}

	if err := os.Mkdir(byPath, os.ModePerm); err != nil {
		t.Fatalf("Mkdir failed: %s: %v", byPath, err)
	}

	for _, f := range bypathFiles {
		linkFile := path.Join(byPath, f)
		if err := os.WriteFile(linkFile, []byte{1}, os.ModePerm); err != nil {
			t.Fatalf("WriteFile failed: %s: %v", linkFile, err)
		}
	}

	return devPath, byPath
}
54+
55+
func TestBypath(t *testing.T) {
56+
type testData struct {
57+
ueventData string
58+
bypathFiles []string
59+
mountCount int
60+
}
61+
62+
tds := []testData{
63+
{
64+
"PCI_DEVICE=foobar\nPCI_SLOT_NAME=0-1-2-3-3342\n",
65+
[]string{"pci-0-1-2-3-3342-card", "pci-0-1-2-3-3342-render"},
66+
2,
67+
},
68+
{
69+
"PCI_DEVICE=foobar\nPCI_SLOT_NAME=0-1-2-3-3342\n",
70+
[]string{"pci-0-1-2-3-4444-card", "pci-0-1-2-3-4444-render"},
71+
0,
72+
},
73+
{
74+
"PCI_DEVICE=foobar\n",
75+
[]string{"pci-0-1-2-3-4444-card", "pci-0-1-2-3-4444-render"},
76+
0,
77+
},
78+
{
79+
"PCI_DEVICE=foobar\nPCI_SLOT_NAME=0-1-2-3-3342\n",
80+
[]string{},
81+
0,
82+
},
83+
{
84+
"",
85+
[]string{"pci-0-1-2-3-3342-card", "pci-0-1-2-3-3342-render"},
86+
0,
87+
},
88+
}
89+
90+
for _, td := range tds {
91+
root, err := os.MkdirTemp("", "test_by_path_mounting")
92+
if err != nil {
93+
t.Fatalf("can't create temporary directory: %+v", err)
94+
}
95+
// dirs/files need to be removed for the next test
96+
defer os.RemoveAll(root)
97+
98+
devPath, byPath := createTestFiles(t, root, td.ueventData, td.bypathFiles)
99+
100+
mounts := BypathMountsFromUevent(path.Join(devPath, "uevent"), byPath)
101+
102+
if len(mounts) != td.mountCount {
103+
t.Errorf("Wrong number of mounts %d vs. %d", len(mounts), td.mountCount)
104+
}
105+
106+
absPaths := []string{}
107+
for _, link := range td.bypathFiles {
108+
absPaths = append(absPaths, path.Join(byPath, link))
109+
}
110+
111+
for _, mount := range mounts {
112+
if !slices.Contains(absPaths, mount.ContainerPath) {
113+
t.Errorf("containerpath is incorrect: %s", mount.ContainerPath)
114+
}
115+
116+
if !slices.Contains(absPaths, mount.HostPath) {
117+
t.Errorf("hostpath is incorrect: %s", mount.HostPath)
118+
}
119+
}
120+
}
121+
}

0 commit comments

Comments
 (0)