Skip to content

Commit ac8f190

Browse files
authored
Merge commit from fork
Run update-ldcache in isolated namespaces
2 parents 3c1f1a6 + ec29b60 commit ac8f190

File tree

12 files changed

+661
-34
lines changed

12 files changed

+661
-34
lines changed
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
//go:build linux
2+
3+
/**
4+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5+
# SPDX-License-Identifier: Apache-2.0
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
**/
19+
20+
package ldcache
21+
22+
import (
23+
"errors"
24+
"fmt"
25+
"os"
26+
"os/exec"
27+
"path/filepath"
28+
"strconv"
29+
"syscall"
30+
31+
securejoin "github.com/cyphar/filepath-securejoin"
32+
33+
"github.com/moby/sys/reexec"
34+
"github.com/opencontainers/runc/libcontainer/utils"
35+
"golang.org/x/sys/unix"
36+
)
37+
38+
// pivotRoot will call pivot_root such that rootfs becomes the new root
39+
// filesystem, and everything else is cleaned up.
40+
// This is adapted from the implementation here:
41+
//
42+
// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113
43+
//
44+
// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls.
45+
func pivotRoot(rootfs string) error {
46+
// While the documentation may claim otherwise, pivot_root(".", ".") is
47+
// actually valid. What this results in is / being the new root but
48+
// /proc/self/cwd being the old root. Since we can play around with the cwd
49+
// with pivot_root this allows us to pivot without creating directories in
50+
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
51+
52+
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
53+
if err != nil {
54+
return &os.PathError{Op: "open", Path: "/", Err: err}
55+
}
56+
defer unix.Close(oldroot) //nolint: errcheck
57+
58+
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
59+
if err != nil {
60+
return &os.PathError{Op: "open", Path: rootfs, Err: err}
61+
}
62+
defer unix.Close(newroot) //nolint: errcheck
63+
64+
// Change to the new root so that the pivot_root actually acts on it.
65+
if err := unix.Fchdir(newroot); err != nil {
66+
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
67+
}
68+
69+
if err := unix.PivotRoot(".", "."); err != nil {
70+
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
71+
}
72+
73+
// Currently our "." is oldroot (according to the current kernel code).
74+
// However, purely for safety, we will fchdir(oldroot) since there isn't
75+
// really any guarantee from the kernel what /proc/self/cwd will be after a
76+
// pivot_root(2).
77+
78+
if err := unix.Fchdir(oldroot); err != nil {
79+
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
80+
}
81+
82+
// Make oldroot rslave to make sure our unmounts don't propagate to the
83+
// host (and thus bork the machine). We don't use rprivate because this is
84+
// known to cause issues due to races where we still have a reference to a
85+
// mount while a process in the host namespace are trying to operate on
86+
// something they think has no mounts (devicemapper in particular).
87+
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
88+
return err
89+
}
90+
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
91+
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
92+
return err
93+
}
94+
95+
// Switch back to our shiny new root.
96+
if err := unix.Chdir("/"); err != nil {
97+
return &os.PathError{Op: "chdir", Path: "/", Err: err}
98+
}
99+
return nil
100+
}
101+
102+
// mountLdConfig mounts the host ldconfig to the mount namespace of the hook.
103+
// We use WithProcfd to perform the mount operations to ensure that the changes
104+
// are persisted across the pivot root.
105+
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
106+
hostLdconfigInfo, err := os.Stat(hostLdconfigPath)
107+
if err != nil {
108+
return "", fmt.Errorf("error reading host ldconfig: %w", err)
109+
}
110+
111+
hookScratchDirPath := "/var/run/nvidia-ctk-hook"
112+
ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig")
113+
if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil {
114+
return "", fmt.Errorf("error creating hook scratch folder: %w", err)
115+
}
116+
117+
err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error {
118+
return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size()))
119+
120+
})
121+
if err != nil {
122+
return "", fmt.Errorf("error creating tmpfs: %w", err)
123+
}
124+
125+
if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil {
126+
return "", fmt.Errorf("error creating ldconfig: %w", err)
127+
}
128+
129+
err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error {
130+
return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "")
131+
})
132+
if err != nil {
133+
return "", fmt.Errorf("error bind mounting host ldconfig: %w", err)
134+
}
135+
136+
return ldconfigPath, nil
137+
}
138+
139+
func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) {
140+
dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath)
141+
if err != nil {
142+
return "", err
143+
}
144+
// Make the parent directory.
145+
destDir, destBase := filepath.Split(dest)
146+
destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755)
147+
if err != nil {
148+
return "", fmt.Errorf("error creating parent dir: %w", err)
149+
}
150+
defer destDirFd.Close()
151+
// Make the target file. We want to avoid opening any file that is
152+
// already there because it could be a "bad" file like an invalid
153+
// device or hung tty that might cause a DoS, so we use mknodat.
154+
// destBase does not contain any "/" components, and mknodat does
155+
// not follow trailing symlinks, so we can safely just call mknodat
156+
// here.
157+
if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil {
158+
// If we get EEXIST, there was already an inode there and
159+
// we can consider that a success.
160+
if !errors.Is(err, unix.EEXIST) {
161+
return "", fmt.Errorf("error creating empty file: %w", err)
162+
}
163+
}
164+
return dest, nil
165+
}
166+
167+
// mountProc mounts a clean proc filesystem in the new root.
168+
func mountProc(newroot string) error {
169+
target := filepath.Join(newroot, "/proc")
170+
171+
if err := os.MkdirAll(target, 0755); err != nil {
172+
return fmt.Errorf("error creating directory: %w", err)
173+
}
174+
return unix.Mount("proc", target, "proc", 0, "")
175+
}
176+
177+
// createTmpFs creates a tmpfs at the specified location with the specified size.
178+
func createTmpFs(target string, size int) error {
179+
return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size))
180+
}
181+
182+
// createReexecCommand creates a command that can be used to trigger the reexec
183+
// initializer.
184+
// On linux this command runs in new namespaces.
185+
func createReexecCommand(args []string) *exec.Cmd {
186+
cmd := reexec.Command(args...)
187+
cmd.Stdin = os.Stdin
188+
cmd.Stdout = os.Stdout
189+
cmd.Stderr = os.Stderr
190+
191+
cmd.SysProcAttr = &syscall.SysProcAttr{
192+
Cloneflags: syscall.CLONE_NEWNS |
193+
syscall.CLONE_NEWUTS |
194+
syscall.CLONE_NEWIPC |
195+
syscall.CLONE_NEWPID |
196+
syscall.CLONE_NEWNET,
197+
}
198+
199+
return cmd
200+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
//go:build !linux
2+
3+
/**
4+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5+
# SPDX-License-Identifier: Apache-2.0
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
**/
19+
20+
package ldcache
21+
22+
import (
23+
"fmt"
24+
"os"
25+
"os/exec"
26+
27+
"github.com/moby/sys/reexec"
28+
)
29+
30+
// pivotRoot is a stub for non-linux builds: it performs no work and always
// returns a "not supported" error.
func pivotRoot(newroot string) error {
	err := fmt.Errorf("not supported")
	return err
}
33+
34+
// mountLdConfig is a stub for non-linux builds: nothing is mounted and the
// call always fails with an empty path and a "not supported" error.
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
	var mounted string
	return mounted, fmt.Errorf("not supported")
}
37+
38+
// mountProc is a stub for non-linux builds: no proc filesystem is mounted and
// the call always returns a "not supported" error.
func mountProc(newroot string) error {
	err := fmt.Errorf("not supported")
	return err
}
41+
42+
// createReexecCommand creates a command that can be used ot trigger the reexec
43+
// initializer.
44+
func createReexecCommand(args []string) *exec.Cmd {
45+
cmd := reexec.Command(args...)
46+
cmd.Stdin = os.Stdin
47+
cmd.Stdout = os.Stdout
48+
cmd.Stderr = os.Stderr
49+
50+
return cmd
51+
}

cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//go:build linux
2+
13
/**
24
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
35
#
@@ -26,10 +28,9 @@ import (
2628
)
2729

2830
// SafeExec attempts to clone the specified binary (as a memfd, for example) before executing it.
29-
func (m command) SafeExec(path string, args []string, envv []string) error {
31+
func SafeExec(path string, args []string, envv []string) error {
3032
safeExe, err := cloneBinary(path)
3133
if err != nil {
32-
m.logger.Warningf("Failed to clone binary %q: %v; falling back to Exec", path, err)
3334
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
3435
return syscall.Exec(path, args, envv)
3536
}

cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
//go:build !linux
2-
// +build !linux
32

43
/**
54
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
@@ -23,7 +22,7 @@ import "syscall"
2322

2423
// SafeExec is not implemented on non-linux systems and forwards directly to the
2524
// Exec syscall.
26-
func (m *command) SafeExec(path string, args []string, envv []string) error {
25+
func SafeExec(path string, args []string, envv []string) error {
2726
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
2827
return syscall.Exec(path, args, envv)
2928
}

0 commit comments

Comments
 (0)