|
| 1 | +//go:build linux |
| 2 | + |
| 3 | +/** |
| 4 | +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 5 | +# SPDX-License-Identifier: Apache-2.0 |
| 6 | +# |
| 7 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 8 | +# you may not use this file except in compliance with the License. |
| 9 | +# You may obtain a copy of the License at |
| 10 | +# |
| 11 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +# |
| 13 | +# Unless required by applicable law or agreed to in writing, software |
| 14 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 15 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 16 | +# See the License for the specific language governing permissions and |
| 17 | +# limitations under the License. |
| 18 | +**/ |
| 19 | + |
| 20 | +package ldcache |
| 21 | + |
| 22 | +import ( |
| 23 | + "errors" |
| 24 | + "fmt" |
| 25 | + "os" |
| 26 | + "os/exec" |
| 27 | + "path/filepath" |
| 28 | + "strconv" |
| 29 | + "syscall" |
| 30 | + |
| 31 | + securejoin "github.com/cyphar/filepath-securejoin" |
| 32 | + |
| 33 | + "github.com/moby/sys/reexec" |
| 34 | + "github.com/opencontainers/runc/libcontainer/utils" |
| 35 | + "golang.org/x/sys/unix" |
| 36 | +) |
| 37 | + |
| 38 | +// pivotRoot will call pivot_root such that rootfs becomes the new root |
| 39 | +// filesystem, and everything else is cleaned up. |
| 40 | +// This is adapted from the implementation here: |
| 41 | +// |
| 42 | +// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113 |
| 43 | +// |
| 44 | +// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls. |
| 45 | +func pivotRoot(rootfs string) error { |
| 46 | + // While the documentation may claim otherwise, pivot_root(".", ".") is |
| 47 | + // actually valid. What this results in is / being the new root but |
| 48 | + // /proc/self/cwd being the old root. Since we can play around with the cwd |
| 49 | + // with pivot_root this allows us to pivot without creating directories in |
| 50 | + // the rootfs. Shout-outs to the LXC developers for giving us this idea. |
| 51 | + |
| 52 | + oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) |
| 53 | + if err != nil { |
| 54 | + return &os.PathError{Op: "open", Path: "/", Err: err} |
| 55 | + } |
| 56 | + defer unix.Close(oldroot) //nolint: errcheck |
| 57 | + |
| 58 | + newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) |
| 59 | + if err != nil { |
| 60 | + return &os.PathError{Op: "open", Path: rootfs, Err: err} |
| 61 | + } |
| 62 | + defer unix.Close(newroot) //nolint: errcheck |
| 63 | + |
| 64 | + // Change to the new root so that the pivot_root actually acts on it. |
| 65 | + if err := unix.Fchdir(newroot); err != nil { |
| 66 | + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} |
| 67 | + } |
| 68 | + |
| 69 | + if err := unix.PivotRoot(".", "."); err != nil { |
| 70 | + return &os.PathError{Op: "pivot_root", Path: ".", Err: err} |
| 71 | + } |
| 72 | + |
| 73 | + // Currently our "." is oldroot (according to the current kernel code). |
| 74 | + // However, purely for safety, we will fchdir(oldroot) since there isn't |
| 75 | + // really any guarantee from the kernel what /proc/self/cwd will be after a |
| 76 | + // pivot_root(2). |
| 77 | + |
| 78 | + if err := unix.Fchdir(oldroot); err != nil { |
| 79 | + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err} |
| 80 | + } |
| 81 | + |
| 82 | + // Make oldroot rslave to make sure our unmounts don't propagate to the |
| 83 | + // host (and thus bork the machine). We don't use rprivate because this is |
| 84 | + // known to cause issues due to races where we still have a reference to a |
| 85 | + // mount while a process in the host namespace are trying to operate on |
| 86 | + // something they think has no mounts (devicemapper in particular). |
| 87 | + if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { |
| 88 | + return err |
| 89 | + } |
| 90 | + // Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. |
| 91 | + if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { |
| 92 | + return err |
| 93 | + } |
| 94 | + |
| 95 | + // Switch back to our shiny new root. |
| 96 | + if err := unix.Chdir("/"); err != nil { |
| 97 | + return &os.PathError{Op: "chdir", Path: "/", Err: err} |
| 98 | + } |
| 99 | + return nil |
| 100 | +} |
| 101 | + |
| 102 | +// mountLdConfig mounts the host ldconfig to the mount namespace of the hook. |
| 103 | +// We use WithProcfd to perform the mount operations to ensure that the changes |
| 104 | +// are persisted across the pivot root. |
| 105 | +func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) { |
| 106 | + hostLdconfigInfo, err := os.Stat(hostLdconfigPath) |
| 107 | + if err != nil { |
| 108 | + return "", fmt.Errorf("error reading host ldconfig: %w", err) |
| 109 | + } |
| 110 | + |
| 111 | + hookScratchDirPath := "/var/run/nvidia-ctk-hook" |
| 112 | + ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig") |
| 113 | + if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil { |
| 114 | + return "", fmt.Errorf("error creating hook scratch folder: %w", err) |
| 115 | + } |
| 116 | + |
| 117 | + err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error { |
| 118 | + return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size())) |
| 119 | + |
| 120 | + }) |
| 121 | + if err != nil { |
| 122 | + return "", fmt.Errorf("error creating tmpfs: %w", err) |
| 123 | + } |
| 124 | + |
| 125 | + if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil { |
| 126 | + return "", fmt.Errorf("error creating ldconfig: %w", err) |
| 127 | + } |
| 128 | + |
| 129 | + err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error { |
| 130 | + return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "") |
| 131 | + }) |
| 132 | + if err != nil { |
| 133 | + return "", fmt.Errorf("error bind mounting host ldconfig: %w", err) |
| 134 | + } |
| 135 | + |
| 136 | + return ldconfigPath, nil |
| 137 | +} |
| 138 | + |
| 139 | +func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) { |
| 140 | + dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath) |
| 141 | + if err != nil { |
| 142 | + return "", err |
| 143 | + } |
| 144 | + // Make the parent directory. |
| 145 | + destDir, destBase := filepath.Split(dest) |
| 146 | + destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755) |
| 147 | + if err != nil { |
| 148 | + return "", fmt.Errorf("error creating parent dir: %w", err) |
| 149 | + } |
| 150 | + defer destDirFd.Close() |
| 151 | + // Make the target file. We want to avoid opening any file that is |
| 152 | + // already there because it could be a "bad" file like an invalid |
| 153 | + // device or hung tty that might cause a DoS, so we use mknodat. |
| 154 | + // destBase does not contain any "/" components, and mknodat does |
| 155 | + // not follow trailing symlinks, so we can safely just call mknodat |
| 156 | + // here. |
| 157 | + if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil { |
| 158 | + // If we get EEXIST, there was already an inode there and |
| 159 | + // we can consider that a success. |
| 160 | + if !errors.Is(err, unix.EEXIST) { |
| 161 | + return "", fmt.Errorf("error creating empty file: %w", err) |
| 162 | + } |
| 163 | + } |
| 164 | + return dest, nil |
| 165 | +} |
| 166 | + |
| 167 | +// mountProc mounts a clean proc filesystem in the new root. |
| 168 | +func mountProc(newroot string) error { |
| 169 | + target := filepath.Join(newroot, "/proc") |
| 170 | + |
| 171 | + if err := os.MkdirAll(target, 0755); err != nil { |
| 172 | + return fmt.Errorf("error creating directory: %w", err) |
| 173 | + } |
| 174 | + return unix.Mount("proc", target, "proc", 0, "") |
| 175 | +} |
| 176 | + |
| 177 | +// createTmpFs creates a tmpfs at the specified location with the specified size. |
| 178 | +func createTmpFs(target string, size int) error { |
| 179 | + return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size)) |
| 180 | +} |
| 181 | + |
| 182 | +// createReexecCommand creates a command that can be used to trigger the reexec |
| 183 | +// initializer. |
| 184 | +// On linux this command runs in new namespaces. |
| 185 | +func createReexecCommand(args []string) *exec.Cmd { |
| 186 | + cmd := reexec.Command(args...) |
| 187 | + cmd.Stdin = os.Stdin |
| 188 | + cmd.Stdout = os.Stdout |
| 189 | + cmd.Stderr = os.Stderr |
| 190 | + |
| 191 | + cmd.SysProcAttr = &syscall.SysProcAttr{ |
| 192 | + Cloneflags: syscall.CLONE_NEWNS | |
| 193 | + syscall.CLONE_NEWUTS | |
| 194 | + syscall.CLONE_NEWIPC | |
| 195 | + syscall.CLONE_NEWPID | |
| 196 | + syscall.CLONE_NEWNET, |
| 197 | + } |
| 198 | + |
| 199 | + return cmd |
| 200 | +} |
0 commit comments