diff options
Diffstat (limited to 'executor/executor_linux.go')
-rw-r--r-- | executor/executor_linux.go | 926 |
1 file changed, 926 insertions, 0 deletions
diff --git a/executor/executor_linux.go b/executor/executor_linux.go new file mode 100644 index 0000000..4ab8367 --- /dev/null +++ b/executor/executor_linux.go @@ -0,0 +1,926 @@ +//go:build linux + +package executor + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/armon/circbuf" + "github.com/hashicorp/consul-template/signals" + hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/client/lib/resources" + "github.com/hashicorp/nomad/client/stats" + cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/drivers/shared/capabilities" + shelpers "github.com/hashicorp/nomad/helper/stats" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/plugins/drivers" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + lconfigs "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + ldevices "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/specconv" + lutils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +var ( + // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1 + ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"} + + // ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage. 
+ ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"} + + // ExecutorCgroupMeasuredCpuStats is the list of CPU stats captures by the executor + ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"} +) + +// LibcontainerExecutor implements an Executor with the runc/libcontainer api +type LibcontainerExecutor struct { + id string + command *ExecCommand + + logger hclog.Logger + + totalCpuStats *stats.CpuStats + userCpuStats *stats.CpuStats + systemCpuStats *stats.CpuStats + pidCollector *pidCollector + + container libcontainer.Container + userProc *libcontainer.Process + userProcExited chan interface{} + exitState *ProcessState +} + +func NewExecutorWithIsolation(logger hclog.Logger) Executor { + logger = logger.Named("isolated_executor") + if err := shelpers.Init(); err != nil { + logger.Error("unable to initialize stats", "error", err) + } + return &LibcontainerExecutor{ + id: strings.ReplaceAll(uuid.Generate(), "-", "_"), + logger: logger, + totalCpuStats: stats.NewCpuStats(), + userCpuStats: stats.NewCpuStats(), + systemCpuStats: stats.NewCpuStats(), + pidCollector: newPidCollector(logger), + } +} + +// Launch creates a new container in libcontainer and starts a new process with it +func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) { + l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " ")) + + if command.Resources == nil { + command.Resources = &drivers.Resources{ + NomadResources: &structs.AllocatedTaskResources{}, + } + } + + l.command = command + + // create a new factory which will store the container state in the allocDir + factory, err := libcontainer.New( + path.Join(command.TaskDir, "../alloc/container"), + // note that os.Args[0] refers to the executor shim typically + // and first args arguments is ignored now due + // until https://github.com/opencontainers/runc/pull/1888 is merged 
+ libcontainer.InitArgs(os.Args[0], "libcontainer-shim"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create factory: %v", err) + } + + // A container groups processes under the same isolation enforcement + containerCfg, err := newLibcontainerConfig(command) + if err != nil { + return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err) + } + + container, err := factory.Create(l.id, containerCfg) + if err != nil { + return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err) + } + l.container = container + + // Look up the binary path and make it executable + taskPath, hostPath, err := lookupTaskBin(command) + if err != nil { + return nil, err + } + if err := makeExecutable(hostPath); err != nil { + return nil, err + } + + combined := append([]string{taskPath}, command.Args...) + stdout, err := command.Stdout() + if err != nil { + return nil, err + } + stderr, err := command.Stderr() + if err != nil { + return nil, err + } + + l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " ")) + + // the task process will be started by the container + process := &libcontainer.Process{ + Args: combined, + Env: command.Env, + Stdout: stdout, + Stderr: stderr, + Init: true, + } + + if command.User != "" { + process.User = command.User + } + l.userProc = process + + l.totalCpuStats = stats.NewCpuStats() + l.userCpuStats = stats.NewCpuStats() + l.systemCpuStats = stats.NewCpuStats() + + // Starts the task + if err := container.Run(process); err != nil { + container.Destroy() + return nil, err + } + + pid, err := process.Pid() + if err != nil { + container.Destroy() + return nil, err + } + + // start a goroutine to wait on the process to complete, so Wait calls can + // be multiplexed + l.userProcExited = make(chan interface{}) + go l.pidCollector.collectPids(l.userProcExited, l.getAllPids) + go l.wait() + + return &ProcessState{ + Pid: pid, + ExitCode: -1, + Time: time.Now(), + }, nil +} + +func (l 
*LibcontainerExecutor) getAllPids() (resources.PIDs, error) { + pids, err := l.container.Processes() + if err != nil { + return nil, err + } + m := make(resources.PIDs, 1) + for _, pid := range pids { + m[pid] = resources.NewPID(pid) + } + return m, nil +} + +// Wait waits until a process has exited and returns it's exitcode and errors +func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-l.userProcExited: + return l.exitState, nil + } +} + +func (l *LibcontainerExecutor) wait() { + defer close(l.userProcExited) + + ps, err := l.userProc.Wait() + if err != nil { + // If the process has exited before we called wait an error is returned + // the process state is embedded in the error + if exitErr, ok := err.(*exec.ExitError); ok { + ps = exitErr.ProcessState + } else { + l.logger.Error("failed to call wait on user process", "error", err) + l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()} + return + } + } + + l.command.Close() + + exitCode := 1 + var signal int + if status, ok := ps.Sys().(syscall.WaitStatus); ok { + exitCode = status.ExitStatus() + if status.Signaled() { + const exitSignalBase = 128 + signal = int(status.Signal()) + exitCode = exitSignalBase + signal + } + } + + l.exitState = &ProcessState{ + Pid: ps.Pid(), + ExitCode: exitCode, + Signal: signal, + Time: time.Now(), + } +} + +// Shutdown stops all processes started and cleans up any resources +// created (such as mountpoints, devices, etc). 
+func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error { + if l.container == nil { + return nil + } + + status, err := l.container.Status() + if err != nil { + return err + } + + defer l.container.Destroy() + + if status == libcontainer.Stopped { + return nil + } + + if grace > 0 { + if signal == "" { + signal = "SIGINT" + } + + sig, ok := signals.SignalLookup[signal] + if !ok { + return fmt.Errorf("error unknown signal given for shutdown: %s", signal) + } + + // Signal initial container processes only during graceful + // shutdown; hence `false` arg. + err = l.container.Signal(sig, false) + if err != nil { + return err + } + + select { + case <-l.userProcExited: + return nil + case <-time.After(grace): + // Force kill all container processes after grace period, + // hence `true` argument. + if err := l.container.Signal(os.Kill, true); err != nil { + return err + } + } + } else { + err := l.container.Signal(os.Kill, true) + if err != nil { + return err + } + } + + select { + case <-l.userProcExited: + return nil + case <-time.After(time.Second * 15): + return fmt.Errorf("process failed to exit after 15 seconds") + } +} + +// UpdateResources updates the resource isolation with new values to be enforced +func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error { + return nil +} + +// Version returns the api version of the executor +func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) { + return &ExecutorVersion{Version: ExecutorVersionLatest}, nil +} + +// Stats returns the resource statistics for processes managed by the executor +func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) { + ch := make(chan *cstructs.TaskResourceUsage) + go l.handleStats(ch, ctx, interval) + return ch, nil + +} + +func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) { + defer 
close(ch) + timer := time.NewTimer(0) + + measuredMemStats := ExecutorCgroupV1MeasuredMemStats + if cgroups.IsCgroup2UnifiedMode() { + measuredMemStats = ExecutorCgroupV2MeasuredMemStats + } + + for { + select { + case <-ctx.Done(): + return + + case <-timer.C: + timer.Reset(interval) + } + + lstats, err := l.container.Stats() + if err != nil { + l.logger.Warn("error collecting stats", "error", err) + return + } + + pidStats, err := l.pidCollector.pidStats() + if err != nil { + l.logger.Warn("error collecting stats", "error", err) + return + } + + ts := time.Now() + stats := lstats.CgroupStats + + // Memory Related Stats + swap := stats.MemoryStats.SwapUsage + maxUsage := stats.MemoryStats.Usage.MaxUsage + rss := stats.MemoryStats.Stats["rss"] + cache := stats.MemoryStats.Stats["cache"] + mapped_file := stats.MemoryStats.Stats["mapped_file"] + ms := &cstructs.MemoryStats{ + RSS: rss, + Cache: cache, + Swap: swap.Usage, + MappedFile: mapped_file, + Usage: stats.MemoryStats.Usage.Usage, + MaxUsage: maxUsage, + KernelUsage: stats.MemoryStats.KernelUsage.Usage, + KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage, + Measured: measuredMemStats, + } + + // CPU Related Stats + totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage) + userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode) + kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode) + + totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage) + cs := &cstructs.CpuStats{ + SystemMode: l.systemCpuStats.Percent(kernelModeTime), + UserMode: l.userCpuStats.Percent(userModeTime), + Percent: totalPercent, + ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods, + ThrottledTime: stats.CpuStats.ThrottlingData.ThrottledTime, + TotalTicks: l.systemCpuStats.TicksConsumed(totalPercent), + Measured: ExecutorCgroupMeasuredCpuStats, + } + taskResUsage := cstructs.TaskResourceUsage{ + ResourceUsage: &cstructs.ResourceUsage{ + MemoryStats: ms, + CpuStats: cs, + }, + 
Timestamp: ts.UTC().UnixNano(), + Pids: pidStats, + } + + select { + case <-ctx.Done(): + return + case ch <- &taskResUsage: + } + + } +} + +// Signal sends a signal to the process managed by the executor +func (l *LibcontainerExecutor) Signal(s os.Signal) error { + return l.userProc.Signal(s) +} + +// Exec starts an additional process inside the container +func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) { + combined := append([]string{cmd}, args...) + // Capture output + buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize)) + + process := &libcontainer.Process{ + Args: combined, + Env: l.command.Env, + Stdout: buf, + Stderr: buf, + } + + err := l.container.Run(process) + if err != nil { + return nil, 0, err + } + + waitCh := make(chan *waitResult) + defer close(waitCh) + go l.handleExecWait(waitCh, process) + + select { + case result := <-waitCh: + ps := result.ps + if result.err != nil { + if exitErr, ok := result.err.(*exec.ExitError); ok { + ps = exitErr.ProcessState + } else { + return nil, 0, result.err + } + } + var exitCode int + if status, ok := ps.Sys().(syscall.WaitStatus); ok { + exitCode = status.ExitStatus() + } + return buf.Bytes(), exitCode, nil + + case <-time.After(time.Until(deadline)): + process.Signal(os.Kill) + return nil, 0, context.DeadlineExceeded + } + +} + +func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) { + parent, child, err := lutils.NewSockPair("socket") + if err != nil { + return nil, nil, fmt.Errorf("failed to create terminal: %v", err) + } + + return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err + +} + +func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool, + stream drivers.ExecTaskStream) error { + + // the task process will be started by the container + process := &libcontainer.Process{ + Args: cmd, + Env: l.userProc.Env, + User: l.userProc.User, + Init: 
false, + Cwd: "/", + } + + execHelper := &execHelper{ + logger: l.logger, + + newTerminal: l.newTerminalSocket, + setTTY: func(tty *os.File) error { + process.ConsoleSocket = tty + return nil + }, + setIO: func(stdin io.Reader, stdout, stderr io.Writer) error { + process.Stdin = stdin + process.Stdout = stdout + process.Stderr = stderr + return nil + }, + + processStart: func() error { return l.container.Run(process) }, + processWait: func() (*os.ProcessState, error) { + return process.Wait() + }, + } + + return execHelper.run(ctx, tty, stream) + +} + +type waitResult struct { + ps *os.ProcessState + err error +} + +func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) { + ps, err := process.Wait() + ch <- &waitResult{ps, err} +} + +func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) { + switch command.User { + case "root": + // when running as root, use the legacy set of system capabilities, so + // that we do not break existing nomad clusters using this "feature" + legacyCaps := capabilities.LegacySupported().Slice(true) + cfg.Capabilities = &lconfigs.Capabilities{ + Bounding: legacyCaps, + Permitted: legacyCaps, + Effective: legacyCaps, + Ambient: nil, + Inheritable: nil, + } + default: + // otherwise apply the plugin + task capability configuration + cfg.Capabilities = &lconfigs.Capabilities{ + Bounding: command.Capabilities, + } + } +} + +func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces { + namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}} + if pidMode == IsolationModePrivate { + namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID}) + } + if ipcMode == IsolationModePrivate { + namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC}) + } + return namespaces +} + +// configureIsolation prepares the isolation primitives of the container. 
+// The process runs in a container configured with the following: +// +// * the task directory as the chroot +// * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host +// * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker +// * some special filesystems: `/proc`, `/sys`. Some case is given to avoid exec escaping or setting malicious values through them. +func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { + defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + + // set the new root directory for the container + cfg.Rootfs = command.TaskDir + + // disable pivot_root if set in the driver's configuration + cfg.NoPivotRoot = command.NoPivotRoot + + // set up default namespaces as configured + cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC) + + if command.NetworkIsolation != nil { + cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{ + Type: lconfigs.NEWNET, + Path: command.NetworkIsolation.Path, + }) + } + + // paths to mask using a bind mount to /dev/null to prevent reading + cfg.MaskPaths = []string{ + "/proc/kcore", + "/sys/firmware", + } + + // paths that should be remounted as readonly inside the container + cfg.ReadonlyPaths = []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + } + + cfg.Devices = specconv.AllowedDevices + if len(command.Devices) > 0 { + devs, err := cmdDevices(command.Devices) + if err != nil { + return err + } + cfg.Devices = append(cfg.Devices, devs...) 
+ } + + cfg.Mounts = []*lconfigs.Mount{ + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | syscall.MS_RDONLY, + }, + } + + if len(command.Mounts) > 0 { + cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...) + } + + return nil +} + +func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { + // If resources are not limited then manually create cgroups needed + if !command.ResourceLimits { + return cgutil.ConfigureBasicCgroups(cfg) + } + + // set cgroups path + if cgutil.UseV2 { + // in v2, the cgroup must have been created by the client already, + // which breaks a lot of existing tests that run drivers without a client + if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" { + return errors.New("cgroup path must be set") + } + parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath) + cfg.Cgroups.Path = filepath.Join("/", parent, cgroup) + } else { + // in v1, the cgroup is created using /nomad, which is a bug because it + // does not respect the cgroup_parent client configuration + // (but makes testing easy) + id := uuid.Generate() + cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id) + } + + if command.Resources == nil || 
command.Resources.NomadResources == nil { + return nil + } + + // Total amount of memory allowed to consume + res := command.Resources.NomadResources + memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB + if memHard <= 0 { + memHard = res.Memory.MemoryMB + memSoft = 0 + } + + if memHard > 0 { + cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024 + cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024 + + // Disable swap to avoid issues on the machine + var memSwappiness uint64 + cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness + } + + cpuShares := res.Cpu.CpuShares + if cpuShares < 2 { + return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares) + } + + // Set the relative CPU shares for this cgroup, and convert for cgroupv2 + cfg.Cgroups.Resources.CpuShares = uint64(cpuShares) + cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares)) + + if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" { + cfg.Hooks = lconfigs.Hooks{ + lconfigs.CreateRuntime: lconfigs.HookList{ + newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath), + }, + } + } + + return nil +} + +func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) { + cfg := &lconfigs.Config{ + Cgroups: &lconfigs.Cgroup{ + Resources: &lconfigs.Resources{ + MemorySwappiness: nil, + }, + }, + Version: "1.0.0", + } + + for _, device := range specconv.AllowedDevices { + cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule) + } + + configureCapabilities(cfg, command) + + // children should not inherit Nomad agent oom_score_adj value + oomScoreAdj := 0 + cfg.OomScoreAdj = &oomScoreAdj + + if err := configureIsolation(cfg, command); err != nil { + return nil, err + } + + if err := configureCgroups(cfg, command); err != nil { + return nil, err + } + + return cfg, nil +} + +// cmdDevices converts a list of 
driver.DeviceConfigs into excutor.Devices. +func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) { + if len(driverDevices) == 0 { + return nil, nil + } + + r := make([]*devices.Device, len(driverDevices)) + + for i, d := range driverDevices { + ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions) + if err != nil { + return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err) + } + ed.Path = d.TaskPath + r[i] = ed + } + + return r, nil +} + +var userMountToUnixMount = map[string]int{ + // Empty string maps to `rprivate` for backwards compatibility in restored + // older tasks, where mount propagation will not be present. + "": unix.MS_PRIVATE | unix.MS_REC, // rprivate + structs.VolumeMountPropagationPrivate: unix.MS_PRIVATE | unix.MS_REC, // rprivate + structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC, // rslave + structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared +} + +// cmdMounts converts a list of driver.MountConfigs into excutor.Mounts. +func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount { + if len(mounts) == 0 { + return nil + } + + r := make([]*lconfigs.Mount, len(mounts)) + + for i, m := range mounts { + flags := unix.MS_BIND + if m.Readonly { + flags |= unix.MS_RDONLY + } + + r[i] = &lconfigs.Mount{ + Source: m.HostPath, + Destination: m.TaskPath, + Device: "bind", + Flags: flags, + PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]}, + } + } + + return r +} + +// lookupTaskBin finds the file `bin`, searching in order: +// - taskDir/local +// - taskDir +// - each mount, in order listed in the jobspec +// - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir +// +// Returns an absolute path inside the container that will get passed as arg[0] +// to the launched process, and the absolute path to that binary as seen by the +// host (these will be identical for binaries that don't come from mounts). 
+// +// See also executor.lookupBin for a version used by non-isolated drivers. +func lookupTaskBin(command *ExecCommand) (string, string, error) { + taskDir := command.TaskDir + bin := command.Cmd + + // Check in the local directory + localDir := filepath.Join(taskDir, allocdir.TaskLocal) + taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin) + if err == nil { + return taskPath, hostPath, nil + } + + // Check at the root of the task's directory + taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin) + if err == nil { + return taskPath, hostPath, nil + } + + // Check in our mounts + for _, mount := range command.Mounts { + taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin) + if err == nil { + return taskPath, hostPath, nil + } + } + + // If there's a / in the binary's path, we can't fallback to a PATH search + if strings.Contains(bin, "/") { + return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir) + } + + // look for a file using a PATH-style lookup inside the directory + // root. Similar to the stdlib's exec.LookPath except: + // - uses a restricted lookup PATH rather than the agent process's PATH env var. + // - does not require that the file is already executable (this will be ensured + // by the caller) + // - does not prevent using relative path as added to exec.LookPath in go1.19 + // (this gets fixed-up in the caller) + + // This is a fake PATH so that we're not using the agent's PATH + restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"} + + for _, dir := range restrictedPaths { + pathDir := filepath.Join(command.TaskDir, dir) + taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin) + if err == nil { + return taskPath, hostPath, nil + } + } + + return "", "", fmt.Errorf("file %s not found under path", bin) +} + +// getPathInTaskDir searches for the binary in the task directory and nested +// search directory. 
It returns the absolute path rooted inside the container +// and the absolute path on the host. +func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) { + + hostPath := filepath.Join(searchDir, bin) + err := filepathIsRegular(hostPath) + if err != nil { + return "", "", err + } + + // Find the path relative to the task directory + rel, err := filepath.Rel(taskDir, hostPath) + if rel == "" || err != nil { + return "", "", fmt.Errorf( + "failed to determine relative path base=%q target=%q: %v", + taskDir, hostPath, err) + } + + // Turn relative-to-taskdir path into re-rooted absolute path to avoid + // libcontainer trying to resolve the binary using $PATH. + // Do *not* use filepath.Join as it will translate ".."s returned by + // filepath.Rel. Prepending "/" will cause the path to be rooted in the + // chroot which is the desired behavior. + return filepath.Clean("/" + rel), hostPath, nil +} + +// getPathInMount for the binary in the mount's host path, constructing the path +// considering that the bin path is rooted in the mount's task path and not its +// host path. It returns the absolute path rooted inside the container and the +// absolute path on the host. +func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) { + + // Find the path relative to the mount point in the task so that we can + // trim off any shared prefix when we search on the host path + mountRel, err := filepath.Rel(mountTaskPath, bin) + if mountRel == "" || err != nil { + return "", "", fmt.Errorf("path was not relative to the mount task path") + } + + hostPath := filepath.Join(mountHostPath, mountRel) + + err = filepathIsRegular(hostPath) + if err != nil { + return "", "", err + } + + // Turn relative-to-taskdir path into re-rooted absolute path to avoid + // libcontainer trying to resolve the binary using $PATH. + // Do *not* use filepath.Join as it will translate ".."s returned by + // filepath.Rel. 
Prepending "/" will cause the path to be rooted in the + // chroot which is the desired behavior. + return filepath.Clean("/" + bin), hostPath, nil +} + +// filepathIsRegular verifies that a filepath is a regular file (i.e. not a +// directory, socket, device, etc.) +func filepathIsRegular(path string) error { + f, err := os.Stat(path) + if err != nil { + return err + } + if !f.Mode().Type().IsRegular() { + return fmt.Errorf("path was not a regular file") + } + return nil +} + +func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook { + return lconfigs.NewFunctionHook(func(state *specs.State) error { + return cgroups.WriteCgroupProc(cgroupPath, state.Pid) + }) +} |