Diffstat (limited to 'executor/executor_linux.go')
-rw-r--r--   executor/executor_linux.go   926
1 file changed, 926 insertions, 0 deletions
diff --git a/executor/executor_linux.go b/executor/executor_linux.go
new file mode 100644
index 0000000..4ab8367
--- /dev/null
+++ b/executor/executor_linux.go
@@ -0,0 +1,926 @@
+//go:build linux
+
+package executor
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/armon/circbuf"
+ "github.com/hashicorp/consul-template/signals"
+ hclog "github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/nomad/client/allocdir"
+ "github.com/hashicorp/nomad/client/lib/cgutil"
+ "github.com/hashicorp/nomad/client/lib/resources"
+ "github.com/hashicorp/nomad/client/stats"
+ cstructs "github.com/hashicorp/nomad/client/structs"
+ "github.com/hashicorp/nomad/drivers/shared/capabilities"
+ shelpers "github.com/hashicorp/nomad/helper/stats"
+ "github.com/hashicorp/nomad/helper/uuid"
+ "github.com/hashicorp/nomad/nomad/structs"
+ "github.com/hashicorp/nomad/plugins/drivers"
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ lconfigs "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/devices"
+ ldevices "github.com/opencontainers/runc/libcontainer/devices"
+ "github.com/opencontainers/runc/libcontainer/specconv"
+ lutils "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+)
+
+var (
+ // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1
+ ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
+
+ // ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage.
+ ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}
+
+ // ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the executor
+ ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
+)
+
+// LibcontainerExecutor implements an Executor with the runc/libcontainer API
+type LibcontainerExecutor struct {
+ id string
+ command *ExecCommand
+
+ logger hclog.Logger
+
+ totalCpuStats *stats.CpuStats
+ userCpuStats *stats.CpuStats
+ systemCpuStats *stats.CpuStats
+ pidCollector *pidCollector
+
+ container libcontainer.Container
+ userProc *libcontainer.Process
+ userProcExited chan interface{}
+ exitState *ProcessState
+}
+
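+// NewExecutorWithIsolation returns an Executor that uses libcontainer to run
+// the task process in an isolated environment (chroot, namespaces, and cgroups).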
+func NewExecutorWithIsolation(logger hclog.Logger) Executor {
+ logger = logger.Named("isolated_executor")
+ if err := shelpers.Init(); err != nil {
+ logger.Error("unable to initialize stats", "error", err)
+ }
+ return &LibcontainerExecutor{
+ id: strings.ReplaceAll(uuid.Generate(), "-", "_"),
+ logger: logger,
+ totalCpuStats: stats.NewCpuStats(),
+ userCpuStats: stats.NewCpuStats(),
+ systemCpuStats: stats.NewCpuStats(),
+ pidCollector: newPidCollector(logger),
+ }
+}
+
+// Launch creates a new container in libcontainer and starts a new process with it
+func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
+ l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
+
+ if command.Resources == nil {
+ command.Resources = &drivers.Resources{
+ NomadResources: &structs.AllocatedTaskResources{},
+ }
+ }
+
+ l.command = command
+
+ // create a new factory which will store the container state in the allocDir
+ factory, err := libcontainer.New(
+ path.Join(command.TaskDir, "../alloc/container"),
+ // note that os.Args[0] typically refers to the executor shim, and the first
+ // argument is ignored for now until
+ // https://github.com/opencontainers/runc/pull/1888 is merged
+ libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
+ )
+ if err != nil {
+ return nil, fmt.Errorf("failed to create factory: %v", err)
+ }
+
+ // A container groups processes under the same isolation enforcement
+ containerCfg, err := newLibcontainerConfig(command)
+ if err != nil {
+ return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
+ }
+
+ container, err := factory.Create(l.id, containerCfg)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
+ }
+ l.container = container
+
+ // Look up the binary path and make it executable
+ taskPath, hostPath, err := lookupTaskBin(command)
+ if err != nil {
+ return nil, err
+ }
+ if err := makeExecutable(hostPath); err != nil {
+ return nil, err
+ }
+
+ combined := append([]string{taskPath}, command.Args...)
+ stdout, err := command.Stdout()
+ if err != nil {
+ return nil, err
+ }
+ stderr, err := command.Stderr()
+ if err != nil {
+ return nil, err
+ }
+
+ l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
+
+ // the task process will be started by the container
+ process := &libcontainer.Process{
+ Args: combined,
+ Env: command.Env,
+ Stdout: stdout,
+ Stderr: stderr,
+ Init: true,
+ }
+
+ if command.User != "" {
+ process.User = command.User
+ }
+ l.userProc = process
+
+ l.totalCpuStats = stats.NewCpuStats()
+ l.userCpuStats = stats.NewCpuStats()
+ l.systemCpuStats = stats.NewCpuStats()
+
+ // Starts the task
+ if err := container.Run(process); err != nil {
+ container.Destroy()
+ return nil, err
+ }
+
+ pid, err := process.Pid()
+ if err != nil {
+ container.Destroy()
+ return nil, err
+ }
+
+ // start a goroutine to wait on the process to complete, so Wait calls can
+ // be multiplexed
+ l.userProcExited = make(chan interface{})
+ go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
+ go l.wait()
+
+ return &ProcessState{
+ Pid: pid,
+ ExitCode: -1,
+ Time: time.Now(),
+ }, nil
+}
+
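+// getAllPids returns the set of PIDs for all processes currently running in
+// the container.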
+func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) {
+ pids, err := l.container.Processes()
+ if err != nil {
+ return nil, err
+ }
+ m := make(resources.PIDs, 1)
+ for _, pid := range pids {
+ m[pid] = resources.NewPID(pid)
+ }
+ return m, nil
+}
+
+// Wait waits until the user process has exited and returns its exit code and any error
+func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
+ select {
+ case <-ctx.Done():
+ return nil, ctx.Err()
+ case <-l.userProcExited:
+ return l.exitState, nil
+ }
+}
+
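+// wait blocks until the user process exits, records its exit code and signal
+// in exitState, and closes userProcExited so that Wait callers are unblocked.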
+func (l *LibcontainerExecutor) wait() {
+ defer close(l.userProcExited)
+
+ ps, err := l.userProc.Wait()
+ if err != nil {
+ // If the process has exited before we called Wait, an error is returned
+ // and the process state is embedded in the error.
+ if exitErr, ok := err.(*exec.ExitError); ok {
+ ps = exitErr.ProcessState
+ } else {
+ l.logger.Error("failed to call wait on user process", "error", err)
+ l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
+ return
+ }
+ }
+
+ l.command.Close()
+
+ exitCode := 1
+ var signal int
+ if status, ok := ps.Sys().(syscall.WaitStatus); ok {
+ exitCode = status.ExitStatus()
+ if status.Signaled() {
+ const exitSignalBase = 128
+ signal = int(status.Signal())
+ exitCode = exitSignalBase + signal
+ }
+ }
+
+ l.exitState = &ProcessState{
+ Pid: ps.Pid(),
+ ExitCode: exitCode,
+ Signal: signal,
+ Time: time.Now(),
+ }
+}
+
+// Shutdown stops all processes started and cleans up any resources
+// created (such as mountpoints, devices, etc).
+func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
+ if l.container == nil {
+ return nil
+ }
+
+ status, err := l.container.Status()
+ if err != nil {
+ return err
+ }
+
+ defer l.container.Destroy()
+
+ if status == libcontainer.Stopped {
+ return nil
+ }
+
+ if grace > 0 {
+ if signal == "" {
+ signal = "SIGINT"
+ }
+
+ sig, ok := signals.SignalLookup[signal]
+ if !ok {
+ return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
+ }
+
+ // Signal initial container processes only during graceful
+ // shutdown; hence `false` arg.
+ err = l.container.Signal(sig, false)
+ if err != nil {
+ return err
+ }
+
+ select {
+ case <-l.userProcExited:
+ return nil
+ case <-time.After(grace):
+ // Force kill all container processes after grace period,
+ // hence `true` argument.
+ if err := l.container.Signal(os.Kill, true); err != nil {
+ return err
+ }
+ }
+ } else {
+ err := l.container.Signal(os.Kill, true)
+ if err != nil {
+ return err
+ }
+ }
+
+ select {
+ case <-l.userProcExited:
+ return nil
+ case <-time.After(time.Second * 15):
+ return fmt.Errorf("process failed to exit after 15 seconds")
+ }
+}
+
+// UpdateResources is a no-op for the libcontainer executor; resource isolation is configured once at launch
+func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
+ return nil
+}
+
+// Version returns the API version of the executor
+func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
+ return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
+}
+
+// Stats returns the resource statistics for processes managed by the executor
+func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
+ ch := make(chan *cstructs.TaskResourceUsage)
+ go l.handleStats(ch, ctx, interval)
+ return ch, nil
+
+}
+
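+// handleStats collects memory and CPU usage from the container's cgroup at the
+// given interval and emits it on ch until the context is cancelled.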
+func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
+ defer close(ch)
+ timer := time.NewTimer(0)
+
+ measuredMemStats := ExecutorCgroupV1MeasuredMemStats
+ if cgroups.IsCgroup2UnifiedMode() {
+ measuredMemStats = ExecutorCgroupV2MeasuredMemStats
+ }
+
+ for {
+ select {
+ case <-ctx.Done():
+ return
+
+ case <-timer.C:
+ timer.Reset(interval)
+ }
+
+ lstats, err := l.container.Stats()
+ if err != nil {
+ l.logger.Warn("error collecting stats", "error", err)
+ return
+ }
+
+ pidStats, err := l.pidCollector.pidStats()
+ if err != nil {
+ l.logger.Warn("error collecting stats", "error", err)
+ return
+ }
+
+ ts := time.Now()
+ stats := lstats.CgroupStats
+
+ // Memory Related Stats
+ swap := stats.MemoryStats.SwapUsage
+ maxUsage := stats.MemoryStats.Usage.MaxUsage
+ rss := stats.MemoryStats.Stats["rss"]
+ cache := stats.MemoryStats.Stats["cache"]
+ mapped_file := stats.MemoryStats.Stats["mapped_file"]
+ ms := &cstructs.MemoryStats{
+ RSS: rss,
+ Cache: cache,
+ Swap: swap.Usage,
+ MappedFile: mapped_file,
+ Usage: stats.MemoryStats.Usage.Usage,
+ MaxUsage: maxUsage,
+ KernelUsage: stats.MemoryStats.KernelUsage.Usage,
+ KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
+ Measured: measuredMemStats,
+ }
+
+ // CPU Related Stats
+ totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
+ userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
+ kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
+
+ totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
+ cs := &cstructs.CpuStats{
+ SystemMode: l.systemCpuStats.Percent(kernelModeTime),
+ UserMode: l.userCpuStats.Percent(userModeTime),
+ Percent: totalPercent,
+ ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
+ ThrottledTime: stats.CpuStats.ThrottlingData.ThrottledTime,
+ TotalTicks: l.systemCpuStats.TicksConsumed(totalPercent),
+ Measured: ExecutorCgroupMeasuredCpuStats,
+ }
+ taskResUsage := cstructs.TaskResourceUsage{
+ ResourceUsage: &cstructs.ResourceUsage{
+ MemoryStats: ms,
+ CpuStats: cs,
+ },
+ Timestamp: ts.UTC().UnixNano(),
+ Pids: pidStats,
+ }
+
+ select {
+ case <-ctx.Done():
+ return
+ case ch <- &taskResUsage:
+ }
+
+ }
+}
+
+// Signal sends a signal to the process managed by the executor
+func (l *LibcontainerExecutor) Signal(s os.Signal) error {
+ return l.userProc.Signal(s)
+}
+
+// Exec starts an additional process inside the container
+func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
+ combined := append([]string{cmd}, args...)
+ // Capture output
+ buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
+
+ process := &libcontainer.Process{
+ Args: combined,
+ Env: l.command.Env,
+ Stdout: buf,
+ Stderr: buf,
+ }
+
+ err := l.container.Run(process)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ waitCh := make(chan *waitResult)
+ defer close(waitCh)
+ go l.handleExecWait(waitCh, process)
+
+ select {
+ case result := <-waitCh:
+ ps := result.ps
+ if result.err != nil {
+ if exitErr, ok := result.err.(*exec.ExitError); ok {
+ ps = exitErr.ProcessState
+ } else {
+ return nil, 0, result.err
+ }
+ }
+ var exitCode int
+ if status, ok := ps.Sys().(syscall.WaitStatus); ok {
+ exitCode = status.ExitStatus()
+ }
+ return buf.Bytes(), exitCode, nil
+
+ case <-time.After(time.Until(deadline)):
+ process.Signal(os.Kill)
+ return nil, 0, context.DeadlineExceeded
+ }
+
+}
+
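+// newTerminalSocket creates a unix socket pair: the child end is passed to the
+// container process as its console socket, and the returned pty func receives
+// the PTY master file descriptor back over the parent end.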
+func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
+ parent, child, err := lutils.NewSockPair("socket")
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
+ }
+
+ return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
+
+}
+
+func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
+ stream drivers.ExecTaskStream) error {
+
+ // the task process will be started by the container
+ process := &libcontainer.Process{
+ Args: cmd,
+ Env: l.userProc.Env,
+ User: l.userProc.User,
+ Init: false,
+ Cwd: "/",
+ }
+
+ execHelper := &execHelper{
+ logger: l.logger,
+
+ newTerminal: l.newTerminalSocket,
+ setTTY: func(tty *os.File) error {
+ process.ConsoleSocket = tty
+ return nil
+ },
+ setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
+ process.Stdin = stdin
+ process.Stdout = stdout
+ process.Stderr = stderr
+ return nil
+ },
+
+ processStart: func() error { return l.container.Run(process) },
+ processWait: func() (*os.ProcessState, error) {
+ return process.Wait()
+ },
+ }
+
+ return execHelper.run(ctx, tty, stream)
+
+}
+
+type waitResult struct {
+ ps *os.ProcessState
+ err error
+}
+
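+// handleExecWait waits on the exec'd process and delivers its process state
+// (or error) on ch.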
+func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
+ ps, err := process.Wait()
+ ch <- &waitResult{ps, err}
+}
+
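+// configureCapabilities sets the container's Linux capabilities: the legacy
+// capability set when running as root, otherwise only the bounding set
+// supplied by the driver for the task.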
+func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) {
+ switch command.User {
+ case "root":
+ // when running as root, use the legacy set of system capabilities, so
+ // that we do not break existing nomad clusters using this "feature"
+ legacyCaps := capabilities.LegacySupported().Slice(true)
+ cfg.Capabilities = &lconfigs.Capabilities{
+ Bounding: legacyCaps,
+ Permitted: legacyCaps,
+ Effective: legacyCaps,
+ Ambient: nil,
+ Inheritable: nil,
+ }
+ default:
+ // otherwise apply the plugin + task capability configuration
+ cfg.Capabilities = &lconfigs.Capabilities{
+ Bounding: command.Capabilities,
+ }
+ }
+}
+
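+// configureNamespaces returns the namespaces for the container: always a new
+// mount namespace, plus private PID and/or IPC namespaces when requested.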
+func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
+ namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
+ if pidMode == IsolationModePrivate {
+ namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
+ }
+ if ipcMode == IsolationModePrivate {
+ namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
+ }
+ return namespaces
+}
+
+// configureIsolation prepares the isolation primitives of the container.
+// The process runs in a container configured with the following:
+//
+// * the task directory as the chroot
+// * a dedicated mount namespace; the PID, IPC, user, UTS, and network namespaces
+//   are shared with the host unless the task configuration requests otherwise
+// * a small subset of devices (e.g. stdin/stdout/stderr, tty, shm, pts), defaulting
+//   to the same set of devices as Docker
+// * some special filesystems: `/proc`, `/sys`. Some care is taken to avoid the task
+//   escaping the chroot or setting malicious values through them
+func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
+ defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
+
+ // set the new root directory for the container
+ cfg.Rootfs = command.TaskDir
+
+ // disable pivot_root if set in the driver's configuration
+ cfg.NoPivotRoot = command.NoPivotRoot
+
+ // set up default namespaces as configured
+ cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)
+
+ if command.NetworkIsolation != nil {
+ cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
+ Type: lconfigs.NEWNET,
+ Path: command.NetworkIsolation.Path,
+ })
+ }
+
+ // paths to mask using a bind mount to /dev/null to prevent reading
+ cfg.MaskPaths = []string{
+ "/proc/kcore",
+ "/sys/firmware",
+ }
+
+ // paths that should be remounted as readonly inside the container
+ cfg.ReadonlyPaths = []string{
+ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+ }
+
+ cfg.Devices = specconv.AllowedDevices
+ if len(command.Devices) > 0 {
+ devs, err := cmdDevices(command.Devices)
+ if err != nil {
+ return err
+ }
+ cfg.Devices = append(cfg.Devices, devs...)
+ }
+
+ cfg.Mounts = []*lconfigs.Mount{
+ {
+ Source: "tmpfs",
+ Destination: "/dev",
+ Device: "tmpfs",
+ Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
+ Data: "mode=755",
+ },
+ {
+ Source: "proc",
+ Destination: "/proc",
+ Device: "proc",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
+ Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
+ },
+ {
+ Device: "tmpfs",
+ Source: "shm",
+ Destination: "/dev/shm",
+ Data: "mode=1777,size=65536k",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "mqueue",
+ Destination: "/dev/mqueue",
+ Device: "mqueue",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "sysfs",
+ Destination: "/sys",
+ Device: "sysfs",
+ Flags: defaultMountFlags | syscall.MS_RDONLY,
+ },
+ }
+
+ if len(command.Mounts) > 0 {
+ cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
+ }
+
+ return nil
+}
+
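+// configureCgroups sets the container's cgroup path and applies memory and CPU
+// limits derived from the task's allocated resources.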
+func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
+ // If resource limits are not being enforced, just create the basic cgroups needed
+ if !command.ResourceLimits {
+ return cgutil.ConfigureBasicCgroups(cfg)
+ }
+
+ // set cgroups path
+ if cgutil.UseV2 {
+ // in v2, the cgroup must have been created by the client already,
+ // which breaks a lot of existing tests that run drivers without a client
+ if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" {
+ return errors.New("cgroup path must be set")
+ }
+ parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath)
+ cfg.Cgroups.Path = filepath.Join("/", parent, cgroup)
+ } else {
+ // in v1, the cgroup is created using /nomad, which is a bug because it
+ // does not respect the cgroup_parent client configuration
+ // (but makes testing easy)
+ id := uuid.Generate()
+ cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id)
+ }
+
+ if command.Resources == nil || command.Resources.NomadResources == nil {
+ return nil
+ }
+
+ // Total amount of memory the task is allowed to consume
+ res := command.Resources.NomadResources
+ memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB
+ if memHard <= 0 {
+ memHard = res.Memory.MemoryMB
+ memSoft = 0
+ }
+
+ if memHard > 0 {
+ cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024
+ cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024
+
+ // Disable swap to avoid issues on the machine
+ var memSwappiness uint64
+ cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
+ }
+
+ cpuShares := res.Cpu.CpuShares
+ if cpuShares < 2 {
+ return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
+ }
+
+ // Set the relative CPU shares for this cgroup, and convert for cgroupv2
+ cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
+ cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))
+
+ if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" {
+ cfg.Hooks = lconfigs.Hooks{
+ lconfigs.CreateRuntime: lconfigs.HookList{
+ newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath),
+ },
+ }
+ }
+
+ return nil
+}
+
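+// newLibcontainerConfig builds the libcontainer configuration for the task:
+// allowed devices, capabilities, oom_score_adj, isolation, and cgroups.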
+func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
+ cfg := &lconfigs.Config{
+ Cgroups: &lconfigs.Cgroup{
+ Resources: &lconfigs.Resources{
+ MemorySwappiness: nil,
+ },
+ },
+ Version: "1.0.0",
+ }
+
+ for _, device := range specconv.AllowedDevices {
+ cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule)
+ }
+
+ configureCapabilities(cfg, command)
+
+ // children should not inherit the Nomad agent's oom_score_adj value
+ oomScoreAdj := 0
+ cfg.OomScoreAdj = &oomScoreAdj
+
+ if err := configureIsolation(cfg, command); err != nil {
+ return nil, err
+ }
+
+ if err := configureCgroups(cfg, command); err != nil {
+ return nil, err
+ }
+
+ return cfg, nil
+}
+
+// cmdDevices converts a list of drivers.DeviceConfig into libcontainer devices.
+func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) {
+ if len(driverDevices) == 0 {
+ return nil, nil
+ }
+
+ r := make([]*devices.Device, len(driverDevices))
+
+ for i, d := range driverDevices {
+ ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
+ if err != nil {
+ return nil, fmt.Errorf("failed to make device out of %s: %v", d.HostPath, err)
+ }
+ ed.Path = d.TaskPath
+ r[i] = ed
+ }
+
+ return r, nil
+}
+
+var userMountToUnixMount = map[string]int{
+ // Empty string maps to `rprivate` for backwards compatibility in restored
+ // older tasks, where mount propagation will not be present.
+ "": unix.MS_PRIVATE | unix.MS_REC, // rprivate
+ structs.VolumeMountPropagationPrivate: unix.MS_PRIVATE | unix.MS_REC, // rprivate
+ structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC, // rslave
+ structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
+}
+
+// cmdMounts converts a list of drivers.MountConfig into libcontainer mounts.
+func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
+ if len(mounts) == 0 {
+ return nil
+ }
+
+ r := make([]*lconfigs.Mount, len(mounts))
+
+ for i, m := range mounts {
+ flags := unix.MS_BIND
+ if m.Readonly {
+ flags |= unix.MS_RDONLY
+ }
+
+ r[i] = &lconfigs.Mount{
+ Source: m.HostPath,
+ Destination: m.TaskPath,
+ Device: "bind",
+ Flags: flags,
+ PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
+ }
+ }
+
+ return r
+}
+
+// lookupTaskBin finds the file `bin`, searching in order:
+// - taskDir/local
+// - taskDir
+// - each mount, in order listed in the jobspec
+// - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir
+//
+// Returns an absolute path inside the container that will get passed as arg[0]
+// to the launched process, and the absolute path to that binary as seen by the
+// host (these will be identical for binaries that don't come from mounts).
+//
+// See also executor.lookupBin for a version used by non-isolated drivers.
+func lookupTaskBin(command *ExecCommand) (string, string, error) {
+ taskDir := command.TaskDir
+ bin := command.Cmd
+
+ // Check in the local directory
+ localDir := filepath.Join(taskDir, allocdir.TaskLocal)
+ taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+
+ // Check at the root of the task's directory
+ taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+
+ // Check in our mounts
+ for _, mount := range command.Mounts {
+ taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+ }
+
+ // If there's a / in the binary's path, we can't fall back to a PATH search
+ if strings.Contains(bin, "/") {
+ return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
+ }
+
+ // look for a file using a PATH-style lookup inside the directory
+ // root. Similar to the stdlib's exec.LookPath except:
+ // - uses a restricted lookup PATH rather than the agent process's PATH env var.
+ // - does not require that the file is already executable (this will be ensured
+ // by the caller)
+ // - does not prevent using relative path as added to exec.LookPath in go1.19
+ // (this gets fixed-up in the caller)
+
+ // This is a fake PATH so that we're not using the agent's PATH
+ restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"}
+
+ for _, dir := range restrictedPaths {
+ pathDir := filepath.Join(command.TaskDir, dir)
+ taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+ }
+
+ return "", "", fmt.Errorf("file %s not found under path", bin)
+}
+
+// getPathInTaskDir searches for the binary in the task directory and nested
+// search directory. It returns the absolute path rooted inside the container
+// and the absolute path on the host.
+func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) {
+
+ hostPath := filepath.Join(searchDir, bin)
+ err := filepathIsRegular(hostPath)
+ if err != nil {
+ return "", "", err
+ }
+
+ // Find the path relative to the task directory
+ rel, err := filepath.Rel(taskDir, hostPath)
+ if rel == "" || err != nil {
+ return "", "", fmt.Errorf(
+ "failed to determine relative path base=%q target=%q: %v",
+ taskDir, hostPath, err)
+ }
+
+ // Turn relative-to-taskdir path into re-rooted absolute path to avoid
+ // libcontainer trying to resolve the binary using $PATH.
+ // Do *not* use filepath.Join as it will translate ".."s returned by
+ // filepath.Rel. Prepending "/" will cause the path to be rooted in the
+ // chroot which is the desired behavior.
+ return filepath.Clean("/" + rel), hostPath, nil
+}
+
+// getPathInMount searches for the binary under the mount's host path, taking into
+// account that the bin path is rooted in the mount's task path and not its
+// host path. It returns the absolute path rooted inside the container and the
+// absolute path on the host.
+func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) {
+
+ // Find the path relative to the mount point in the task so that we can
+ // trim off any shared prefix when we search on the host path
+ mountRel, err := filepath.Rel(mountTaskPath, bin)
+ if mountRel == "" || err != nil {
+ return "", "", fmt.Errorf("path was not relative to the mount task path")
+ }
+
+ hostPath := filepath.Join(mountHostPath, mountRel)
+
+ err = filepathIsRegular(hostPath)
+ if err != nil {
+ return "", "", err
+ }
+
+ // Turn relative-to-taskdir path into re-rooted absolute path to avoid
+ // libcontainer trying to resolve the binary using $PATH.
+ // Do *not* use filepath.Join as it will translate ".."s returned by
+ // filepath.Rel. Prepending "/" will cause the path to be rooted in the
+ // chroot which is the desired behavior.
+ return filepath.Clean("/" + bin), hostPath, nil
+}
+
+// filepathIsRegular verifies that a filepath is a regular file (i.e. not a
+// directory, socket, device, etc.)
+func filepathIsRegular(path string) error {
+ f, err := os.Stat(path)
+ if err != nil {
+ return err
+ }
+ if !f.Mode().Type().IsRegular() {
+ return fmt.Errorf("path was not a regular file")
+ }
+ return nil
+}
+
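+// newSetCPUSetCgroupHook returns a CreateRuntime hook that places the
+// container's init process into the given cpuset cgroup.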
+func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook {
+ return lconfigs.NewFunctionHook(func(state *specs.State) error {
+ return cgroups.WriteCgroupProc(cgroupPath, state.Pid)
+ })
+}