Diffstat (limited to 'executor/executor_linux.go')
-rw-r--r--   executor/executor_linux.go   926
1 file changed, 926 insertions, 0 deletions
diff --git a/executor/executor_linux.go b/executor/executor_linux.go
new file mode 100644
index 0000000..4ab8367
--- /dev/null
+++ b/executor/executor_linux.go
@@ -0,0 +1,926 @@
+//go:build linux
+
+package executor
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/armon/circbuf"
+ "github.com/hashicorp/consul-template/signals"
+ hclog "github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/nomad/client/allocdir"
+ "github.com/hashicorp/nomad/client/lib/cgutil"
+ "github.com/hashicorp/nomad/client/lib/resources"
+ "github.com/hashicorp/nomad/client/stats"
+ cstructs "github.com/hashicorp/nomad/client/structs"
+ "github.com/hashicorp/nomad/drivers/shared/capabilities"
+ shelpers "github.com/hashicorp/nomad/helper/stats"
+ "github.com/hashicorp/nomad/helper/uuid"
+ "github.com/hashicorp/nomad/nomad/structs"
+ "github.com/hashicorp/nomad/plugins/drivers"
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ lconfigs "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/devices"
+ ldevices "github.com/opencontainers/runc/libcontainer/devices"
+ "github.com/opencontainers/runc/libcontainer/specconv"
+ lutils "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+)
+
+var (
+ // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1
+ ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
+
+ // ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage.
+ ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}
+
+ // ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the executor
+ ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
+)
+
+// LibcontainerExecutor implements an Executor with the runc/libcontainer API
+type LibcontainerExecutor struct {
+ id string
+ command *ExecCommand
+
+ logger hclog.Logger
+
+ totalCpuStats *stats.CpuStats
+ userCpuStats *stats.CpuStats
+ systemCpuStats *stats.CpuStats
+ pidCollector *pidCollector
+
+ container libcontainer.Container
+ userProc *libcontainer.Process
+ userProcExited chan interface{}
+ exitState *ProcessState
+}
+
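+// NewExecutorWithIsolation returns an Executor that uses libcontainer to run
+// the task process in an isolated environment (chroot, namespaces, and cgroups).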
+func NewExecutorWithIsolation(logger hclog.Logger) Executor {
+ logger = logger.Named("isolated_executor")
+ if err := shelpers.Init(); err != nil {
+ logger.Error("unable to initialize stats", "error", err)
+ }
+ return &LibcontainerExecutor{
+ id: strings.ReplaceAll(uuid.Generate(), "-", "_"),
+ logger: logger,
+ totalCpuStats: stats.NewCpuStats(),
+ userCpuStats: stats.NewCpuStats(),
+ systemCpuStats: stats.NewCpuStats(),
+ pidCollector: newPidCollector(logger),
+ }
+}
+
+// Launch creates a new container in libcontainer and starts a new process with it
+func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
+ l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
+
+ if command.Resources == nil {
+ command.Resources = &drivers.Resources{
+ NomadResources: &structs.AllocatedTaskResources{},
+ }
+ }
+
+ l.command = command
+
+ // create a new factory which will store the container state in the allocDir
+ factory, err := libcontainer.New(
+ path.Join(command.TaskDir, "../alloc/container"),
+ // note that os.Args[0] typically refers to the executor shim, and the first
+ // argument is ignored for now until
+ // https://github.com/opencontainers/runc/pull/1888 is merged
+ libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
+ )
+ if err != nil {
+ return nil, fmt.Errorf("failed to create factory: %v", err)
+ }
+
+ // A container groups processes under the same isolation enforcement
+ containerCfg, err := newLibcontainerConfig(command)
+ if err != nil {
+ return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
+ }
+
+ container, err := factory.Create(l.id, containerCfg)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
+ }
+ l.container = container
+
+ // Look up the binary path and make it executable
+ taskPath, hostPath, err := lookupTaskBin(command)
+ if err != nil {
+ return nil, err
+ }
+ if err := makeExecutable(hostPath); err != nil {
+ return nil, err
+ }
+
+ combined := append([]string{taskPath}, command.Args...)
+ stdout, err := command.Stdout()
+ if err != nil {
+ return nil, err
+ }
+ stderr, err := command.Stderr()
+ if err != nil {
+ return nil, err
+ }
+
+ l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
+
+ // the task process will be started by the container
+ process := &libcontainer.Process{
+ Args: combined,
+ Env: command.Env,
+ Stdout: stdout,
+ Stderr: stderr,
+ Init: true,
+ }
+
+ if command.User != "" {
+ process.User = command.User
+ }
+ l.userProc = process
+
+ l.totalCpuStats = stats.NewCpuStats()
+ l.userCpuStats = stats.NewCpuStats()
+ l.systemCpuStats = stats.NewCpuStats()
+
+ // Starts the task
+ if err := container.Run(process); err != nil {
+ container.Destroy()
+ return nil, err
+ }
+
+ pid, err := process.Pid()
+ if err != nil {
+ container.Destroy()
+ return nil, err
+ }
+
+ // start a goroutine to wait on the process to complete, so Wait calls can
+ // be multiplexed
+ l.userProcExited = make(chan interface{})
+ go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
+ go l.wait()
+
+ return &ProcessState{
+ Pid: pid,
+ ExitCode: -1,
+ Time: time.Now(),
+ }, nil
+}
+
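+// getAllPids returns the set of PIDs for all processes currently running in
+// the container.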
+func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) {
+ pids, err := l.container.Processes()
+ if err != nil {
+ return nil, err
+ }
+ m := make(resources.PIDs, 1)
+ for _, pid := range pids {
+ m[pid] = resources.NewPID(pid)
+ }
+ return m, nil
+}
+
+// Wait waits until the user process has exited and returns its exit code and any error
+func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
+ select {
+ case <-ctx.Done():
+ return nil, ctx.Err()
+ case <-l.userProcExited:
+ return l.exitState, nil
+ }
+}
+
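+// wait blocks until the user process exits, records its exit code and signal
+// in exitState, and closes userProcExited so that Wait callers are unblocked.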
+func (l *LibcontainerExecutor) wait() {
+ defer close(l.userProcExited)
+
+ ps, err := l.userProc.Wait()
+ if err != nil {
+ // If the process has exited before we called Wait, an error is returned
+ // and the process state is embedded in the error.
+ if exitErr, ok := err.(*exec.ExitError); ok {
+ ps = exitErr.ProcessState
+ } else {
+ l.logger.Error("failed to call wait on user process", "error", err)
+ l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
+ return
+ }
+ }
+
+ l.command.Close()
+
+ exitCode := 1
+ var signal int
+ if status, ok := ps.Sys().(syscall.WaitStatus); ok {
+ exitCode = status.ExitStatus()
+ if status.Signaled() {
+ const exitSignalBase = 128
+ signal = int(status.Signal())
+ exitCode = exitSignalBase + signal
+ }
+ }
+
+ l.exitState = &ProcessState{
+ Pid: ps.Pid(),
+ ExitCode: exitCode,
+ Signal: signal,
+ Time: time.Now(),
+ }
+}
+
+// Shutdown stops all processes started and cleans up any resources
+// created (such as mountpoints, devices, etc).
+func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
+ if l.container == nil {
+ return nil
+ }
+
+ status, err := l.container.Status()
+ if err != nil {
+ return err
+ }
+
+ defer l.container.Destroy()
+
+ if status == libcontainer.Stopped {
+ return nil
+ }
+
+ if grace > 0 {
+ if signal == "" {
+ signal = "SIGINT"
+ }
+
+ sig, ok := signals.SignalLookup[signal]
+ if !ok {
+ return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
+ }
+
+ // Signal initial container processes only during graceful
+ // shutdown; hence `false` arg.
+ err = l.container.Signal(sig, false)
+ if err != nil {
+ return err
+ }
+
+ select {
+ case <-l.userProcExited:
+ return nil
+ case <-time.After(grace):
+ // Force kill all container processes after grace period,
+ // hence `true` argument.
+ if err := l.container.Signal(os.Kill, true); err != nil {
+ return err
+ }
+ }
+ } else {
+ err := l.container.Signal(os.Kill, true)
+ if err != nil {
+ return err
+ }
+ }
+
+ select {
+ case <-l.userProcExited:
+ return nil
+ case <-time.After(time.Second * 15):
+ return fmt.Errorf("process failed to exit after 15 seconds")
+ }
+}
+
+// UpdateResources is a no-op for the libcontainer executor; resource isolation is configured once at launch
+func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
+ return nil
+}
+
+// Version returns the API version of the executor
+func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
+ return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
+}
+
+// Stats returns the resource statistics for processes managed by the executor
+func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
+ ch := make(chan *cstructs.TaskResourceUsage)
+ go l.handleStats(ch, ctx, interval)
+ return ch, nil
+
+}
+
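+// handleStats collects memory and CPU usage from the container's cgroup at the
+// given interval and emits it on ch until the context is cancelled.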
+func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
+ defer close(ch)
+ timer := time.NewTimer(0)
+
+ measuredMemStats := ExecutorCgroupV1MeasuredMemStats
+ if cgroups.IsCgroup2UnifiedMode() {
+ measuredMemStats = ExecutorCgroupV2MeasuredMemStats
+ }
+
+ for {
+ select {
+ case <-ctx.Done():
+ return
+
+ case <-timer.C:
+ timer.Reset(interval)
+ }
+
+ lstats, err := l.container.Stats()
+ if err != nil {
+ l.logger.Warn("error collecting stats", "error", err)
+ return
+ }
+
+ pidStats, err := l.pidCollector.pidStats()
+ if err != nil {
+ l.logger.Warn("error collecting stats", "error", err)
+ return
+ }
+
+ ts := time.Now()
+ stats := lstats.CgroupStats
+
+ // Memory Related Stats
+ swap := stats.MemoryStats.SwapUsage
+ maxUsage := stats.MemoryStats.Usage.MaxUsage
+ rss := stats.MemoryStats.Stats["rss"]
+ cache := stats.MemoryStats.Stats["cache"]
+ mapped_file := stats.MemoryStats.Stats["mapped_file"]
+ ms := &cstructs.MemoryStats{
+ RSS: rss,
+ Cache: cache,
+ Swap: swap.Usage,
+ MappedFile: mapped_file,
+ Usage: stats.MemoryStats.Usage.Usage,
+ MaxUsage: maxUsage,
+ KernelUsage: stats.MemoryStats.KernelUsage.Usage,
+ KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
+ Measured: measuredMemStats,
+ }
+
+ // CPU Related Stats
+ totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
+ userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
+ kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
+
+ totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
+ cs := &cstructs.CpuStats{
+ SystemMode: l.systemCpuStats.Percent(kernelModeTime),
+ UserMode: l.userCpuStats.Percent(userModeTime),
+ Percent: totalPercent,
+ ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
+ ThrottledTime: stats.CpuStats.ThrottlingData.ThrottledTime,
+ TotalTicks: l.systemCpuStats.TicksConsumed(totalPercent),
+ Measured: ExecutorCgroupMeasuredCpuStats,
+ }
+ taskResUsage := cstructs.TaskResourceUsage{
+ ResourceUsage: &cstructs.ResourceUsage{
+ MemoryStats: ms,
+ CpuStats: cs,
+ },
+ Timestamp: ts.UTC().UnixNano(),
+ Pids: pidStats,
+ }
+
+ select {
+ case <-ctx.Done():
+ return
+ case ch <- &taskResUsage:
+ }
+
+ }
+}
+
+// Signal sends a signal to the process managed by the executor
+func (l *LibcontainerExecutor) Signal(s os.Signal) error {
+ return l.userProc.Signal(s)
+}
+
+// Exec starts an additional process inside the container
+func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
+ combined := append([]string{cmd}, args...)
+ // Capture output
+ buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
+
+ process := &libcontainer.Process{
+ Args: combined,
+ Env: l.command.Env,
+ Stdout: buf,
+ Stderr: buf,
+ }
+
+ err := l.container.Run(process)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ waitCh := make(chan *waitResult)
+ defer close(waitCh)
+ go l.handleExecWait(waitCh, process)
+
+ select {
+ case result := <-waitCh:
+ ps := result.ps
+ if result.err != nil {
+ if exitErr, ok := result.err.(*exec.ExitError); ok {
+ ps = exitErr.ProcessState
+ } else {
+ return nil, 0, result.err
+ }
+ }
+ var exitCode int
+ if status, ok := ps.Sys().(syscall.WaitStatus); ok {
+ exitCode = status.ExitStatus()
+ }
+ return buf.Bytes(), exitCode, nil
+
+ case <-time.After(time.Until(deadline)):
+ process.Signal(os.Kill)
+ return nil, 0, context.DeadlineExceeded
+ }
+
+}
+
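+// newTerminalSocket creates a unix socket pair: the child end is passed to the
+// container process as its console socket, and the returned pty func receives
+// the PTY master file descriptor back over the parent end.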
+func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
+ parent, child, err := lutils.NewSockPair("socket")
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
+ }
+
+ return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
+
+}
+
+func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
+ stream drivers.ExecTaskStream) error {
+
+ // the task process will be started by the container
+ process := &libcontainer.Process{
+ Args: cmd,
+ Env: l.userProc.Env,
+ User: l.userProc.User,
+ Init: false,
+ Cwd: "/",
+ }
+
+ execHelper := &execHelper{
+ logger: l.logger,
+
+ newTerminal: l.newTerminalSocket,
+ setTTY: func(tty *os.File) error {
+ process.ConsoleSocket = tty
+ return nil
+ },
+ setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
+ process.Stdin = stdin
+ process.Stdout = stdout
+ process.Stderr = stderr
+ return nil
+ },
+
+ processStart: func() error { return l.container.Run(process) },
+ processWait: func() (*os.ProcessState, error) {
+ return process.Wait()
+ },
+ }
+
+ return execHelper.run(ctx, tty, stream)
+
+}
+
+type waitResult struct {
+ ps *os.ProcessState
+ err error
+}
+
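+// handleExecWait waits on the exec'd process and delivers its process state
+// (or error) on ch.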
+func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
+ ps, err := process.Wait()
+ ch <- &waitResult{ps, err}
+}
+
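+// configureCapabilities sets the container's Linux capabilities: the legacy
+// capability set when running as root, otherwise only the bounding set
+// supplied by the driver for the task.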
+func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) {
+ switch command.User {
+ case "root":
+ // when running as root, use the legacy set of system capabilities, so
+ // that we do not break existing nomad clusters using this "feature"
+ legacyCaps := capabilities.LegacySupported().Slice(true)
+ cfg.Capabilities = &lconfigs.Capabilities{
+ Bounding: legacyCaps,
+ Permitted: legacyCaps,
+ Effective: legacyCaps,
+ Ambient: nil,
+ Inheritable: nil,
+ }
+ default:
+ // otherwise apply the plugin + task capability configuration
+ cfg.Capabilities = &lconfigs.Capabilities{
+ Bounding: command.Capabilities,
+ }
+ }
+}
+
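+// configureNamespaces returns the namespaces for the container: always a new
+// mount namespace, plus private PID and/or IPC namespaces when requested.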
+func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
+ namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
+ if pidMode == IsolationModePrivate {
+ namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
+ }
+ if ipcMode == IsolationModePrivate {
+ namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
+ }
+ return namespaces
+}
+
+// configureIsolation prepares the isolation primitives of the container.
+// The process runs in a container configured with the following:
+//
+// * the task directory as the chroot
+// * a dedicated mount namespace; the PID, IPC, user, UTS, and network namespaces
+//   are shared with the host unless the task configuration requests otherwise
+// * a small subset of devices (e.g. stdin/stdout/stderr, tty, shm, pts), defaulting
+//   to the same set of devices as Docker
+// * some special filesystems: `/proc`, `/sys`. Some care is taken to avoid the task
+//   escaping the chroot or setting malicious values through them
+func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
+ defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
+
+ // set the new root directory for the container
+ cfg.Rootfs = command.TaskDir
+
+ // disable pivot_root if set in the driver's configuration
+ cfg.NoPivotRoot = command.NoPivotRoot
+
+ // set up default namespaces as configured
+ cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)
+
+ if command.NetworkIsolation != nil {
+ cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
+ Type: lconfigs.NEWNET,
+ Path: command.NetworkIsolation.Path,
+ })
+ }
+
+ // paths to mask using a bind mount to /dev/null to prevent reading
+ cfg.MaskPaths = []string{
+ "/proc/kcore",
+ "/sys/firmware",
+ }
+
+ // paths that should be remounted as readonly inside the container
+ cfg.ReadonlyPaths = []string{
+ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+ }
+
+ cfg.Devices = specconv.AllowedDevices
+ if len(command.Devices) > 0 {
+ devs, err := cmdDevices(command.Devices)
+ if err != nil {
+ return err
+ }
+ cfg.Devices = append(cfg.Devices, devs...)
+ }
+
+ cfg.Mounts = []*lconfigs.Mount{
+ {
+ Source: "tmpfs",
+ Destination: "/dev",
+ Device: "tmpfs",
+ Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
+ Data: "mode=755",
+ },
+ {
+ Source: "proc",
+ Destination: "/proc",
+ Device: "proc",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
+ Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
+ },
+ {
+ Device: "tmpfs",
+ Source: "shm",
+ Destination: "/dev/shm",
+ Data: "mode=1777,size=65536k",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "mqueue",
+ Destination: "/dev/mqueue",
+ Device: "mqueue",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "sysfs",
+ Destination: "/sys",
+ Device: "sysfs",
+ Flags: defaultMountFlags | syscall.MS_RDONLY,
+ },
+ }
+
+ if len(command.Mounts) > 0 {
+ cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
+ }
+
+ return nil
+}
+
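+// configureCgroups sets the container's cgroup path and applies memory and CPU
+// limits derived from the task's allocated resources.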
+func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
+ // If resource limits are not being enforced, just create the basic cgroups needed
+ if !command.ResourceLimits {
+ return cgutil.ConfigureBasicCgroups(cfg)
+ }
+
+ // set cgroups path
+ if cgutil.UseV2 {
+ // in v2, the cgroup must have been created by the client already,
+ // which breaks a lot of existing tests that run drivers without a client
+ if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" {
+ return errors.New("cgroup path must be set")
+ }
+ parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath)
+ cfg.Cgroups.Path = filepath.Join("/", parent, cgroup)
+ } else {
+ // in v1, the cgroup is created using /nomad, which is a bug because it
+ // does not respect the cgroup_parent client configuration
+ // (but makes testing easy)
+ id := uuid.Generate()
+ cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id)
+ }
+
+ if command.Resources == nil || command.Resources.NomadResources == nil {
+ return nil
+ }
+
+ // Total amount of memory the task is allowed to consume
+ res := command.Resources.NomadResources
+ memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB
+ if memHard <= 0 {
+ memHard = res.Memory.MemoryMB
+ memSoft = 0
+ }
+
+ if memHard > 0 {
+ cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024
+ cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024
+
+ // Disable swap to avoid issues on the machine
+ var memSwappiness uint64
+ cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
+ }
+
+ cpuShares := res.Cpu.CpuShares
+ if cpuShares < 2 {
+ return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
+ }
+
+ // Set the relative CPU shares for this cgroup, and convert for cgroupv2
+ cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
+ cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))
+
+ if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" {
+ cfg.Hooks = lconfigs.Hooks{
+ lconfigs.CreateRuntime: lconfigs.HookList{
+ newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath),
+ },
+ }
+ }
+
+ return nil
+}
+
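+// newLibcontainerConfig builds the libcontainer configuration for the task:
+// allowed devices, capabilities, oom_score_adj, isolation, and cgroups.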
+func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
+ cfg := &lconfigs.Config{
+ Cgroups: &lconfigs.Cgroup{
+ Resources: &lconfigs.Resources{
+ MemorySwappiness: nil,
+ },
+ },
+ Version: "1.0.0",
+ }
+
+ for _, device := range specconv.AllowedDevices {
+ cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule)
+ }
+
+ configureCapabilities(cfg, command)
+
+ // children should not inherit the Nomad agent's oom_score_adj value
+ oomScoreAdj := 0
+ cfg.OomScoreAdj = &oomScoreAdj
+
+ if err := configureIsolation(cfg, command); err != nil {
+ return nil, err
+ }
+
+ if err := configureCgroups(cfg, command); err != nil {
+ return nil, err
+ }
+
+ return cfg, nil
+}
+
+// cmdDevices converts a list of drivers.DeviceConfig into libcontainer devices.
+func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) {
+ if len(driverDevices) == 0 {
+ return nil, nil
+ }
+
+ r := make([]*devices.Device, len(driverDevices))
+
+ for i, d := range driverDevices {
+ ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
+ if err != nil {
+ return nil, fmt.Errorf("failed to make device out of %s: %v", d.HostPath, err)
+ }
+ ed.Path = d.TaskPath
+ r[i] = ed
+ }
+
+ return r, nil
+}
+
+var userMountToUnixMount = map[string]int{
+ // Empty string maps to `rprivate` for backwards compatibility in restored
+ // older tasks, where mount propagation will not be present.
+ "": unix.MS_PRIVATE | unix.MS_REC, // rprivate
+ structs.VolumeMountPropagationPrivate: unix.MS_PRIVATE | unix.MS_REC, // rprivate
+ structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC, // rslave
+ structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
+}
+
+// cmdMounts converts a list of drivers.MountConfig into libcontainer mounts.
+func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
+ if len(mounts) == 0 {
+ return nil
+ }
+
+ r := make([]*lconfigs.Mount, len(mounts))
+
+ for i, m := range mounts {
+ flags := unix.MS_BIND
+ if m.Readonly {
+ flags |= unix.MS_RDONLY
+ }
+
+ r[i] = &lconfigs.Mount{
+ Source: m.HostPath,
+ Destination: m.TaskPath,
+ Device: "bind",
+ Flags: flags,
+ PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
+ }
+ }
+
+ return r
+}
+
+// lookupTaskBin finds the file `bin`, searching in order:
+// - taskDir/local
+// - taskDir
+// - each mount, in order listed in the jobspec
+// - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir
+//
+// Returns an absolute path inside the container that will get passed as arg[0]
+// to the launched process, and the absolute path to that binary as seen by the
+// host (these will be identical for binaries that don't come from mounts).
+//
+// See also executor.lookupBin for a version used by non-isolated drivers.
+func lookupTaskBin(command *ExecCommand) (string, string, error) {
+ taskDir := command.TaskDir
+ bin := command.Cmd
+
+ // Check in the local directory
+ localDir := filepath.Join(taskDir, allocdir.TaskLocal)
+ taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+
+ // Check at the root of the task's directory
+ taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+
+ // Check in our mounts
+ for _, mount := range command.Mounts {
+ taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+ }
+
+ // If there's a / in the binary's path, we can't fall back to a PATH search
+ if strings.Contains(bin, "/") {
+ return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
+ }
+
+ // look for a file using a PATH-style lookup inside the directory
+ // root. Similar to the stdlib's exec.LookPath except:
+ // - uses a restricted lookup PATH rather than the agent process's PATH env var.
+ // - does not require that the file is already executable (this will be ensured
+ // by the caller)
+ // - does not prevent using relative path as added to exec.LookPath in go1.19
+ // (this gets fixed-up in the caller)
+
+ // This is a fake PATH so that we're not using the agent's PATH
+ restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"}
+
+ for _, dir := range restrictedPaths {
+ pathDir := filepath.Join(command.TaskDir, dir)
+ taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin)
+ if err == nil {
+ return taskPath, hostPath, nil
+ }
+ }
+
+ return "", "", fmt.Errorf("file %s not found under path", bin)
+}
+
+// getPathInTaskDir searches for the binary in the task directory and nested
+// search directory. It returns the absolute path rooted inside the container
+// and the absolute path on the host.
+func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) {
+
+ hostPath := filepath.Join(searchDir, bin)
+ err := filepathIsRegular(hostPath)
+ if err != nil {
+ return "", "", err
+ }
+
+ // Find the path relative to the task directory
+ rel, err := filepath.Rel(taskDir, hostPath)
+ if rel == "" || err != nil {
+ return "", "", fmt.Errorf(
+ "failed to determine relative path base=%q target=%q: %v",
+ taskDir, hostPath, err)
+ }
+
+ // Turn relative-to-taskdir path into re-rooted absolute path to avoid
+ // libcontainer trying to resolve the binary using $PATH.
+ // Do *not* use filepath.Join as it will translate ".."s returned by
+ // filepath.Rel. Prepending "/" will cause the path to be rooted in the
+ // chroot which is the desired behavior.
+ return filepath.Clean("/" + rel), hostPath, nil
+}
+
+// getPathInMount searches for the binary under the mount's host path, taking into
+// account that the bin path is rooted in the mount's task path and not its
+// host path. It returns the absolute path rooted inside the container and the
+// absolute path on the host.
+func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) {
+
+ // Find the path relative to the mount point in the task so that we can
+ // trim off any shared prefix when we search on the host path
+ mountRel, err := filepath.Rel(mountTaskPath, bin)
+ if mountRel == "" || err != nil {
+ return "", "", fmt.Errorf("path was not relative to the mount task path")
+ }
+
+ hostPath := filepath.Join(mountHostPath, mountRel)
+
+ err = filepathIsRegular(hostPath)
+ if err != nil {
+ return "", "", err
+ }
+
+ // Turn relative-to-taskdir path into re-rooted absolute path to avoid
+ // libcontainer trying to resolve the binary using $PATH.
+ // Do *not* use filepath.Join as it will translate ".."s returned by
+ // filepath.Rel. Prepending "/" will cause the path to be rooted in the
+ // chroot which is the desired behavior.
+ return filepath.Clean("/" + bin), hostPath, nil
+}
+
+// filepathIsRegular verifies that a filepath is a regular file (i.e. not a
+// directory, socket, device, etc.)
+func filepathIsRegular(path string) error {
+ f, err := os.Stat(path)
+ if err != nil {
+ return err
+ }
+ if !f.Mode().Type().IsRegular() {
+ return fmt.Errorf("path was not a regular file")
+ }
+ return nil
+}
+
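+// newSetCPUSetCgroupHook returns a CreateRuntime hook that places the
+// container's init process into the given cpuset cgroup.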
+func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook {
+ return lconfigs.NewFunctionHook(func(state *specs.State) error {
+ return cgroups.WriteCgroupProc(cgroupPath, state.Pid)
+ })
+}