exec.go#Action
// Action checks the arguments with checkArgs, calls revisePidFile, and then
// runs execProcess. On success the program exits with the status that
// execProcess returned; on failure the error is wrapped and returned.
Action: func(context *cli.Context) error {
	err := checkArgs(context, 1, minArgs)
	if err == nil {
		err = revisePidFile(context)
	}
	if err != nil {
		return err
	}

	status, execErr := execProcess(context)
	if execErr != nil {
		return fmt.Errorf("exec failed: %v", execErr)
	}
	// Propagate the status returned by execProcess as our own exit code.
	os.Exit(status)
	return nil // not reached: os.Exit does not return
},
SkipArgReorder: true,
1) exec.go#execProcess
func execProcess(context *cli.Context) (int, error) {container, err := getContainer(context)if err != nil {return -1, err}status, err := container.Status()if err != nil {return -1, err}if status == libcontainer.Stopped {return -1, fmt.Errorf("cannot exec a container that has stopped")}path := context.String("process")if path == "" && len(context.Args()) == 1 {return -1, fmt.Errorf("process args cannot be empty")}detach := context.Bool("detach")state, err := container.State()if err != nil {return -1, err}bundle := utils.SearchLabels(state.Config.Labels, "bundle")p, err := getProcess(context, bundle)if err != nil {return -1, err}r := &runner{enableSubreaper: false,shouldDestroy: false,container: container,consoleSocket: context.String("console-socket"),detach: detach,pidFile: context.String("pid-file"),action: CT_ACT_RUN,init: false,}return r.run(p)}
1.1) exec.go#getProcess
func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {if path := context.String("process"); path != "" {f, err := os.Open(path)if err != nil {return nil, err}defer f.Close()var p specs.Processif err := json.NewDecoder(f).Decode(&p); err != nil {return nil, err}return &p, validateProcessSpec(&p)}// process via cli flagsif err := os.Chdir(bundle); err != nil {return nil, err}spec, err := loadSpec(specConfig)if err != nil {return nil, err}p := spec.Processp.Args = context.Args()[1:]// override the cwd, if passedif context.String("cwd") != "" {p.Cwd = context.String("cwd")}if ap := context.String("apparmor"); ap != "" {p.ApparmorProfile = ap}if l := context.String("process-label"); l != "" {p.SelinuxLabel = l}if caps := context.StringSlice("cap"); len(caps) > 0 {for _, c := range caps {p.Capabilities.Bounding = append(p.Capabilities.Bounding, c)p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, c)p.Capabilities.Effective = append(p.Capabilities.Effective, c)p.Capabilities.Permitted = append(p.Capabilities.Permitted, c)p.Capabilities.Ambient = append(p.Capabilities.Ambient, c)}}// append the passed env variablesp.Env = append(p.Env, context.StringSlice("env")...)// set the ttyif context.IsSet("tty") {p.Terminal = context.Bool("tty")}if context.IsSet("no-new-privs") {p.NoNewPrivileges = context.Bool("no-new-privs")}// override the user, if passedif context.String("user") != "" {u := strings.SplitN(context.String("user"), ":", 2)if len(u) > 1 {gid, err := strconv.Atoi(u[1])if err != nil {return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)}p.User.GID = uint32(gid)}uid, err := strconv.Atoi(u[0])if err != nil {return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)}p.User.UID = uint32(uid)}for _, gid := range context.Int64Slice("additional-gids") {if gid < 0 {return nil, fmt.Errorf("additional-gids must be a positive number %d", gid)}p.User.AdditionalGids = append(p.User.AdditionalGids, 
uint32(gid))}return p, nil}
1.2) libcontainer/container_linux.go#linuxContainer.newParentProcess
如果不是 init 进程(即 `p.Init` 为 false),则返回由 `newSetnsProcess` 创建的 `setnsProcess`。
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {parentPipe, childPipe, err := utils.NewSockPair("init")if err != nil {return nil, newSystemErrorWithCause(err, "creating new init pipe")}cmd, err := c.commandTemplate(p, childPipe)if err != nil {return nil, newSystemErrorWithCause(err, "creating new command template")}if !p.Init {return c.newSetnsProcess(p, cmd, parentPipe, childPipe)}// We only set up fifoFd if we're not doing a `runc exec`. The historic// reason for this is that previously we would pass a dirfd that allowed// for container rootfs escape (and not doing it in `runc exec` avoided// that problem), but we no longer do that. However, there's no need to do// this for `runc exec` so we just keep it this way to be safe.if err := c.includeExecFifo(cmd); err != nil {return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")}return c.newInitProcess(p, cmd, parentPipe, childPipe)}
1.2.1) libcontainer/container_linux.go#linuxContainer.newSetnsProcess
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))state, err := c.currentState()if err != nil {return nil, newSystemErrorWithCause(err, "getting container's current state")}// for setns process, we don't have to set cloneflags as the process namespaces// will only be set via setns syscalldata, err := c.bootstrapData(0, state.NamespacePaths)if err != nil {return nil, err}return &setnsProcess{cmd: cmd,cgroupPaths: c.cgroupManager.GetPaths(),rootlessCgroups: c.config.RootlessCgroups,intelRdtPath: state.IntelRdtPath,childPipe: childPipe,parentPipe: parentPipe,config: c.newInitConfig(p),process: p,bootstrapData: data,}, nil}
1.3) libcontainer/process_linux.go#setnsProcess.start
// start launches the setns helper command and drives it until the process is
// configured inside the container.
//
// In order, it: starts the command and closes the child end of the pipe;
// streams the bootstrap data (if any) over the parent pipe; runs execSetns;
// adds the resulting pid to the cgroup paths and, when the path exists, to
// the Intel RDT tasks; applies the rlimits; writes the init config as JSON
// to the pipe; and finally reads sync messages from the child. The parent end
// of the pipe is always closed when start returns.
func (p *setnsProcess) start() (err error) {
	defer p.parentPipe.Close()
	err = p.cmd.Start()
	// The child end is closed in this process whether or not Start succeeded.
	p.childPipe.Close()
	if err != nil {
		return newSystemErrorWithCause(err, "starting setns process")
	}
	if p.bootstrapData != nil {
		if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
			return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
		}
	}
	if err = p.execSetns(); err != nil {
		return newSystemErrorWithCause(err, "executing setns process")
	}
	if len(p.cgroupPaths) > 0 {
		// A failure to enter the cgroups is ignored when rootlessCgroups is set.
		if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
			return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
		}
	}
	if p.intelRdtPath != "" {
		// if Intel RDT "resource control" filesystem path exists
		_, err := os.Stat(p.intelRdtPath)
		if err == nil {
			if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
				return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
			}
		}
	}
	// set rlimits, this has to be done here because we lose permissions
	// to raise the limits once we enter a user-namespace
	if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
		return newSystemErrorWithCause(err, "setting rlimits for process")
	}
	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
		return newSystemErrorWithCause(err, "writing config to pipe")
	}

	// Any sync message from the child is unexpected here: procReady and
	// procHooks panic, and every other type is reported as an invalid payload.
	ierr := parseSync(p.parentPipe, func(sync *syncT) error {
		switch sync.Type {
		case procReady:
			// This shouldn't happen.
			panic("unexpected procReady in setns")
		case procHooks:
			// This shouldn't happen.
			panic("unexpected procHooks in setns")
		default:
			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
		}
	})

	// Shut down the write side of the pipe before looking at ierr.
	if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "calling shutdown on init pipe")
	}
	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		p.wait()
		return ierr
	}
	return nil
}
1.3.1) libcontainer/process_linux.go#setnsProcess.execSetns
// execSetns runs the process that executes C code to perform the setns calls// because setns support requires the C process to fork off a child and perform the setns// before the go runtime boots, we wait on the process to die and receive the child's pid// over the provided pipe.func (p *setnsProcess) execSetns() error {status, err := p.cmd.Process.Wait()if err != nil {p.cmd.Wait()return newSystemErrorWithCause(err, "waiting on setns process to finish")}if !status.Success() {p.cmd.Wait()return newSystemError(&exec.ExitError{ProcessState: status})}var pid *pidif err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {p.cmd.Wait()return newSystemErrorWithCause(err, "reading pid from init pipe")}// Clean up the zombie parent processfirstChildProcess, err := os.FindProcess(pid.PidFirstChild)if err != nil {return err}// Ignore the error in case the child has already been reaped for any reason_, _ = firstChildProcess.Wait()process, err := os.FindProcess(pid.Pid)if err != nil {return err}p.cmd.Process = processp.process.ops = preturn nil}
2) 【child】init.go
import (
	"os"
	"runtime"

	"github.com/opencontainers/runc/libcontainer"
	// ********************************** NOTICE ********************************** //
	// Blank import: the nsenter package is pulled in only for its side effects.
	_ "github.com/opencontainers/runc/libcontainer/nsenter"
	// ********************************** NOTICE ********************************** //
	"github.com/urfave/cli"
)

// init runs before main. When the program is invoked with "init" as its first
// argument, it restricts the Go runtime to a single processor via GOMAXPROCS
// and locks the calling goroutine to its OS thread.
func init() {
	if len(os.Args) > 1 && os.Args[1] == "init" {
		runtime.GOMAXPROCS(1)
		runtime.LockOSThread()
	}
}

// initCommand is the "init" subcommand. Its action creates a libcontainer
// factory and calls StartInitialization: an error terminates the program with
// exit status 1, and a nil return is treated as a failure and panics.
var initCommand = cli.Command{
	Name:  "init",
	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
	Action: func(context *cli.Context) error {
		// NOTE(review): the error from libcontainer.New is discarded here;
		// presumably it cannot fail for an empty root - confirm.
		factory, _ := libcontainer.New("")
		if err := factory.StartInitialization(); err != nil {
			// as the error is sent back to the parent there is no need to log
			// or write it to stderr because the parent process will handle this
			os.Exit(1)
		}
		// StartInitialization is not expected to return without an error.
		panic("libcontainer: container init failed to exec")
	},
}
2.1) libcontainer/nsenter/nsenter_gccgo.go
// +build linux,gccgo

// Package nsenter, in this gccgo build, defines a C constructor named init
// that calls nsexec(). The constructor attribute asks the C toolchain to run
// it at program start-up.
package nsenter

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"

// AlwaysFalse is here to stay false
// (and be exported so the compiler doesn't optimize out its reference)
var AlwaysFalse bool

// init never actually calls C.init, because AlwaysFalse stays false. The
// guarded call exists only so that the C function is referenced from Go.
func init() {
	if AlwaysFalse {
		// by referencing this C init() in a noop test, it will ensure the compiler
		// links in the C function.
		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
		C.init()
	}
}
2.1.1) libcontainer/nsenter/nsenter.c#nsexec(Cgo无法使用交叉编译)
void nsexec(void){int pipenum;jmp_buf env;int sync_child_pipe[2], sync_grandchild_pipe[2];struct nlconfig_t config = { 0 };/** If we don't have an init pipe, just return to the go routine.* We'll only get an init pipe for start or exec.*/pipenum = initpipe();if (pipenum == -1)return;/* Parse all of the netlink configuration. */nl_parse(pipenum, &config);/* Set oom_score_adj. This has to be done before !dumpable because* /proc/self/oom_score_adj is not writeable unless you're an privileged* user (if !dumpable is set). All children inherit their parent's* oom_score_adj value on fork(2) so this will always be propagated* properly.*/update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);/** Make the process non-dumpable, to avoid various race conditions that* could cause processes in namespaces we're joining to access host* resources (or potentially execute code).** However, if the number of namespaces we are joining is 0, we are not* going to be switching to a different security context. Thus setting* ourselves to be non-dumpable only breaks things (like rootless* containers), which is the recommendation from the kernel folks.*/if (config.namespaces) {if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)bail("failed to set process as non-dumpable");}/* Pipe so we can tell the child when we've finished setting up. */if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)bail("failed to setup sync pipe between parent and child");/** We need a new socketpair to sync with grandchild so we don't have* race condition with child.*/if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)bail("failed to setup sync pipe between parent and grandchild");/* TODO: Currently we aren't dealing with child deaths properly. *//** Okay, so this is quite annoying.** In order for this unsharing code to be more extensible we need to split* up unshare(CLONE_NEWUSER) and clone() in various ways. 
The ideal case* would be if we did clone(CLONE_NEWUSER) and the other namespaces* separately, but because of SELinux issues we cannot really do that. But* we cannot just dump the namespace flags into clone(...) because several* usecases (such as rootless containers) require more granularity around* the namespace setup. In addition, some older kernels had issues where* CLONE_NEWUSER wasn't handled before other namespaces (but we cannot* handle this while also dealing with SELinux so we choose SELinux support* over broken kernel support).** However, if we unshare(2) the user namespace *before* we clone(2), then* all hell breaks loose.** The parent no longer has permissions to do many things (unshare(2) drops* all capabilities in your old namespace), and the container cannot be set* up to have more than one {uid,gid} mapping. This is obviously less than* ideal. In order to fix this, we have to first clone(2) and then unshare.** Unfortunately, it's not as simple as that. We have to fork to enter the* PID namespace (the PID namespace only applies to children). Since we'll* have to double-fork, this clone_parent() call won't be able to get the* PID of the _actual_ init process (without doing more synchronisation than* I can deal with at the moment). So we'll just get the parent to send it* for us, the only job of this process is to update* /proc/pid/{setgroups,uid_map,gid_map}.** And as a result of the above, we also need to setns(2) in the first child* because if we join a PID namespace in the topmost parent then our child* will be in that namespace (and it will not be able to give us a PID value* that makes sense without resorting to sending things with cmsg).** This also deals with an older issue caused by dumping cloneflags into* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so* we have to unshare(2) before clone(2) in order to do this. 
This was fixed* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're* aware, the last mainline kernel which had this bug was Linux 3.12.* However, we cannot comment on which kernels the broken patch was* backported to.** -- Aleksa "what has my life come to?" Sarai*/switch (setjmp(env)) {/** Stage 0: We're in the parent. Our job is just to create a new child* (stage 1: JUMP_CHILD) process and write its uid_map and* gid_map. That process will go on to create a new process, then* it will send us its PID which we will send to the bootstrap* process.*/case JUMP_PARENT:{int len;pid_t child, first_child = -1;bool ready = false;/* For debugging. */prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);/* Start the process of getting a container. */child = clone_parent(&env, JUMP_CHILD);if (child < 0)bail("unable to fork: child_func");/** State machine for synchronisation with the children.** Father only return when both child and grandchild are* ready, so we can receive all possible error codes* generated by children.*/while (!ready) {enum sync_t s;int ret;syncfd = sync_child_pipe[1];close(sync_child_pipe[0]);if (read(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with child: next state");switch (s) {case SYNC_ERR:/* We have to mirror the error code of the child. */if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))bail("failed to sync with child: read(error code)");exit(ret);case SYNC_USERMAP_PLS:/** Enable setgroups(2) if we've been asked to. But we also* have to explicitly disable setgroups(2) if we're* creating a rootless container for single-entry mapping.* i.e. config.is_setgroup == false.* (this is required since Linux 3.19).** For rootless multi-entry mapping, config.is_setgroup shall be true and* newuidmap/newgidmap shall be used.*/if (config.is_rootless_euid && !config.is_setgroup)update_setgroups(child, SETGROUPS_DENY);/* Set up mappings. 
*/update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);s = SYNC_USERMAP_ACK;if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(child, SIGKILL);bail("failed to sync with child: write(SYNC_USERMAP_ACK)");}break;case SYNC_RECVPID_PLS:{first_child = child;/* Get the init_func pid. */if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {kill(first_child, SIGKILL);bail("failed to sync with child: read(childpid)");}/* Send ACK. */s = SYNC_RECVPID_ACK;if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(first_child, SIGKILL);kill(child, SIGKILL);bail("failed to sync with child: write(SYNC_RECVPID_ACK)");}/* Send the init_func pid back to our parent.** Send the init_func pid and the pid of the first child back to our parent.* We need to send both back because we can't reap the first child we created (CLONE_PARENT).* It becomes the responsibility of our parent to reap the first child.*/len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);if (len < 0) {kill(child, SIGKILL);bail("unable to generate JSON for child pid");}}break;case SYNC_CHILD_READY:ready = true;break;default:bail("unexpected sync value: %u", s);}}/* Now sync with grandchild. */ready = false;while (!ready) {enum sync_t s;int ret;syncfd = sync_grandchild_pipe[1];close(sync_grandchild_pipe[0]);s = SYNC_GRANDCHILD;if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(child, SIGKILL);bail("failed to sync with child: write(SYNC_GRANDCHILD)");}if (read(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with child: next state");switch (s) {case SYNC_ERR:/* We have to mirror the error code of the child. */if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))bail("failed to sync with child: read(error code)");exit(ret);case SYNC_CHILD_READY:ready = true;break;default:bail("unexpected sync value: %u", s);}}exit(0);}/** Stage 1: We're in the first child process. 
Our job is to join any* provided namespaces in the netlink payload and unshare all* of the requested namespaces. If we've been asked to* CLONE_NEWUSER, we will ask our parent (stage 0) to set up* our user mappings for us. Then, we create a new child* (stage 2: JUMP_INIT) for PID namespace. We then send the* child's PID to our parent (stage 0).*/case JUMP_CHILD:{pid_t child;enum sync_t s;/* We're in a child and thus need to tell the parent if we die. */syncfd = sync_child_pipe[0];close(sync_child_pipe[1]);/* For debugging. */prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);/** We need to setns first. We cannot do this earlier (in stage 0)* because of the fact that we forked to get here (the PID of* [stage 2: JUMP_INIT]) would be meaningless). We could send it* using cmsg(3) but that's just annoying.*/if (config.namespaces)join_namespaces(config.namespaces);/** Deal with user namespaces first. They are quite special, as they* affect our ability to unshare other namespaces and are used as* context for privilege checks.** We don't unshare all namespaces in one go. The reason for this* is that, while the kernel documentation may claim otherwise,* there are certain cases where unsharing all namespaces at once* will result in namespace objects being owned incorrectly.* Ideally we should just fix these kernel bugs, but it's better to* be safe than sorry, and fix them separately.** A specific case of this is that the SELinux label of the* internal kern-mount that mqueue uses will be incorrect if the* UTS namespace is cloned before the USER namespace is mapped.* I've also heard of similar problems with the network namespace* in some scenarios. This also mirrors how LXC deals with this* problem.*/if (config.cloneflags & CLONE_NEWUSER) {if (unshare(CLONE_NEWUSER) < 0)bail("failed to unshare user namespace");config.cloneflags &= ~CLONE_NEWUSER;/** We don't have the privileges to do any mapping here (see the* clone_parent rant). 
So signal our parent to hook us up.*//* Switching is only necessary if we joined namespaces. */if (config.namespaces) {if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)bail("failed to set process as dumpable");}s = SYNC_USERMAP_PLS;if (write(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");/* ... wait for mapping ... */if (read(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");if (s != SYNC_USERMAP_ACK)bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);/* Switching is only necessary if we joined namespaces. */if (config.namespaces) {if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)bail("failed to set process as dumpable");}/* Become root in the namespace proper. */if (setresuid(0, 0, 0) < 0)bail("failed to become root in user namespace");}/** Unshare all of the namespaces. Now, it should be noted that this* ordering might break in the future (especially with rootless* containers). But for now, it's not possible to split this into* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.** Note that we don't merge this with clone() because there were* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)* was broken, so we'll just do it the long way anyway.*/if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)bail("failed to unshare namespaces");/** TODO: What about non-namespace clone flags that we're dropping here?** We fork again because of PID namespace, setns(2) or unshare(2) don't* change the PID namespace of the calling process, because doing so* would change the caller's idea of its own PID (as reported by getpid()),* which would break many applications and libraries, so we must fork* to actually enter the new PID namespace.*/child = clone_parent(&env, JUMP_INIT);if (child < 0)bail("unable to fork: init_func");/* Send the child to our parent, which knows what it's doing. 
*/s = SYNC_RECVPID_PLS;if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(child, SIGKILL);bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");}if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {kill(child, SIGKILL);bail("failed to sync with parent: write(childpid)");}/* ... wait for parent to get the pid ... */if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(child, SIGKILL);bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");}if (s != SYNC_RECVPID_ACK) {kill(child, SIGKILL);bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);}s = SYNC_CHILD_READY;if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {kill(child, SIGKILL);bail("failed to sync with parent: write(SYNC_CHILD_READY)");}/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */exit(0);}/** Stage 2: We're the final child process, and the only process that will* actually return to the Go runtime. Our job is to just do the* final cleanup steps and then return to the Go runtime to allow* init_linux.go to run.*/case JUMP_INIT:{/** We're inside the child now, having jumped from the* start_child() code after forking in the parent.*/enum sync_t s;/* We're in a child and thus need to tell the parent if we die. */syncfd = sync_grandchild_pipe[0];close(sync_grandchild_pipe[1]);close(sync_child_pipe[0]);close(sync_child_pipe[1]);/* For debugging. */prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);if (read(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with parent: read(SYNC_GRANDCHILD)");if (s != SYNC_GRANDCHILD)bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);if (setsid() < 0)bail("setsid failed");if (setuid(0) < 0)bail("setuid failed");if (setgid(0) < 0)bail("setgid failed");if (!config.is_rootless_euid && config.is_setgroup) {if (setgroups(0, NULL) < 0)bail("setgroups failed");}/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... 
*/if (config.cloneflags & CLONE_NEWCGROUP) {uint8_t value;if (read(pipenum, &value, sizeof(value)) != sizeof(value))bail("read synchronisation value failed");if (value == CREATECGROUPNS) {if (unshare(CLONE_NEWCGROUP) < 0)bail("failed to unshare cgroup namespace");} elsebail("received unknown synchronisation value");}s = SYNC_CHILD_READY;if (write(syncfd, &s, sizeof(s)) != sizeof(s))bail("failed to sync with patent: write(SYNC_CHILD_READY)");/* Close sync pipes. */close(sync_grandchild_pipe[0]);/* Free netlink data. */nl_free(&config);/* Finish executing, let the Go runtime take over. */return;}default:bail("unexpected jump value");}/* Should never be reached. */bail("should never be reached");}
2.2) libcontainer/setns_init_linux.go#linuxSetnsInit.Init
// Init prepares the calling process and then replaces it with the configured
// command via system.Execv.
//
// With the goroutine locked to its OS thread, it performs, in order: joining
// a new session keyring (unless NoNewKeyring is set), console setup (when
// CreateConsole is set), PR_SET_NO_NEW_PRIVS (when NoNewPrivileges is set),
// the process label, seccomp (early when NoNewPrivileges is unset),
// finalizeNamespace, the AppArmor profile, seccomp (late when NoNewPrivileges
// is set), and finally the exec. The first failing step's error is returned.
func (l *linuxSetnsInit) Init() error {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !l.config.Config.NoNewKeyring {
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
			}
		}
	}
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	// Reset the process label to the empty string when Init returns.
	defer label.SetProcessLabel("")
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	// NOTE(review): Args[0] is indexed without a length check; presumably the
	// config is validated to carry at least one argument - confirm.
	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
