exec.go#Action
1) exec.go#execProcess
2) 【child】init.go
- 2.1) libcontainer/nsenter/nsenter_gccgo.go
  - 2.1.1) libcontainer/nsenter/nsenter.c#nsexec（Cgo无法使用交叉编译）
- 2.2) libcontainer/setns_init_linux.go#linuxSetnsInit.Init

exec.go#Action

    // Action is the handler for the exec command: it validates the
    // command line, runs the requested process through execProcess and, on
    // success, terminates runc with the status execProcess returned.
    Action: func(context *cli.Context) error {
        // NOTE(review): presumably enforces a minimum argument count of 1 —
        // confirm against checkArgs/minArgs.
        if err := checkArgs(context, 1, minArgs); err != nil {
            return err
        }
        if err := revisePidFile(context); err != nil {
            return err
        }
        status, err := execProcess(context)
        if err == nil {
            // Success: exit directly with the status reported by execProcess
            // instead of returning to the cli framework.
            os.Exit(status)
        }
        // Only reached when execProcess failed.
        return fmt.Errorf("exec failed: %v", err)
    },
    SkipArgReorder: true,

1) exec.go#execProcess

// execProcess starts an additional (non-init) process in the container
// selected by context and returns whatever runner.run reports for it.
//
// It returns -1 and an error when the container cannot be loaded, has
// stopped, when neither a process file nor command arguments were given, or
// when the process specification cannot be built.
func execProcess(context *cli.Context) (int, error) {
    ctr, err := getContainer(context)
    if err != nil {
        return -1, err
    }

    ctrStatus, err := ctr.Status()
    switch {
    case err != nil:
        return -1, err
    case ctrStatus == libcontainer.Stopped:
        return -1, fmt.Errorf("cannot exec a container that has stopped")
    }

    // Without a process file, the command must follow the single argument
    // that is already present.
    if context.String("process") == "" && len(context.Args()) == 1 {
        return -1, fmt.Errorf("process args cannot be empty")
    }

    ctrState, err := ctr.State()
    if err != nil {
        return -1, err
    }

    // The bundle directory is recorded in the container's "bundle" label.
    bundleDir := utils.SearchLabels(ctrState.Config.Labels, "bundle")
    proc, err := getProcess(context, bundleDir)
    if err != nil {
        return -1, err
    }

    execRunner := &runner{
        enableSubreaper: false,
        shouldDestroy:   false,
        container:       ctr,
        consoleSocket:   context.String("console-socket"),
        detach:          context.Bool("detach"),
        pidFile:         context.String("pid-file"),
        action:          CT_ACT_RUN,
        init:            false,
    }
    return execRunner.run(proc)
}

1.1) exec.go#getProcess

// maxExecID is the largest uid/gid value that fits in the uint32 fields of
// the process spec.
const maxExecID int64 = 1<<32 - 1

// parseExecID parses s as a base-10 uid or gid and rejects values that are
// negative or do not fit in a uint32, instead of letting them wrap around.
func parseExecID(s string) (uint32, error) {
    id, err := strconv.ParseInt(s, 10, 64)
    if err != nil {
        return 0, err
    }
    if id < 0 || id > maxExecID {
        return 0, fmt.Errorf("%d is outside the range 0-%d", id, maxExecID)
    }
    return uint32(id), nil
}

// getProcess builds the process specification for an exec.
//
// If the "process" flag names a file, the specification is decoded from that
// JSON file and checked with validateProcessSpec; a specification that fails
// validation is never returned. Otherwise the working directory is changed to
// bundle, the bundle's spec is loaded, and its Process is adjusted from the
// command-line flags (args, cwd, apparmor, process-label, cap, env, tty,
// no-new-privs, user, additional-gids).
//
// The "user" flag has the form "uid" or "uid:gid". uid, gid and additional
// gids must be within 0..4294967295; anything else is reported as an error.
func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
    if path := context.String("process"); path != "" {
        f, err := os.Open(path)
        if err != nil {
            return nil, err
        }
        defer f.Close()
        var p specs.Process
        if err := json.NewDecoder(f).Decode(&p); err != nil {
            return nil, err
        }
        if err := validateProcessSpec(&p); err != nil {
            return nil, err
        }
        return &p, nil
    }
    // process via cli flags
    if err := os.Chdir(bundle); err != nil {
        return nil, err
    }
    spec, err := loadSpec(specConfig)
    if err != nil {
        return nil, err
    }
    p := spec.Process
    // The first argument is skipped; the remainder is the command to run.
    p.Args = context.Args()[1:]
    // override the cwd, if passed
    if cwd := context.String("cwd"); cwd != "" {
        p.Cwd = cwd
    }
    if ap := context.String("apparmor"); ap != "" {
        p.ApparmorProfile = ap
    }
    if l := context.String("process-label"); l != "" {
        p.SelinuxLabel = l
    }
    if caps := context.StringSlice("cap"); len(caps) > 0 {
        // Every requested capability is added to all five capability sets.
        p.Capabilities.Bounding = append(p.Capabilities.Bounding, caps...)
        p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, caps...)
        p.Capabilities.Effective = append(p.Capabilities.Effective, caps...)
        p.Capabilities.Permitted = append(p.Capabilities.Permitted, caps...)
        p.Capabilities.Ambient = append(p.Capabilities.Ambient, caps...)
    }
    // append the passed env variables
    p.Env = append(p.Env, context.StringSlice("env")...)
    // set the tty
    if context.IsSet("tty") {
        p.Terminal = context.Bool("tty")
    }
    if context.IsSet("no-new-privs") {
        p.NoNewPrivileges = context.Bool("no-new-privs")
    }
    // override the user, if passed
    if user := context.String("user"); user != "" {
        u := strings.SplitN(user, ":", 2)
        if len(u) > 1 {
            gid, err := parseExecID(u[1])
            if err != nil {
                return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
            }
            p.User.GID = gid
        }
        uid, err := parseExecID(u[0])
        if err != nil {
            return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
        }
        p.User.UID = uid
    }
    for _, gid := range context.Int64Slice("additional-gids") {
        if gid < 0 {
            return nil, fmt.Errorf("additional-gids must be a positive number %d", gid)
        }
        // Guard the uint32 conversion below against silent truncation.
        if gid > maxExecID {
            return nil, fmt.Errorf("additional-gids must not exceed %d, got %d", maxExecID, gid)
        }
        p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid))
    }
    return p, nil
}

1.2) libcontainer/container_linux.go#linuxContainer.newParentProcess

如果要启动的不是 init 进程（即 p.Init 为 false），则通过 newSetnsProcess 返回 *setnsProcess。

// newParentProcess creates the "init" socket pair and the command template
// for p, then wraps them in the matching parent-process implementation:
// the result of newInitProcess when p.Init is set, otherwise the result of
// newSetnsProcess.
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
    parentSock, childSock, err := utils.NewSockPair("init")
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }

    tmpl, err := c.commandTemplate(p, childSock)
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new command template")
    }

    if p.Init {
        // We only set up fifoFd if we're not doing a `runc exec`. The historic
        // reason for this is that previously we would pass a dirfd that allowed
        // for container rootfs escape (and not doing it in `runc exec` avoided
        // that problem), but we no longer do that. However, there's no need to do
        // this for `runc exec` so we just keep it this way to be safe.
        if fifoErr := c.includeExecFifo(tmpl); fifoErr != nil {
            return nil, newSystemErrorWithCause(fifoErr, "including execfifo in cmd.Exec setup")
        }
        return c.newInitProcess(p, tmpl, parentSock, childSock)
    }

    // Not an init process: this is the `runc exec` path.
    return c.newSetnsProcess(p, tmpl, parentSock, childSock)
}

1.2.1) libcontainer/container_linux.go#linuxContainer.newSetnsProcess

// newSetnsProcess assembles the setnsProcess used to run p inside the
// container's existing namespaces. It tags cmd's environment with the setns
// init type, reads the container's current state, and prepares bootstrap
// data that carries the namespace paths to join.
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
    initType := "_LIBCONTAINER_INITTYPE=" + string(initSetns)
    cmd.Env = append(cmd.Env, initType)

    current, err := c.currentState()
    if err != nil {
        return nil, newSystemErrorWithCause(err, "getting container's current state")
    }

    // No clone flags are passed (0): a setns process only joins namespaces
    // through the setns syscall, it never creates new ones.
    bootstrap, err := c.bootstrapData(0, current.NamespacePaths)
    if err != nil {
        return nil, err
    }

    proc := &setnsProcess{
        cmd:             cmd,
        cgroupPaths:     c.cgroupManager.GetPaths(),
        rootlessCgroups: c.config.RootlessCgroups,
        intelRdtPath:    current.IntelRdtPath,
        childPipe:       childPipe,
        parentPipe:      parentPipe,
        config:          c.newInitConfig(p),
        process:         p,
        bootstrapData:   bootstrap,
    }
    return proc, nil
}

1.3) libcontainer/process_linux.go#setnsProcess.start

// start launches the setns command and drives the parent side of its
// handshake: it sends the bootstrap data, waits for execSetns to hand over the
// final child, places that child into the cgroups and (if present) the Intel
// RDT group, applies rlimits, sends the init config, and then reads sync
// messages until the child side is done. The parent end of the pipe is always
// closed on return.
func (p *setnsProcess) start() (err error) {
    defer p.parentPipe.Close()
    err = p.cmd.Start()
    // The child end is closed in this process whether or not Start succeeded.
    p.childPipe.Close()
    if err != nil {
        return newSystemErrorWithCause(err, "starting setns process")
    }
    if p.bootstrapData != nil {
        if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
            return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
        }
    }
    // After this call p.pid() refers to the process reported over the pipe
    // (execSetns replaces p.cmd.Process).
    if err = p.execSetns(); err != nil {
        return newSystemErrorWithCause(err, "executing setns process")
    }
    if len(p.cgroupPaths) > 0 {
        // A failure to enter the cgroups is tolerated when rootlessCgroups is set.
        if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
            return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
        }
    }
    if p.intelRdtPath != "" {
        // if Intel RDT "resource control" filesystem path exists
        _, err := os.Stat(p.intelRdtPath)
        if err == nil {
            if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
                return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
            }
        }
    }
    // set rlimits, this has to be done here because we lose permissions
    // to raise the limits once we enter a user-namespace
    if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
        return newSystemErrorWithCause(err, "setting rlimits for process")
    }
    if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
        return newSystemErrorWithCause(err, "writing config to pipe")
    }
    // Every sync message type is rejected here: procReady and procHooks
    // panic, anything else is reported as an invalid payload.
    ierr := parseSync(p.parentPipe, func(sync *syncT) error {
        switch sync.Type {
        case procReady:
            // This shouldn't happen.
            panic("unexpected procReady in setns")
        case procHooks:
            // This shouldn't happen.
            panic("unexpected procHooks in setns")
        default:
            return newSystemError(fmt.Errorf("invalid JSON payload from child"))
        }
    })
    // Shut down the write side of the pipe before acting on ierr.
    if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
        return newSystemErrorWithCause(err, "calling shutdown on init pipe")
    }
    // Must be done after Shutdown so the child will exit and we can wait for it.
    if ierr != nil {
        p.wait()
        return ierr
    }
    return nil
}

1.3.1) libcontainer/process_linux.go#setnsProcess.execSetns

// execSetns waits for the process started by p.cmd to exit and then adopts
// the process whose pid it reported over the parent pipe.
//
// The setns calls are performed by C code that forks off a child before the
// Go runtime boots, so the started process is expected to exit on its own and
// to send the pids of its first child and of the final child as JSON over the
// pipe. The first child is reaped here, and p.cmd.Process is replaced with the
// final child so later operations act on it.
func (p *setnsProcess) execSetns() error {
    status, err := p.cmd.Process.Wait()
    if err != nil {
        p.cmd.Wait()
        return newSystemErrorWithCause(err, "waiting on setns process to finish")
    }
    if !status.Success() {
        p.cmd.Wait()
        return newSystemError(&exec.ExitError{ProcessState: status})
    }
    // Note: the variable pid shadows the type of the same name from here on.
    var pid *pid
    if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
        p.cmd.Wait()
        return newSystemErrorWithCause(err, "reading pid from init pipe")
    }
    // Clean up the zombie parent process
    firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
    if err != nil {
        return err
    }
    // Ignore the error in case the child has already been reaped for any reason
    _, _ = firstChildProcess.Wait()
    process, err := os.FindProcess(pid.Pid)
    if err != nil {
        return err
    }
    // From now on the command refers to the final child, and this
    // setnsProcess handles operations for the public Process.
    p.cmd.Process = process
    p.process.ops = p
    return nil
}

2) 【child】init.go


import (
    "os"
    "runtime"
    "github.com/opencontainers/runc/libcontainer"
  // ********************************** NOTICE ********************************** //
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
  // ********************************** NOTICE ********************************** //
    "github.com/urfave/cli"
)
// init pins the "init" subcommand to a single processor and locks the
// calling goroutine to its OS thread. Any other invocation is left untouched.
func init() {
    isInit := len(os.Args) > 1 && os.Args[1] == "init"
    if !isInit {
        return
    }
    runtime.GOMAXPROCS(1)
    runtime.LockOSThread()
}
// initCommand is the internal "init" subcommand. Its action runs
// StartInitialization on a libcontainer factory; it exits with status 1 if
// that returns an error and panics if it returns without one.
var initCommand = cli.Command{
    Name:  "init",
    Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
    Action: func(context *cli.Context) error {
        // NOTE(review): the error from libcontainer.New is discarded; if it can
        // fail, factory may be unusable on the next line — confirm.
        factory, _ := libcontainer.New("")
        if err := factory.StartInitialization(); err != nil {
            // as the error is sent back to the parent there is no need to log
            // or write it to stderr because the parent process will handle this
            os.Exit(1)
        }
        // Reaching this line means StartInitialization returned nil, which is
        // treated as a failure to exec.
        panic("libcontainer: container init failed to exec")
    },
}

2.1) libcontainer/nsenter/nsenter_gccgo.go

// +build linux,gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
    nsexec();
}
*/
import "C"
// AlwaysFalse is never set to true. It is exported so that the compiler
// cannot prove the branch in init dead and optimize out the reference to the
// C init function.
var AlwaysFalse bool
// init exists only to reference the C init() function; the call itself never
// runs because AlwaysFalse stays false.
func init() {
    if AlwaysFalse {
        // by referencing this C init() in a noop test, it will ensure the compiler
        // links in the C function.
        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
        C.init()
    }
}

2.1.1) libcontainer/nsenter/nsenter.c#nsexec（Cgo无法使用交叉编译）

/*
 * nsexec performs the namespace setup for a container process before the Go
 * runtime takes over.
 *
 * If there is no init pipe it returns immediately. Otherwise it parses the
 * netlink configuration from the pipe and runs a three-stage state machine
 * driven by setjmp()/clone_parent():
 *
 *   stage 0 (JUMP_PARENT): creates stage 1, writes its uid/gid mappings on
 *                          request, reports the pids of stage 1 and stage 2
 *                          over the init pipe, then exits.
 *   stage 1 (JUMP_CHILD):  joins/unshares the namespaces, creates stage 2,
 *                          sends its pid to stage 0, then exits.
 *   stage 2 (JUMP_INIT):   finishes the setup and is the only process that
 *                          returns to the caller.
 *
 * Every failure goes through bail().
 */
void nsexec(void)
{
    int pipenum;
    jmp_buf env;
    int sync_child_pipe[2], sync_grandchild_pipe[2];
    struct nlconfig_t config = { 0 };
    /*
     * If we don't have an init pipe, just return to the go routine.
     * We'll only get an init pipe for start or exec.
     */
    pipenum = initpipe();
    if (pipenum == -1)
        return;
    /* Parse all of the netlink configuration. */
    nl_parse(pipenum, &config);
    /* Set oom_score_adj. This has to be done before !dumpable because
     * /proc/self/oom_score_adj is not writeable unless you're a privileged
     * user (if !dumpable is set). All children inherit their parent's
     * oom_score_adj value on fork(2) so this will always be propagated
     * properly.
     */
    update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
    /*
     * Make the process non-dumpable, to avoid various race conditions that
     * could cause processes in namespaces we're joining to access host
     * resources (or potentially execute code).
     *
     * However, if the number of namespaces we are joining is 0, we are not
     * going to be switching to a different security context. Thus setting
     * ourselves to be non-dumpable only breaks things (like rootless
     * containers), which is the recommendation from the kernel folks.
     */
    if (config.namespaces) {
        if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
            bail("failed to set process as non-dumpable");
    }
    /* Pipe so we can tell the child when we've finished setting up. */
    if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
        bail("failed to setup sync pipe between parent and child");
    /*
     * We need a new socketpair to sync with grandchild so we don't have
     * race condition with child.
     */
    if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
        bail("failed to setup sync pipe between parent and grandchild");
    /* TODO: Currently we aren't dealing with child deaths properly. */
    /*
     * Okay, so this is quite annoying.
     *
     * In order for this unsharing code to be more extensible we need to split
     * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
     * would be if we did clone(CLONE_NEWUSER) and the other namespaces
     * separately, but because of SELinux issues we cannot really do that. But
     * we cannot just dump the namespace flags into clone(...) because several
     * usecases (such as rootless containers) require more granularity around
     * the namespace setup. In addition, some older kernels had issues where
     * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
     * handle this while also dealing with SELinux so we choose SELinux support
     * over broken kernel support).
     *
     * However, if we unshare(2) the user namespace *before* we clone(2), then
     * all hell breaks loose.
     *
     * The parent no longer has permissions to do many things (unshare(2) drops
     * all capabilities in your old namespace), and the container cannot be set
     * up to have more than one {uid,gid} mapping. This is obviously less than
     * ideal. In order to fix this, we have to first clone(2) and then unshare.
     *
     * Unfortunately, it's not as simple as that. We have to fork to enter the
     * PID namespace (the PID namespace only applies to children). Since we'll
     * have to double-fork, this clone_parent() call won't be able to get the
     * PID of the _actual_ init process (without doing more synchronisation than
     * I can deal with at the moment). So we'll just get the parent to send it
     * for us, the only job of this process is to update
     * /proc/pid/{setgroups,uid_map,gid_map}.
     *
     * And as a result of the above, we also need to setns(2) in the first child
     * because if we join a PID namespace in the topmost parent then our child
     * will be in that namespace (and it will not be able to give us a PID value
     * that makes sense without resorting to sending things with cmsg).
     *
     * This also deals with an older issue caused by dumping cloneflags into
     * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
     * we have to unshare(2) before clone(2) in order to do this. This was fixed
     * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
     * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
     * aware, the last mainline kernel which had this bug was Linux 3.12.
     * However, we cannot comment on which kernels the broken patch was
     * backported to.
     *
     * -- Aleksa "what has my life come to?" Sarai
     */
    switch (setjmp(env)) {
        /*
         * Stage 0: We're in the parent. Our job is just to create a new child
         *          (stage 1: JUMP_CHILD) process and write its uid_map and
         *          gid_map. That process will go on to create a new process, then
         *          it will send us its PID which we will send to the bootstrap
         *          process.
         */
    case JUMP_PARENT:{
            int len;
            pid_t child, first_child = -1;
            bool ready = false;
            /* For debugging. */
            prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
            /* Start the process of getting a container. */
            child = clone_parent(&env, JUMP_CHILD);
            if (child < 0)
                bail("unable to fork: child_func");
            /*
             * Select our end of the child sync pipe and drop the other end
             * exactly once, before entering the loop. Doing this inside the
             * loop would close an already-closed descriptor number on every
             * iteration after the first.
             */
            syncfd = sync_child_pipe[1];
            close(sync_child_pipe[0]);
            /*
             * State machine for synchronisation with the children.
             *
             * The parent only returns once both the child and the
             * grandchild are ready, so that it can receive every error
             * code the children may generate.
             */
            while (!ready) {
                enum sync_t s;
                int ret;
                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with child: next state");
                switch (s) {
                case SYNC_ERR:
                    /* We have to mirror the error code of the child. */
                    if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
                        bail("failed to sync with child: read(error code)");
                    exit(ret);
                case SYNC_USERMAP_PLS:
                    /*
                     * Enable setgroups(2) if we've been asked to. But we also
                     * have to explicitly disable setgroups(2) if we're
                     * creating a rootless container for single-entry mapping.
                     * i.e. config.is_setgroup == false.
                     * (this is required since Linux 3.19).
                     *
                     * For rootless multi-entry mapping, config.is_setgroup shall be true and
                     * newuidmap/newgidmap shall be used.
                     */
                    if (config.is_rootless_euid && !config.is_setgroup)
                        update_setgroups(child, SETGROUPS_DENY);
                    /* Set up mappings. */
                    update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
                    update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
                    s = SYNC_USERMAP_ACK;
                    if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                        kill(child, SIGKILL);
                        bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
                    }
                    break;
                case SYNC_RECVPID_PLS:{
                        first_child = child;
                        /* Get the init_func pid. */
                        if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
                            kill(first_child, SIGKILL);
                            bail("failed to sync with child: read(childpid)");
                        }
                        /* Send ACK. */
                        s = SYNC_RECVPID_ACK;
                        if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                            kill(first_child, SIGKILL);
                            kill(child, SIGKILL);
                            bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
                        }
                        /* Send the init_func pid back to our parent.
                         *
                         * Send the init_func pid and the pid of the first child back to our parent.
                         * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
                         * It becomes the responsibility of our parent to reap the first child.
                         */
                        len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
                        if (len < 0) {
                            kill(child, SIGKILL);
                            bail("unable to generate JSON for child pid");
                        }
                    }
                    break;
                case SYNC_CHILD_READY:
                    ready = true;
                    break;
                default:
                    bail("unexpected sync value: %u", s);
                }
            }
            /* Now sync with grandchild. */
            ready = false;
            /*
             * As above: switch to the grandchild sync pipe and close the
             * unused end once, outside the loop.
             */
            syncfd = sync_grandchild_pipe[1];
            close(sync_grandchild_pipe[0]);
            while (!ready) {
                enum sync_t s;
                int ret;
                s = SYNC_GRANDCHILD;
                if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                    kill(child, SIGKILL);
                    bail("failed to sync with child: write(SYNC_GRANDCHILD)");
                }
                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with child: next state");
                switch (s) {
                case SYNC_ERR:
                    /* We have to mirror the error code of the child. */
                    if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
                        bail("failed to sync with child: read(error code)");
                    exit(ret);
                case SYNC_CHILD_READY:
                    ready = true;
                    break;
                default:
                    bail("unexpected sync value: %u", s);
                }
            }
            exit(0);
        }
        /*
         * Stage 1: We're in the first child process. Our job is to join any
         *          provided namespaces in the netlink payload and unshare all
         *          of the requested namespaces. If we've been asked to
         *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
         *          our user mappings for us. Then, we create a new child
         *          (stage 2: JUMP_INIT) for PID namespace. We then send the
         *          child's PID to our parent (stage 0).
         */
    case JUMP_CHILD:{
            pid_t child;
            enum sync_t s;
            /* We're in a child and thus need to tell the parent if we die. */
            syncfd = sync_child_pipe[0];
            close(sync_child_pipe[1]);
            /* For debugging. */
            prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
            /*
             * We need to setns first. We cannot do this earlier (in stage 0)
             * because of the fact that we forked to get here (the PID of
             * [stage 2: JUMP_INIT]) would be meaningless). We could send it
             * using cmsg(3) but that's just annoying.
             */
            if (config.namespaces)
                join_namespaces(config.namespaces);
            /*
             * Deal with user namespaces first. They are quite special, as they
             * affect our ability to unshare other namespaces and are used as
             * context for privilege checks.
             *
             * We don't unshare all namespaces in one go. The reason for this
             * is that, while the kernel documentation may claim otherwise,
             * there are certain cases where unsharing all namespaces at once
             * will result in namespace objects being owned incorrectly.
             * Ideally we should just fix these kernel bugs, but it's better to
             * be safe than sorry, and fix them separately.
             *
             * A specific case of this is that the SELinux label of the
             * internal kern-mount that mqueue uses will be incorrect if the
             * UTS namespace is cloned before the USER namespace is mapped.
             * I've also heard of similar problems with the network namespace
             * in some scenarios. This also mirrors how LXC deals with this
             * problem.
             */
            if (config.cloneflags & CLONE_NEWUSER) {
                if (unshare(CLONE_NEWUSER) < 0)
                    bail("failed to unshare user namespace");
                config.cloneflags &= ~CLONE_NEWUSER;
                /*
                 * We don't have the privileges to do any mapping here (see the
                 * clone_parent rant). So signal our parent to hook us up.
                 */
                /* Switching is only necessary if we joined namespaces. */
                if (config.namespaces) {
                    if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
                        bail("failed to set process as dumpable");
                }
                s = SYNC_USERMAP_PLS;
                if (write(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
                /* ... wait for mapping ... */
                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
                if (s != SYNC_USERMAP_ACK)
                    bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
                /* Switch back to non-dumpable; only necessary if we joined namespaces. */
                if (config.namespaces) {
                    if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
                        bail("failed to set process as non-dumpable");
                }
                /* Become root in the namespace proper. */
                if (setresuid(0, 0, 0) < 0)
                    bail("failed to become root in user namespace");
            }
            /*
             * Unshare all of the namespaces. Now, it should be noted that this
             * ordering might break in the future (especially with rootless
             * containers). But for now, it's not possible to split this into
             * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
             *
             * Note that we don't merge this with clone() because there were
             * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
             * was broken, so we'll just do it the long way anyway.
             */
            if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
                bail("failed to unshare namespaces");
            /*
             * TODO: What about non-namespace clone flags that we're dropping here?
             *
             * We fork again because of PID namespace, setns(2) or unshare(2) don't
             * change the PID namespace of the calling process, because doing so
             * would change the caller's idea of its own PID (as reported by getpid()),
             * which would break many applications and libraries, so we must fork
             * to actually enter the new PID namespace.
             */
            child = clone_parent(&env, JUMP_INIT);
            if (child < 0)
                bail("unable to fork: init_func");
            /* Send the child to our parent, which knows what it's doing. */
            s = SYNC_RECVPID_PLS;
            if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                kill(child, SIGKILL);
                bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
            }
            if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
                kill(child, SIGKILL);
                bail("failed to sync with parent: write(childpid)");
            }
            /* ... wait for parent to get the pid ... */
            if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
                kill(child, SIGKILL);
                bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
            }
            if (s != SYNC_RECVPID_ACK) {
                kill(child, SIGKILL);
                bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
            }
            s = SYNC_CHILD_READY;
            if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                kill(child, SIGKILL);
                bail("failed to sync with parent: write(SYNC_CHILD_READY)");
            }
            /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
            exit(0);
        }
        /*
         * Stage 2: We're the final child process, and the only process that will
         *          actually return to the Go runtime. Our job is to just do the
         *          final cleanup steps and then return to the Go runtime to allow
         *          init_linux.go to run.
         */
    case JUMP_INIT:{
            /*
             * We're inside the child now, having jumped from the
             * start_child() code after forking in the parent.
             */
            enum sync_t s;
            /* We're in a child and thus need to tell the parent if we die. */
            syncfd = sync_grandchild_pipe[0];
            close(sync_grandchild_pipe[1]);
            close(sync_child_pipe[0]);
            close(sync_child_pipe[1]);
            /* For debugging. */
            prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
            if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
            if (s != SYNC_GRANDCHILD)
                bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
            if (setsid() < 0)
                bail("setsid failed");
            if (setuid(0) < 0)
                bail("setuid failed");
            if (setgid(0) < 0)
                bail("setgid failed");
            if (!config.is_rootless_euid && config.is_setgroup) {
                if (setgroups(0, NULL) < 0)
                    bail("setgroups failed");
            }
            /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
            if (config.cloneflags & CLONE_NEWCGROUP) {
                uint8_t value;
                if (read(pipenum, &value, sizeof(value)) != sizeof(value))
                    bail("read synchronisation value failed");
                if (value == CREATECGROUPNS) {
                    if (unshare(CLONE_NEWCGROUP) < 0)
                        bail("failed to unshare cgroup namespace");
                } else
                    bail("received unknown synchronisation value");
            }
            s = SYNC_CHILD_READY;
            if (write(syncfd, &s, sizeof(s)) != sizeof(s))
                bail("failed to sync with parent: write(SYNC_CHILD_READY)");
            /* Close sync pipes. */
            close(sync_grandchild_pipe[0]);
            /* Free netlink data. */
            nl_free(&config);
            /* Finish executing, let the Go runtime take over. */
            return;
        }
    default:
        bail("unexpected jump value");
    }
    /* Should never be reached. */
    bail("should never be reached");
}

2.2) libcontainer/setns_init_linux.go#linuxSetnsInit.Init

// Init prepares the calling process for an exec into the container and then
// replaces it with the configured command via system.Execv.
//
// In order, it: joins a new session keyring (unless NoNewKeyring is set), sets
// up the console and controlling terminal when CreateConsole is set, applies
// no_new_privs, sets the process label, initialises seccomp early when
// no_new_privs is off, calls finalizeNamespace, applies the AppArmor profile,
// initialises seccomp late when no_new_privs is on, and finally execs
// l.config.Args. The order of these steps is significant; see the inline
// comments.
func (l *linuxSetnsInit) Init() error {
    runtime.LockOSThread()
    defer runtime.UnlockOSThread()
    if !l.config.Config.NoNewKeyring {
        // Do not inherit the parent's session keyring.
        if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
            // Same justification as in standard_init_linux.go as to why we
            // don't bail on ENOSYS.
            //
            // TODO(cyphar): And we should have logging here too.
            if errors.Cause(err) != unix.ENOSYS {
                return errors.Wrap(err, "join session keyring")
            }
        }
    }
    if l.config.CreateConsole {
        if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
            return err
        }
        if err := system.Setctty(); err != nil {
            return err
        }
    }
    if l.config.NoNewPrivileges {
        if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
            return err
        }
    }
    if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
        return err
    }
    // Clears the process label again if Init returns.
    defer label.SetProcessLabel("")
    // Without NoNewPrivileges seccomp is a privileged operation, so we need to
    // do this before dropping capabilities; otherwise do it as late as possible
    // just before execve so as few syscalls take place after it as possible.
    if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return err
        }
    }
    if err := finalizeNamespace(l.config); err != nil {
        return err
    }
    if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
        return err
    }
    // Set seccomp as close to execve as possible, so as few syscalls take
    // place afterward (reducing the amount of syscalls that users need to
    // enable in their seccomp profiles).
    if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return newSystemErrorWithCause(err, "init seccomp")
        }
    }
    // Args[0] is the program to execute; the full Args slice is passed as
    // its argv together with the current environment.
    return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

手写OCI CRI【毕设】

runC源码解析-Exec（非init）

exec.go#Action

1) exec.go#execProcess

1.1) exec.go#getProcess

1.2) libcontainer/container_linux.go#linuxContainer.newParentProcess

1.2.1) libcontainer/container_linux.go#linuxContainer.newSetnsProcess

1.3) libcontainer/process_linux.go#setnsProcess.start

1.3.1) libcontainer/process_linux.go#setnsProcess.execSetns

2) 【child】init.go

2.1) libcontainer/nsenter/nsenter_gccgo.go

2.1.1) libcontainer/nsenter/nsenter.c#nsexec（Cgo无法使用交叉编译）

2.2) libcontainer/setns_init_linux.go#linuxSetnsInit.Init