exec.go#Action

  1. Action: func(context *cli.Context) error { // callback of the command: validate, fix up the pid file, run the process
  2. if err := checkArgs(context, 1, minArgs); err != nil { // argument validation; checkArgs and minArgs are defined elsewhere
  3. return err
  4. }
  5. if err := revisePidFile(context); err != nil { // any failure here aborts before a process is started
  6. return err
  7. }
  8. status, err := execProcess(context)
  9. if err == nil {
  10. os.Exit(status) // success: terminate immediately with the status returned by execProcess
  11. }
  12. return fmt.Errorf("exec failed: %v", err) // only reached when execProcess returned an error
  13. },
  14. SkipArgReorder: true,

1) exec.go#execProcess

  1. // execProcess starts an additional process in the container selected by
  2. // context. It returns the status reported by runner.run, or -1 and an error
  3. // when the container cannot be obtained, has stopped, or no process was given.
  4. func execProcess(context *cli.Context) (int, error) {
  5. 	container, err := getContainer(context)
  6. 	if err != nil {
  7. 		return -1, err
  8. 	}
  9. 	// A stopped container cannot host a new process.
  10. 	switch containerStatus, err := container.Status(); {
  11. 	case err != nil:
  12. 		return -1, err
  13. 	case containerStatus == libcontainer.Stopped:
  14. 		return -1, fmt.Errorf("cannot exec a container that has stopped")
  15. 	}
  16. 	// Without a "process" file, arguments beyond the first one are required.
  17. 	if context.String("process") == "" && len(context.Args()) == 1 {
  18. 		return -1, fmt.Errorf("process args cannot be empty")
  19. 	}
  20. 	state, err := container.State()
  21. 	if err != nil {
  22. 		return -1, err
  23. 	}
  24. 	// The bundle is looked up among the labels stored in the container config.
  25. 	proc, err := getProcess(context, utils.SearchLabels(state.Config.Labels, "bundle"))
  26. 	if err != nil {
  27. 		return -1, err
  28. 	}
  29. 	execRunner := &runner{
  30. 		container: container,
  31. 		action: CT_ACT_RUN,
  32. 		// The new process is not the container's init, so these stay off.
  33. 		init: false,
  34. 		enableSubreaper: false,
  35. 		shouldDestroy: false,
  36. 		detach: context.Bool("detach"),
  37. 		consoleSocket: context.String("console-socket"),
  38. 		pidFile: context.String("pid-file"),
  39. 	}
  40. 	return execRunner.run(proc)
  41. }

1.1) exec.go#getProcess

  1. // getProcess builds the process specification for an exec request.
  2. //
  3. // If the "process" flag names a file, the specification is decoded from that
  4. // JSON file and validated. Otherwise the process section of the spec found in
  5. // bundle is loaded and adjusted by the command-line flags (args, cwd, apparmor,
  6. // process-label, cap, env, tty, no-new-privs, user, additional-gids).
  7. //
  8. // The flag-based path changes the working directory of the calling process to
  9. // bundle before loading the spec.
  10. //
  11. // On any error the returned process is nil.
  12. func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
  13. 	if path := context.String("process"); path != "" {
  14. 		f, err := os.Open(path)
  15. 		if err != nil {
  16. 			return nil, err
  17. 		}
  18. 		defer f.Close()
  19. 		var p specs.Process
  20. 		if err := json.NewDecoder(f).Decode(&p); err != nil {
  21. 			return nil, err
  22. 		}
  23. 		// Never hand back a process that failed validation.
  24. 		if err := validateProcessSpec(&p); err != nil {
  25. 			return nil, err
  26. 		}
  27. 		return &p, nil
  28. 	}
  29. 	// process via cli flags
  30. 	if err := os.Chdir(bundle); err != nil {
  31. 		return nil, err
  32. 	}
  33. 	spec, err := loadSpec(specConfig)
  34. 	if err != nil {
  35. 		return nil, err
  36. 	}
  37. 	p := spec.Process
  38. 	if p == nil {
  39. 		// A spec without a process section would otherwise be dereferenced below.
  40. 		return nil, fmt.Errorf("spec in bundle %q has no process section", bundle)
  41. 	}
  42. 	p.Args = context.Args()[1:]
  43. 	// override the cwd, if passed
  44. 	if cwd := context.String("cwd"); cwd != "" {
  45. 		p.Cwd = cwd
  46. 	}
  47. 	if ap := context.String("apparmor"); ap != "" {
  48. 		p.ApparmorProfile = ap
  49. 	}
  50. 	if l := context.String("process-label"); l != "" {
  51. 		p.SelinuxLabel = l
  52. 	}
  53. 	if caps := context.StringSlice("cap"); len(caps) > 0 {
  54. 		// The spec may carry no capability set at all; allocate one before appending.
  55. 		if p.Capabilities == nil {
  56. 			p.Capabilities = &specs.LinuxCapabilities{}
  57. 		}
  58. 		// Every requested capability is added to all five sets.
  59. 		p.Capabilities.Bounding = append(p.Capabilities.Bounding, caps...)
  60. 		p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, caps...)
  61. 		p.Capabilities.Effective = append(p.Capabilities.Effective, caps...)
  62. 		p.Capabilities.Permitted = append(p.Capabilities.Permitted, caps...)
  63. 		p.Capabilities.Ambient = append(p.Capabilities.Ambient, caps...)
  64. 	}
  65. 	// append the passed env variables
  66. 	p.Env = append(p.Env, context.StringSlice("env")...)
  67. 	// set the tty
  68. 	if context.IsSet("tty") {
  69. 		p.Terminal = context.Bool("tty")
  70. 	}
  71. 	if context.IsSet("no-new-privs") {
  72. 		p.NoNewPrivileges = context.Bool("no-new-privs")
  73. 	}
  74. 	// override the user, if passed; accepted forms are "uid" and "uid:gid"
  75. 	if user := context.String("user"); user != "" {
  76. 		u := strings.SplitN(user, ":", 2)
  77. 		if len(u) > 1 {
  78. 			gid, err := parseExecID(u[1], "gid")
  79. 			if err != nil {
  80. 				return nil, err
  81. 			}
  82. 			p.User.GID = gid
  83. 		}
  84. 		uid, err := parseExecID(u[0], "uid")
  85. 		if err != nil {
  86. 			return nil, err
  87. 		}
  88. 		p.User.UID = uid
  89. 	}
  90. 	for _, gid := range context.Int64Slice("additional-gids") {
  91. 		if gid < 0 {
  92. 			return nil, fmt.Errorf("additional-gids must be a positive number %d", gid)
  93. 		}
  94. 		// Values above the uint32 range would silently wrap in the conversion below.
  95. 		if gid > 1<<32-1 {
  96. 			return nil, fmt.Errorf("additional-gids must fit into 32 bits %d", gid)
  97. 		}
  98. 		p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid))
  99. 	}
  100. 	return p, nil
  101. }
  102. // parseExecID parses value as a decimal uid or gid (kind names which one, for
  103. // the error message) and rejects numbers outside the uint32 range instead of
  104. // letting the conversion wrap around.
  105. func parseExecID(value, kind string) (uint32, error) {
  106. 	id, err := strconv.Atoi(value)
  107. 	if err != nil {
  108. 		return 0, fmt.Errorf("parsing %s as int for %s failed: %v", value, kind, err)
  109. 	}
  110. 	if id < 0 || int64(id) > 1<<32-1 {
  111. 		return 0, fmt.Errorf("%s %d is out of range", kind, id)
  112. 	}
  113. 	return uint32(id), nil
  114. }

1.2) libcontainer/container_linux.go#linuxContainer.newParentProcess

如果 p.Init 为 false(即不是容器的 init 进程,例如 `runc exec` 的场景),则调用 newSetnsProcess 并返回 setnsProcess;否则返回 initProcess。

  1. // newParentProcess creates the parent-side handle used to start p. A socket
  2. // pair named "init" links parent and child. When p.Init is false a setns
  3. // process is returned, otherwise an init process with the exec fifo attached.
  4. //
  5. // On the error paths handled here both ends of the socket pair are closed so
  6. // that no descriptors leak, and a literal nil process is returned.
  7. func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
  8. 	parentPipe, childPipe, err := utils.NewSockPair("init")
  9. 	if err != nil {
  10. 		return nil, newSystemErrorWithCause(err, "creating new init pipe")
  11. 	}
  12. 	// closePipes releases the socket pair when setup fails before hand-over.
  13. 	closePipes := func() {
  14. 		parentPipe.Close()
  15. 		childPipe.Close()
  16. 	}
  17. 	cmd, err := c.commandTemplate(p, childPipe)
  18. 	if err != nil {
  19. 		closePipes()
  20. 		return nil, newSystemErrorWithCause(err, "creating new command template")
  21. 	}
  22. 	if !p.Init {
  23. 		setns, err := c.newSetnsProcess(p, cmd, parentPipe, childPipe)
  24. 		if err != nil {
  25. 			closePipes()
  26. 			// Return an untyped nil: passing the nil *setnsProcess through
  27. 			// would yield a non-nil parentProcess interface value.
  28. 			return nil, err
  29. 		}
  30. 		return setns, nil
  31. 	}
  32. 	// We only set up fifoFd if we're not doing a `runc exec`. The historic
  33. 	// reason for this is that previously we would pass a dirfd that allowed
  34. 	// for container rootfs escape (and not doing it in `runc exec` avoided
  35. 	// that problem), but we no longer do that. However, there's no need to do
  36. 	// this for `runc exec` so we just keep it this way to be safe.
  37. 	if err := c.includeExecFifo(cmd); err != nil {
  38. 		closePipes()
  39. 		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
  40. 	}
  41. 	// NOTE(review): newInitProcess is not visible here; its own error handling
  42. 	// is left untouched.
  43. 	return c.newInitProcess(p, cmd, parentPipe, childPipe)
  44. }

1.2.1) libcontainer/container_linux.go#linuxContainer.newSetnsProcess

  1. // newSetnsProcess assembles the setnsProcess that will run p using cmd. It
  2. // marks the command as a setns-type init through its environment and prepares
  3. // the bootstrap data from the namespace paths of the container's current state.
  4. func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
  5. 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
  6. 	state, stateErr := c.currentState()
  7. 	if stateErr != nil {
  8. 		return nil, newSystemErrorWithCause(stateErr, "getting container's current state")
  9. 	}
  10. 	// Clone flags stay 0 for a setns process: its namespaces are entered
  11. 	// through the setns syscall only.
  12. 	bootstrap, bootstrapErr := c.bootstrapData(0, state.NamespacePaths)
  13. 	if bootstrapErr != nil {
  14. 		return nil, bootstrapErr
  15. 	}
  16. 	cgroupPaths := c.cgroupManager.GetPaths()
  17. 	initConfig := c.newInitConfig(p)
  18. 	proc := &setnsProcess{
  19. 		process: p,
  20. 		cmd: cmd,
  21. 		parentPipe: parentPipe,
  22. 		childPipe: childPipe,
  23. 		config: initConfig,
  24. 		bootstrapData: bootstrap,
  25. 		cgroupPaths: cgroupPaths,
  26. 		rootlessCgroups: c.config.RootlessCgroups,
  27. 		intelRdtPath: state.IntelRdtPath,
  28. 	}
  29. 	return proc, nil
  30. }

1.3) libcontainer/process_linux.go#setnsProcess.start

  1. // start launches the setns command, sends it the bootstrap data and the init
  2. // config over the parent pipe, and adds the resulting process to the cgroups
  3. // and, when a path is set and exists, to the Intel RDT tasks. The parent pipe
  4. // is closed when start returns.
  5. func (p *setnsProcess) start() (err error) {
  6. 	defer p.parentPipe.Close()
  7. 	startErr := p.cmd.Start()
  8. 	// Our copy of the child's pipe end is closed whether or not Start succeeded.
  9. 	p.childPipe.Close()
  10. 	if startErr != nil {
  11. 		return newSystemErrorWithCause(startErr, "starting setns process")
  12. 	}
  13. 	if p.bootstrapData != nil {
  14. 		if _, copyErr := io.Copy(p.parentPipe, p.bootstrapData); copyErr != nil {
  15. 			return newSystemErrorWithCause(copyErr, "copying bootstrap data to pipe")
  16. 		}
  17. 	}
  18. 	if setnsErr := p.execSetns(); setnsErr != nil {
  19. 		return newSystemErrorWithCause(setnsErr, "executing setns process")
  20. 	}
  21. 	if len(p.cgroupPaths) > 0 {
  22. 		// With rootless cgroups a failure to enter the cgroups is tolerated.
  23. 		if joinErr := cgroups.EnterPid(p.cgroupPaths, p.pid()); joinErr != nil && !p.rootlessCgroups {
  24. 			return newSystemErrorWithCausef(joinErr, "adding pid %d to cgroups", p.pid())
  25. 		}
  26. 	}
  27. 	if p.intelRdtPath != "" {
  28. 		// Only register the task if the Intel RDT "resource control" filesystem path exists.
  29. 		if _, statErr := os.Stat(p.intelRdtPath); statErr == nil {
  30. 			if rdtErr := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); rdtErr != nil {
  31. 				return newSystemErrorWithCausef(rdtErr, "adding pid %d to Intel RDT resource control filesystem", p.pid())
  32. 			}
  33. 		}
  34. 	}
  35. 	// Rlimits have to be set from here: once the process enters a
  36. 	// user-namespace the permission to raise the limits is lost.
  37. 	if rlimitErr := setupRlimits(p.config.Rlimits, p.pid()); rlimitErr != nil {
  38. 		return newSystemErrorWithCause(rlimitErr, "setting rlimits for process")
  39. 	}
  40. 	if writeErr := utils.WriteJSON(p.parentPipe, p.config); writeErr != nil {
  41. 		return newSystemErrorWithCause(writeErr, "writing config to pipe")
  42. 	}
  43. 	syncErr := parseSync(p.parentPipe, func(msg *syncT) error {
  44. 		// Neither of these two messages should ever arrive for a setns process.
  45. 		if msg.Type == procReady {
  46. 			panic("unexpected procReady in setns")
  47. 		}
  48. 		if msg.Type == procHooks {
  49. 			panic("unexpected procHooks in setns")
  50. 		}
  51. 		return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  52. 	})
  53. 	if shutdownErr := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); shutdownErr != nil {
  54. 		return newSystemErrorWithCause(shutdownErr, "calling shutdown on init pipe")
  55. 	}
  56. 	if syncErr == nil {
  57. 		return nil
  58. 	}
  59. 	// Waiting must happen after Shutdown so the child will exit and can be waited for.
  60. 	p.wait()
  61. 	return syncErr
  62. }

1.3.1) libcontainer/process_linux.go#setnsProcess.execSetns

  1. // execSetns waits for the started command, which runs C code performing the
  2. // setns calls, to exit. Because setns support requires that C process to fork
  3. // off a child and perform the setns before the go runtime boots, the pid of the
  4. // resulting child is received as JSON over the parent pipe. On success
  5. // p.cmd.Process is replaced by that child and p becomes the ops of p.process.
  6. //
  7. // It returns an error if waiting fails, the command exits unsuccessfully, the
  8. // pid message cannot be decoded, or the received pid is not positive.
  9. func (p *setnsProcess) execSetns() error {
  10. 	status, err := p.cmd.Process.Wait()
  11. 	if err != nil {
  12. 		p.cmd.Wait()
  13. 		return newSystemErrorWithCause(err, "waiting on setns process to finish")
  14. 	}
  15. 	if !status.Success() {
  16. 		p.cmd.Wait()
  17. 		return newSystemError(&exec.ExitError{ProcessState: status})
  18. 	}
  19. 	// Decode into a value: with a nil pointer target a JSON "null" would leave
  20. 	// the pointer nil and the field accesses below would panic.
  21. 	var pids pid
  22. 	if err := json.NewDecoder(p.parentPipe).Decode(&pids); err != nil {
  23. 		p.cmd.Wait()
  24. 		return newSystemErrorWithCause(err, "reading pid from init pipe")
  25. 	}
  26. 	// A missing or non-positive pid must never become the process we manage.
  27. 	if pids.Pid <= 0 {
  28. 		p.cmd.Wait()
  29. 		return newSystemError(fmt.Errorf("invalid pid %d received from init pipe", pids.Pid))
  30. 	}
  31. 	// Clean up the zombie parent process
  32. 	firstChildProcess, err := os.FindProcess(pids.PidFirstChild)
  33. 	if err != nil {
  34. 		return err
  35. 	}
  36. 	// Ignore the error in case the child has already been reaped for any reason
  37. 	_, _ = firstChildProcess.Wait()
  38. 	process, err := os.FindProcess(pids.Pid)
  39. 	if err != nil {
  40. 		return err
  41. 	}
  42. 	p.cmd.Process = process
  43. 	p.process.ops = p
  44. 	return nil
  45. }

2) 【child】init.go

  1. import (
  2. "os"
  3. "runtime"
  4. "github.com/opencontainers/runc/libcontainer"
  5. // ********************************** NOTICE ********************************** //
  6. _ "github.com/opencontainers/runc/libcontainer/nsenter"
  7. // ********************************** NOTICE ********************************** //
  8. "github.com/urfave/cli"
  9. )
  10. // init runs before main. When the binary is invoked with "init" as its first
  11. // argument, it limits the Go scheduler to a single processor and locks the
  12. // calling goroutine to its OS thread.
  13. func init() {
  14. 	if len(os.Args) < 2 || os.Args[1] != "init" {
  15. 		return
  16. 	}
  17. 	runtime.GOMAXPROCS(1)
  18. 	runtime.LockOSThread()
  19. }
  20. // initCommand is the "init" sub-command. Its action never returns normally:
  21. // it exits with status 1 when the factory cannot be created or initialization
  22. // fails, and panics if StartInitialization returns without an error.
  23. var initCommand = cli.Command{
  24. 	Name: "init",
  25. 	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
  26. 	Action: func(context *cli.Context) error {
  27. 		factory, err := libcontainer.New("")
  28. 		if err != nil {
  29. 			// Without a factory nothing can be initialized. Exit the same way as
  30. 			// a failed initialization instead of calling a method on a nil factory.
  31. 			os.Exit(1)
  32. 		}
  33. 		if err := factory.StartInitialization(); err != nil {
  34. 			// as the error is sent back to the parent there is no need to log
  35. 			// or write it to stderr because the parent process will handle this
  36. 			os.Exit(1)
  37. 		}
  38. 		panic("libcontainer: container init failed to exec")
  39. 	},
  40. }

2.1) libcontainer/nsenter/nsenter_gccgo.go

  1. // +build linux,gccgo
  2. package nsenter
  3. /*
  4. #cgo CFLAGS: -Wall
  5. extern void nsexec();
  6. void __attribute__((constructor)) init(void) {
  7. nsexec();
  8. }
  9. */
  10. import "C"
  11. // AlwaysFalse is never meant to become true. It is exported so that the
  12. // compiler does not optimize out the reference to C.init guarded by it.
  13. var AlwaysFalse bool
  14. func init() { // never calls C.init at run time as long as AlwaysFalse stays false
  15. if AlwaysFalse {
  16. // Referencing the C init() inside this no-op test ensures that the compiler
  17. // links in the C function.
  18. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
  19. C.init()
  20. }
  21. }

2.1.1) libcontainer/nsenter/nsenter.c#nsexec(Cgo无法使用交叉编译)

  1. /*
  2.  * nsexec is called from a C constructor. Without an init pipe it returns at
  3.  * once. Otherwise it parses the netlink configuration read from that pipe and
  4.  * splits the work across three processes: stage 0 (parent) writes the id
  5.  * mappings and reports the final pids over the init pipe, stage 1 (child)
  6.  * joins and unshares namespaces, and stage 2 (init) is the only process that
  7.  * returns to the caller.
  8.  */
  9. void nsexec(void)
  10. {
  11. 	int pipenum;
  12. 	jmp_buf env;
  13. 	int sync_child_pipe[2], sync_grandchild_pipe[2];
  14. 	struct nlconfig_t config = { 0 };
  15. 	/*
  16. 	 * If we don't have an init pipe, just return to the go routine.
  17. 	 * We'll only get an init pipe for start or exec.
  18. 	 */
  19. 	pipenum = initpipe();
  20. 	if (pipenum == -1)
  21. 		return;
  22. 	/* Parse all of the netlink configuration. */
  23. 	nl_parse(pipenum, &config);
  24. 	/* Set oom_score_adj. This has to be done before !dumpable because
  25. 	 * /proc/self/oom_score_adj is not writeable unless you're an privileged
  26. 	 * user (if !dumpable is set). All children inherit their parent's
  27. 	 * oom_score_adj value on fork(2) so this will always be propagated
  28. 	 * properly.
  29. 	 */
  30. 	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
  31. 	/*
  32. 	 * Make the process non-dumpable, to avoid various race conditions that
  33. 	 * could cause processes in namespaces we're joining to access host
  34. 	 * resources (or potentially execute code).
  35. 	 *
  36. 	 * However, if the number of namespaces we are joining is 0, we are not
  37. 	 * going to be switching to a different security context. Thus setting
  38. 	 * ourselves to be non-dumpable only breaks things (like rootless
  39. 	 * containers), which is the recommendation from the kernel folks.
  40. 	 */
  41. 	if (config.namespaces) {
  42. 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  43. 			bail("failed to set process as non-dumpable");
  44. 	}
  45. 	/* Pipe so we can tell the child when we've finished setting up. */
  46. 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
  47. 		bail("failed to setup sync pipe between parent and child");
  48. 	/*
  49. 	 * We need a new socketpair to sync with grandchild so we don't have
  50. 	 * race condition with child.
  51. 	 */
  52. 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
  53. 		bail("failed to setup sync pipe between parent and grandchild");
  54. 	/* TODO: Currently we aren't dealing with child deaths properly. */
  55. 	/*
  56. 	 * Okay, so this is quite annoying.
  57. 	 *
  58. 	 * In order for this unsharing code to be more extensible we need to split
  59. 	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
  60. 	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
  61. 	 * separately, but because of SELinux issues we cannot really do that. But
  62. 	 * we cannot just dump the namespace flags into clone(...) because several
  63. 	 * usecases (such as rootless containers) require more granularity around
  64. 	 * the namespace setup. In addition, some older kernels had issues where
  65. 	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
  66. 	 * handle this while also dealing with SELinux so we choose SELinux support
  67. 	 * over broken kernel support).
  68. 	 *
  69. 	 * However, if we unshare(2) the user namespace *before* we clone(2), then
  70. 	 * all hell breaks loose.
  71. 	 *
  72. 	 * The parent no longer has permissions to do many things (unshare(2) drops
  73. 	 * all capabilities in your old namespace), and the container cannot be set
  74. 	 * up to have more than one {uid,gid} mapping. This is obviously less than
  75. 	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
  76. 	 *
  77. 	 * Unfortunately, it's not as simple as that. We have to fork to enter the
  78. 	 * PID namespace (the PID namespace only applies to children). Since we'll
  79. 	 * have to double-fork, this clone_parent() call won't be able to get the
  80. 	 * PID of the _actual_ init process (without doing more synchronisation than
  81. 	 * I can deal with at the moment). So we'll just get the parent to send it
  82. 	 * for us, the only job of this process is to update
  83. 	 * /proc/pid/{setgroups,uid_map,gid_map}.
  84. 	 *
  85. 	 * And as a result of the above, we also need to setns(2) in the first child
  86. 	 * because if we join a PID namespace in the topmost parent then our child
  87. 	 * will be in that namespace (and it will not be able to give us a PID value
  88. 	 * that makes sense without resorting to sending things with cmsg).
  89. 	 *
  90. 	 * This also deals with an older issue caused by dumping cloneflags into
  91. 	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
  92. 	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
  93. 	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
  94. 	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
  95. 	 * aware, the last mainline kernel which had this bug was Linux 3.12.
  96. 	 * However, we cannot comment on which kernels the broken patch was
  97. 	 * backported to.
  98. 	 *
  99. 	 * -- Aleksa "what has my life come to?" Sarai
  100. 	 */
  101. 	switch (setjmp(env)) {
  102. 	/*
  103. 	 * Stage 0: We're in the parent. Our job is just to create a new child
  104. 	 * (stage 1: JUMP_CHILD) process and write its uid_map and
  105. 	 * gid_map. That process will go on to create a new process, then
  106. 	 * it will send us its PID which we will send to the bootstrap
  107. 	 * process.
  108. 	 */
  109. 	case JUMP_PARENT:{
  110. 		int len;
  111. 		pid_t child, first_child = -1;
  112. 		bool ready = false;
  113. 		/* For debugging. */
  114. 		prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
  115. 		/* Start the process of getting a container. */
  116. 		child = clone_parent(&env, JUMP_CHILD);
  117. 		if (child < 0)
  118. 			bail("unable to fork: child_func");
  119. 		/*
  120. 		 * State machine for synchronisation with the children.
  121. 		 *
  122. 		 * The parent only returns once both child and grandchild are
  123. 		 * ready, so we can receive all possible error codes
  124. 		 * generated by children.
  125. 		 */
  126. 		syncfd = sync_child_pipe[1];	/* selected once, before the loop */
  127. 		close(sync_child_pipe[0]);	/* closed once; the loop must not close this fd number again */
  128. 		while (!ready) {
  129. 			enum sync_t s;
  130. 			int ret;
  131. 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  132. 				bail("failed to sync with child: next state");
  133. 			switch (s) {
  134. 			case SYNC_ERR:
  135. 				/* We have to mirror the error code of the child. */
  136. 				if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  137. 					bail("failed to sync with child: read(error code)");
  138. 				exit(ret);
  139. 			case SYNC_USERMAP_PLS:
  140. 				/*
  141. 				 * Enable setgroups(2) if we've been asked to. But we also
  142. 				 * have to explicitly disable setgroups(2) if we're
  143. 				 * creating a rootless container for single-entry mapping.
  144. 				 * i.e. config.is_setgroup == false.
  145. 				 * (this is required since Linux 3.19).
  146. 				 *
  147. 				 * For rootless multi-entry mapping, config.is_setgroup shall be true and
  148. 				 * newuidmap/newgidmap shall be used.
  149. 				 */
  150. 				if (config.is_rootless_euid && !config.is_setgroup)
  151. 					update_setgroups(child, SETGROUPS_DENY);
  152. 				/* Set up mappings. */
  153. 				update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
  154. 				update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
  155. 				s = SYNC_USERMAP_ACK;
  156. 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  157. 					kill(child, SIGKILL);
  158. 					bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
  159. 				}
  160. 				break;
  161. 			case SYNC_RECVPID_PLS:{
  162. 				first_child = child;
  163. 				/* Get the init_func pid. */
  164. 				if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
  165. 					kill(first_child, SIGKILL);
  166. 					bail("failed to sync with child: read(childpid)");
  167. 				}
  168. 				/* Send ACK. */
  169. 				s = SYNC_RECVPID_ACK;
  170. 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  171. 					kill(first_child, SIGKILL);
  172. 					kill(child, SIGKILL);
  173. 					bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
  174. 				}
  175. 				/* Send the init_func pid back to our parent.
  176. 				 *
  177. 				 * Send the init_func pid and the pid of the first child back to our parent.
  178. 				 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
  179. 				 * It becomes the responsibility of our parent to reap the first child.
  180. 				 */
  181. 				len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
  182. 				if (len < 0) {
  183. 					kill(child, SIGKILL);
  184. 					bail("unable to generate JSON for child pid");
  185. 				}
  186. 			}
  187. 				break;
  188. 			case SYNC_CHILD_READY:
  189. 				ready = true;
  190. 				break;
  191. 			default:
  192. 				bail("unexpected sync value: %u", s);
  193. 			}
  194. 		}
  195. 		/* Now sync with grandchild. */
  196. 		ready = false;
  197. 		syncfd = sync_grandchild_pipe[1];	/* selected once, before the loop */
  198. 		close(sync_grandchild_pipe[0]);	/* closed once, not on every iteration */
  199. 		while (!ready) {
  200. 			enum sync_t s;
  201. 			int ret;
  202. 			s = SYNC_GRANDCHILD;
  203. 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  204. 				kill(child, SIGKILL);
  205. 				bail("failed to sync with child: write(SYNC_GRANDCHILD)");
  206. 			}
  207. 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  208. 				bail("failed to sync with child: next state");
  209. 			switch (s) {
  210. 			case SYNC_ERR:
  211. 				/* We have to mirror the error code of the child. */
  212. 				if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  213. 					bail("failed to sync with child: read(error code)");
  214. 				exit(ret);
  215. 			case SYNC_CHILD_READY:
  216. 				ready = true;
  217. 				break;
  218. 			default:
  219. 				bail("unexpected sync value: %u", s);
  220. 			}
  221. 		}
  222. 		exit(0);
  223. 	}
  224. 	/*
  225. 	 * Stage 1: We're in the first child process. Our job is to join any
  226. 	 * provided namespaces in the netlink payload and unshare all
  227. 	 * of the requested namespaces. If we've been asked to
  228. 	 * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
  229. 	 * our user mappings for us. Then, we create a new child
  230. 	 * (stage 2: JUMP_INIT) for PID namespace. We then send the
  231. 	 * child's PID to our parent (stage 0).
  232. 	 */
  233. 	case JUMP_CHILD:{
  234. 		pid_t child;
  235. 		enum sync_t s;
  236. 		/* We're in a child and thus need to tell the parent if we die. */
  237. 		syncfd = sync_child_pipe[0];
  238. 		close(sync_child_pipe[1]);
  239. 		/* For debugging. */
  240. 		prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
  241. 		/*
  242. 		 * We need to setns first. We cannot do this earlier (in stage 0)
  243. 		 * because of the fact that we forked to get here (the PID of
  244. 		 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
  245. 		 * using cmsg(3) but that's just annoying.
  246. 		 */
  247. 		if (config.namespaces)
  248. 			join_namespaces(config.namespaces);
  249. 		/*
  250. 		 * Deal with user namespaces first. They are quite special, as they
  251. 		 * affect our ability to unshare other namespaces and are used as
  252. 		 * context for privilege checks.
  253. 		 *
  254. 		 * We don't unshare all namespaces in one go. The reason for this
  255. 		 * is that, while the kernel documentation may claim otherwise,
  256. 		 * there are certain cases where unsharing all namespaces at once
  257. 		 * will result in namespace objects being owned incorrectly.
  258. 		 * Ideally we should just fix these kernel bugs, but it's better to
  259. 		 * be safe than sorry, and fix them separately.
  260. 		 *
  261. 		 * A specific case of this is that the SELinux label of the
  262. 		 * internal kern-mount that mqueue uses will be incorrect if the
  263. 		 * UTS namespace is cloned before the USER namespace is mapped.
  264. 		 * I've also heard of similar problems with the network namespace
  265. 		 * in some scenarios. This also mirrors how LXC deals with this
  266. 		 * problem.
  267. 		 */
  268. 		if (config.cloneflags & CLONE_NEWUSER) {
  269. 			if (unshare(CLONE_NEWUSER) < 0)
  270. 				bail("failed to unshare user namespace");
  271. 			config.cloneflags &= ~CLONE_NEWUSER;
  272. 			/*
  273. 			 * We don't have the privileges to do any mapping here (see the
  274. 			 * clone_parent rant). So signal our parent to hook us up.
  275. 			 */
  276. 			/* Switching is only necessary if we joined namespaces. */
  277. 			if (config.namespaces) {
  278. 				if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
  279. 					bail("failed to set process as dumpable");
  280. 			}
  281. 			s = SYNC_USERMAP_PLS;
  282. 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  283. 				bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
  284. 			/* ... wait for mapping ... */
  285. 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  286. 				bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
  287. 			if (s != SYNC_USERMAP_ACK)
  288. 				bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
  289. 			/* Switching is only necessary if we joined namespaces. */
  290. 			if (config.namespaces) {
  291. 				if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  292. 					bail("failed to set process as non-dumpable");
  293. 			}
  294. 			/* Become root in the namespace proper. */
  295. 			if (setresuid(0, 0, 0) < 0)
  296. 				bail("failed to become root in user namespace");
  297. 		}
  298. 		/*
  299. 		 * Unshare all of the namespaces. Now, it should be noted that this
  300. 		 * ordering might break in the future (especially with rootless
  301. 		 * containers). But for now, it's not possible to split this into
  302. 		 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
  303. 		 *
  304. 		 * Note that we don't merge this with clone() because there were
  305. 		 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
  306. 		 * was broken, so we'll just do it the long way anyway.
  307. 		 */
  308. 		if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
  309. 			bail("failed to unshare namespaces");
  310. 		/*
  311. 		 * TODO: What about non-namespace clone flags that we're dropping here?
  312. 		 *
  313. 		 * We fork again because of PID namespace, setns(2) or unshare(2) don't
  314. 		 * change the PID namespace of the calling process, because doing so
  315. 		 * would change the caller's idea of its own PID (as reported by getpid()),
  316. 		 * which would break many applications and libraries, so we must fork
  317. 		 * to actually enter the new PID namespace.
  318. 		 */
  319. 		child = clone_parent(&env, JUMP_INIT);
  320. 		if (child < 0)
  321. 			bail("unable to fork: init_func");
  322. 		/* Send the child to our parent, which knows what it's doing. */
  323. 		s = SYNC_RECVPID_PLS;
  324. 		if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  325. 			kill(child, SIGKILL);
  326. 			bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
  327. 		}
  328. 		if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
  329. 			kill(child, SIGKILL);
  330. 			bail("failed to sync with parent: write(childpid)");
  331. 		}
  332. 		/* ... wait for parent to get the pid ... */
  333. 		if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
  334. 			kill(child, SIGKILL);
  335. 			bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
  336. 		}
  337. 		if (s != SYNC_RECVPID_ACK) {
  338. 			kill(child, SIGKILL);
  339. 			bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
  340. 		}
  341. 		s = SYNC_CHILD_READY;
  342. 		if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  343. 			kill(child, SIGKILL);
  344. 			bail("failed to sync with parent: write(SYNC_CHILD_READY)");
  345. 		}
  346. 		/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
  347. 		exit(0);
  348. 	}
  349. 	/*
  350. 	 * Stage 2: We're the final child process, and the only process that will
  351. 	 * actually return to the Go runtime. Our job is to just do the
  352. 	 * final cleanup steps and then return to the Go runtime to allow
  353. 	 * init_linux.go to run.
  354. 	 */
  355. 	case JUMP_INIT:{
  356. 		/*
  357. 		 * We're inside the child now, having jumped from the
  358. 		 * start_child() code after forking in the parent.
  359. 		 */
  360. 		enum sync_t s;
  361. 		/* We're in a child and thus need to tell the parent if we die. */
  362. 		syncfd = sync_grandchild_pipe[0];
  363. 		close(sync_grandchild_pipe[1]);
  364. 		close(sync_child_pipe[0]);
  365. 		close(sync_child_pipe[1]);
  366. 		/* For debugging. */
  367. 		prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
  368. 		if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  369. 			bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
  370. 		if (s != SYNC_GRANDCHILD)
  371. 			bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
  372. 		if (setsid() < 0)
  373. 			bail("setsid failed");
  374. 		if (setuid(0) < 0)
  375. 			bail("setuid failed");
  376. 		if (setgid(0) < 0)
  377. 			bail("setgid failed");
  378. 		if (!config.is_rootless_euid && config.is_setgroup) {
  379. 			if (setgroups(0, NULL) < 0)
  380. 				bail("setgroups failed");
  381. 		}
  382. 		/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
  383. 		if (config.cloneflags & CLONE_NEWCGROUP) {
  384. 			uint8_t value;
  385. 			if (read(pipenum, &value, sizeof(value)) != sizeof(value))
  386. 				bail("read synchronisation value failed");
  387. 			if (value == CREATECGROUPNS) {
  388. 				if (unshare(CLONE_NEWCGROUP) < 0)
  389. 					bail("failed to unshare cgroup namespace");
  390. 			} else
  391. 				bail("received unknown synchronisation value");
  392. 		}
  393. 		s = SYNC_CHILD_READY;
  394. 		if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  395. 			bail("failed to sync with parent: write(SYNC_CHILD_READY)");
  396. 		/* Close sync pipes. */
  397. 		close(sync_grandchild_pipe[0]);
  398. 		/* Free netlink data. */
  399. 		nl_free(&config);
  400. 		/* Finish executing, let the Go runtime take over. */
  401. 		return;
  402. 	}
  403. 	default:
  404. 		bail("unexpected jump value");
  405. 	}
  406. 	/* Should never be reached. */
  407. 	bail("should never be reached");
  408. }

2.2) libcontainer/setns_init_linux.go#linuxSetnsInit.Init

  1. // Init performs the final setup for a setns-type init and then hands control
  2. // to the configured command through system.Execv. In order it: joins a session
  3. // keyring (unless NoNewKeyring is set), sets up the console when requested,
  4. // applies no-new-privileges, the process label, seccomp, finalizeNamespace
  5. // and the AppArmor profile. The first failing step aborts with its error.
  6. func (l *linuxSetnsInit) Init() error {
  7. 	runtime.LockOSThread()
  8. 	defer runtime.UnlockOSThread()
  9. 	if !l.config.Config.NoNewKeyring {
  10. 		// Do not inherit the parent's session keyring.
  11. 		_, keyringErr := keys.JoinSessionKeyring(l.getSessionRingName())
  12. 		// ENOSYS is tolerated, for the same reason as in standard_init_linux.go.
  13. 		//
  14. 		// TODO(cyphar): And we should have logging here too.
  15. 		if keyringErr != nil && errors.Cause(keyringErr) != unix.ENOSYS {
  16. 			return errors.Wrap(keyringErr, "join session keyring")
  17. 		}
  18. 	}
  19. 	if l.config.CreateConsole {
  20. 		consoleErr := setupConsole(l.consoleSocket, l.config, false)
  21. 		if consoleErr != nil {
  22. 			return consoleErr
  23. 		}
  24. 		cttyErr := system.Setctty()
  25. 		if cttyErr != nil {
  26. 			return cttyErr
  27. 		}
  28. 	}
  29. 	if l.config.NoNewPrivileges {
  30. 		prctlErr := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)
  31. 		if prctlErr != nil {
  32. 			return prctlErr
  33. 		}
  34. 	}
  35. 	labelErr := label.SetProcessLabel(l.config.ProcessLabel)
  36. 	if labelErr != nil {
  37. 		return labelErr
  38. 	}
  39. 	defer label.SetProcessLabel("")
  40. 	// Installing seccomp without NoNewPrivileges is a privileged operation, so
  41. 	// in that case it has to happen before capabilities are dropped. With
  42. 	// NoNewPrivileges it is postponed until just before the exec (see below).
  43. 	if !l.config.NoNewPrivileges && l.config.Config.Seccomp != nil {
  44. 		seccompErr := seccomp.InitSeccomp(l.config.Config.Seccomp)
  45. 		if seccompErr != nil {
  46. 			return seccompErr
  47. 		}
  48. 	}
  49. 	finalizeErr := finalizeNamespace(l.config)
  50. 	if finalizeErr != nil {
  51. 		return finalizeErr
  52. 	}
  53. 	profileErr := apparmor.ApplyProfile(l.config.AppArmorProfile)
  54. 	if profileErr != nil {
  55. 		return profileErr
  56. 	}
  57. 	// Seccomp goes as close to the exec as possible so that as few syscalls as
  58. 	// possible take place afterward (fewer syscalls for users to enable in
  59. 	// their seccomp profiles).
  60. 	if l.config.NoNewPrivileges && l.config.Config.Seccomp != nil {
  61. 		seccompErr := seccomp.InitSeccomp(l.config.Config.Seccomp)
  62. 		if seccompErr != nil {
  63. 			return newSystemErrorWithCause(seccompErr, "init seccomp")
  64. 		}
  65. 	}
  66. 	argv := l.config.Args
  67. 	return system.Execv(argv[0], argv, os.Environ())
  68. }