1、runc使用

2、runc执行过程

根据用户的config.json构建逻辑container,然后调用container.Start()方法,此处重点关注container.Start()方法

2.1、创建exec fifo

exec fifo用于监听用户执行start命令,是真正启动用户进程时使用的管道。其中 execFifoFilename = "exec.fifo",root默认为/run/runc

  1. // 若为1号进程,则需要创建exec fifo
  2. // 当在容器内在起一个进程则不需要
  3. func (c *linuxContainer) createExecFifo() error {
  4. rootuid, err := c.Config().HostRootUID()
  5. if err != nil {
  6. return err
  7. }
  8. rootgid, err := c.Config().HostRootGID()
  9. if err != nil {
  10. return err
  11. }
  12. fifoName := filepath.Join(c.root, execFifoFilename)
  13. if _, err := os.Stat(fifoName); err == nil {
  14. return fmt.Errorf("exec fifo %s already exists", fifoName)
  15. }
  16. oldMask := unix.Umask(0000)
  17. if err := unix.Mkfifo(fifoName, 0622); err != nil {
  18. unix.Umask(oldMask)
  19. return err
  20. }
  21. unix.Umask(oldMask)
  22. return os.Chown(fifoName, rootuid, rootgid)
  23. }

2.2、初始化父进程(create操作对应的为initProcess)

initProcess负责对runc init进行控制和信号处理

2.2.1、构建ParentProcess

  1. func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
  2. // 构建通信pipe
  3. parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
  4. if err != nil {
  5. return nil, newSystemErrorWithCause(err, "creating new init pipe")
  6. }
  7. messageSockPair := filePair{parentInitPipe, childInitPipe}
  8. // 构建日志pipe
  9. // parent负责读,child负责写
  10. parentLogPipe, childLogPipe, err := os.Pipe()
  11. if err != nil {
  12. return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
  13. }
  14. logFilePair := filePair{parentLogPipe, childLogPipe}
  15. // 此处就是封装runc init cmd,并且将用户二进制路径作为参数传递到runc init命令中
  16. cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
  17. // 若不是第一次init,则封装为SetnsProcess
  18. if !p.Init {
  19. return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
  20. }
  21. // We only set up fifoFd if we're not doing a `runc exec`. The historic
  22. // reason for this is that previously we would pass a dirfd that allowed
  23. // for container rootfs escape (and not doing it in `runc exec` avoided
  24. // that problem), but we no longer do that. However, there's no need to do
  25. // this for `runc exec` so we just keep it this way to be safe.
  26. if err := c.includeExecFifo(cmd); err != nil {
  27. return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
  28. }
  29. return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
  30. }

2.2.2、构建initProcess

  1. func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
  2. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
  3. nsMaps := make(map[configs.NamespaceType]string)
  4. for _, ns := range c.config.Namespaces {
  5. if ns.Path != "" {
  6. nsMaps[ns.Type] = ns.Path
  7. }
  8. }
  9. _, sharePidns := nsMaps[configs.NEWPID]
  10. data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
  11. if err != nil {
  12. return nil, err
  13. }
  14. init := &initProcess{
  15. cmd: cmd,
  16. messageSockPair: messageSockPair,
  17. logFilePair: logFilePair,
  18. manager: c.cgroupManager,
  19. intelRdtManager: c.intelRdtManager,
  20. config: c.newInitConfig(p),
  21. container: c,
  22. process: p,
  23. bootstrapData: data,
  24. sharePidns: sharePidns,
  25. }
  26. c.initProcess = init
  27. return init, nil
  28. }

2.3、启动父进程(create操作对应的为initProcess)

启动父进程之前需要重定向子进程日志,就是从parentPipe读取日志,然后打到标准输出

  1. func (p *initProcess) start() (retErr error) {
  2. // 结束之后关闭parent的unix socket
  3. defer p.messageSockPair.parent.Close()
  4. // 启动子进程,此处为runc init
  5. // 在factory创建逻辑容器的时候指定
  6. err := p.cmd.Start()
  7. p.process.ops = p
  8. // 父进程不需要操作子进程端的文件句柄,因此需要close
  9. p.messageSockPair.child.Close()
  10. p.logFilePair.child.Close()
  11. if err != nil {
  12. p.process.ops = nil
  13. return newSystemErrorWithCause(err, "starting init process command")
  14. }
  15. defer func() {
  16. if retErr != nil {
  17. // init might be killed by the kernel's OOM killer.
  18. oom, err := p.manager.OOMKillCount()
  19. if err != nil {
  20. logrus.WithError(err).Warn("unable to get oom kill count")
  21. } else if oom > 0 {
  22. // Does not matter what the particular error was,
  23. // its cause is most probably OOM, so report that.
  24. const oomError = "container init was OOM-killed (memory limit too low?)"
  25. if logrus.GetLevel() >= logrus.DebugLevel {
  26. // Only show the original error if debug is set,
  27. // as it is not generally very useful.
  28. retErr = newSystemErrorWithCause(retErr, oomError)
  29. } else {
  30. retErr = newSystemError(errors.New(oomError))
  31. }
  32. }
  33. // terminate the process to ensure we can remove cgroups
  34. if err := ignoreTerminateErrors(p.terminate()); err != nil {
  35. logrus.WithError(err).Warn("unable to terminate initProcess")
  36. }
  37. p.manager.Destroy()
  38. if p.intelRdtManager != nil {
  39. p.intelRdtManager.Destroy()
  40. }
  41. }
  42. }()
  43. // Do this before syncing with child so that no children can escape the
  44. // cgroup. We don't need to worry about not doing this and not being root
  45. // because we'd be using the rootless cgroup manager in that case.
  46. if err := p.manager.Apply(p.pid()); err != nil {
  47. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  48. }
  49. if p.intelRdtManager != nil {
  50. if err := p.intelRdtManager.Apply(p.pid()); err != nil {
  51. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  52. }
  53. }
  54. if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
  55. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  56. }
  57. childPid, err := p.getChildPid()
  58. if err != nil {
  59. return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
  60. }
  61. // Save the standard descriptor names before the container process
  62. // can potentially move them (e.g., via dup2()). If we don't do this now,
  63. // we won't know at checkpoint time which file descriptor to look up.
  64. fds, err := getPipeFds(childPid)
  65. if err != nil {
  66. return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
  67. }
  68. p.setExternalDescriptors(fds)
  69. // Now it's time to setup cgroup namesapce
  70. if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
  71. if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
  72. return newSystemErrorWithCause(err, "sending synchronization value to init process")
  73. }
  74. }
  75. // Wait for our first child to exit
  76. if err := p.waitForChildExit(childPid); err != nil {
  77. return newSystemErrorWithCause(err, "waiting for our first child to exit")
  78. }
  79. if err := p.createNetworkInterfaces(); err != nil {
  80. return newSystemErrorWithCause(err, "creating network interfaces")
  81. }
  82. if err := p.updateSpecState(); err != nil {
  83. return newSystemErrorWithCause(err, "updating the spec state")
  84. }
  85. if err := p.sendConfig(); err != nil {
  86. return newSystemErrorWithCause(err, "sending config to init process")
  87. }
  88. var (
  89. sentRun bool
  90. sentResume bool
  91. )
  92. ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
  93. switch sync.Type {
  94. case procReady:
  95. // set rlimits, this has to be done here because we lose permissions
  96. // to raise the limits once we enter a user-namespace
  97. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  98. return newSystemErrorWithCause(err, "setting rlimits for ready process")
  99. }
  100. // call prestart and CreateRuntime hooks
  101. if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
  102. // Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
  103. if err := p.manager.Set(p.config.Config); err != nil {
  104. return newSystemErrorWithCause(err, "setting cgroup config for ready process")
  105. }
  106. if p.intelRdtManager != nil {
  107. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  108. return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
  109. }
  110. }
  111. if p.config.Config.Hooks != nil {
  112. s, err := p.container.currentOCIState()
  113. if err != nil {
  114. return err
  115. }
  116. // initProcessStartTime hasn't been set yet.
  117. s.Pid = p.cmd.Process.Pid
  118. s.Status = specs.StateCreating
  119. hooks := p.config.Config.Hooks
  120. if err := hooks[configs.Prestart].RunHooks(s); err != nil {
  121. return err
  122. }
  123. if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
  124. return err
  125. }
  126. }
  127. }
  128. // generate a timestamp indicating when the container was started
  129. p.container.created = time.Now().UTC()
  130. p.container.state = &createdState{
  131. c: p.container,
  132. }
  133. // NOTE: If the procRun state has been synced and the
  134. // runc-create process has been killed for some reason,
  135. // the runc-init[2:stage] process will be leaky. And
  136. // the runc command also fails to parse root directory
  137. // because the container doesn't have state.json.
  138. //
  139. // In order to cleanup the runc-init[2:stage] by
  140. // runc-delete/stop, we should store the status before
  141. // procRun sync.
  142. state, uerr := p.container.updateState(p)
  143. if uerr != nil {
  144. return newSystemErrorWithCause(err, "store init state")
  145. }
  146. p.container.initProcessStartTime = state.InitProcessStartTime
  147. // Sync with child.
  148. if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
  149. return newSystemErrorWithCause(err, "writing syncT 'run'")
  150. }
  151. sentRun = true
  152. case procHooks:
  153. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  154. if err := p.manager.Set(p.config.Config); err != nil {
  155. return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
  156. }
  157. if p.intelRdtManager != nil {
  158. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  159. return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
  160. }
  161. }
  162. if p.config.Config.Hooks != nil {
  163. s, err := p.container.currentOCIState()
  164. if err != nil {
  165. return err
  166. }
  167. // initProcessStartTime hasn't been set yet.
  168. s.Pid = p.cmd.Process.Pid
  169. s.Status = specs.StateCreating
  170. hooks := p.config.Config.Hooks
  171. if err := hooks[configs.Prestart].RunHooks(s); err != nil {
  172. return err
  173. }
  174. if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
  175. return err
  176. }
  177. }
  178. // Sync with child.
  179. if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
  180. return newSystemErrorWithCause(err, "writing syncT 'resume'")
  181. }
  182. sentResume = true
  183. default:
  184. return newSystemError(errors.New("invalid JSON payload from child"))
  185. }
  186. return nil
  187. })
  188. if !sentRun {
  189. return newSystemErrorWithCause(ierr, "container init")
  190. }
  191. if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
  192. return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
  193. }
  194. if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
  195. return newSystemErrorWithCause(err, "shutting down init pipe")
  196. }
  197. // Must be done after Shutdown so the child will exit and we can wait for it.
  198. if ierr != nil {
  199. p.wait()
  200. return ierr
  201. }
  202. return nil
  203. }

2.4、runc init进程

  // Init performs the in-container setup for the standard init process and
  // finally execs the user's program.
  //
  // In order, it: locks the goroutine to its OS thread; joins a new session
  // keyring (unless NoNewKeyring is set); sets up network, routes and the
  // rootfs; sets up the console if requested; finalizes the rootfs when a
  // mount namespace is used; sets hostname, AppArmor profile, sysctls,
  // read-only and masked paths; optionally sets no_new_privs; signals
  // readiness to the parent; applies the SELinux exec label and (depending on
  // NoNewPrivileges) seccomp; finalizes the namespace; then blocks opening the
  // exec fifo, runs the StartContainer hooks and calls unix.Exec.
  //
  // It returns nil only if unix.Exec returns without error; any failing step
  // returns its error.
  func (l *linuxStandardInit) Init() error {
  	runtime.LockOSThread()
  	defer runtime.UnlockOSThread()
  	if !l.config.Config.NoNewKeyring {
  		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
  			return err
  		}
  		defer selinux.SetKeyLabel("")
  		ringname, keepperms, newperms := l.getSessionRingParams()
  		// Do not inherit the parent's session keyring.
  		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
  			// If keyrings aren't supported then it is likely we are on an
  			// older kernel (or inside an LXC container). While we could bail,
  			// the security feature we are using here is best-effort (it only
  			// really provides marginal protection since VFS credentials are
  			// the only significant protection of keyrings).
  			//
  			// TODO(cyphar): Log this so people know what's going on, once we
  			// have proper logging in 'runc init'.
  			if errors.Cause(err) != unix.ENOSYS {
  				return errors.Wrap(err, "join session keyring")
  			}
  		} else {
  			// Make session keyring searchable. If we've gotten this far we
  			// bail on any error -- we don't want to have a keyring with bad
  			// permissions.
  			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
  				return errors.Wrap(err, "mod keyring permissions")
  			}
  		}
  	}
  	if err := setupNetwork(l.config); err != nil {
  		return err
  	}
  	if err := setupRoute(l.config.Config); err != nil {
  		return err
  	}
  	if err := prepareRootfs(l.pipe, l.config); err != nil {
  		return err
  	}
  	// Set up the console. This has to be done *before* we finalize the rootfs,
  	// but *after* we've given the user the chance to set up all of the mounts
  	// they wanted.
  	if l.config.CreateConsole {
  		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
  			return err
  		}
  		if err := system.Setctty(); err != nil {
  			return errors.Wrap(err, "setctty")
  		}
  	}
  	// When a mount namespace is in use, finalize the rootfs. (The original
  	// note says device nodes need to be re-mounted in this case; that detail
  	// lives in finalizeRootfs and is not visible here.)
  	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
  		if err := finalizeRootfs(l.config.Config); err != nil {
  			return err
  		}
  	}
  	// Set the hostname, if one is configured.
  	if hostname := l.config.Config.Hostname; hostname != "" {
  		if err := unix.Sethostname([]byte(hostname)); err != nil {
  			return errors.Wrap(err, "sethostname")
  		}
  	}
  	// Apply the configured AppArmor profile.
  	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
  		return errors.Wrap(err, "apply apparmor profile")
  	}
  	for key, value := range l.config.Config.Sysctl {
  		if err := writeSystemProperty(key, value); err != nil {
  			return errors.Wrapf(err, "write sysctl key %s", key)
  		}
  	}
  	// Set up the read-only paths and the masked paths.
  	for _, path := range l.config.Config.ReadonlyPaths {
  		if err := readonlyPath(path); err != nil {
  			return errors.Wrapf(err, "readonly path %s", path)
  		}
  	}
  	for _, path := range l.config.Config.MaskPaths {
  		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
  			return errors.Wrapf(err, "mask path %s", path)
  		}
  	}
  	pdeath, err := system.GetParentDeathSignal()
  	if err != nil {
  		return errors.Wrap(err, "get pdeath signal")
  	}
  	if l.config.NoNewPrivileges {
  		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
  			return errors.Wrap(err, "set nonewprivileges")
  		}
  	}
  	// Tell our parent that we're ready to Execv. This must be done before the
  	// Seccomp rules have been applied, because we need to be able to read and
  	// write to a socket.
  	if err := syncParentReady(l.pipe); err != nil {
  		return errors.Wrap(err, "sync ready")
  	}
  	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
  		return errors.Wrap(err, "set process label")
  	}
  	defer selinux.SetExecLabel("")
  	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
  	// do this before dropping capabilities; otherwise do it as late as possible
  	// just before execve so as few syscalls take place after it as possible.
  	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
  		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  			return err
  		}
  	}
  	if err := finalizeNamespace(l.config); err != nil {
  		return err
  	}
  	// finalizeNamespace can change user/group which clears the parent death
  	// signal, so we restore it here.
  	if err := pdeath.Restore(); err != nil {
  		return errors.Wrap(err, "restore pdeath signal")
  	}
  	// Compare the parent from the initial start of the init process and make
  	// sure that it did not change. if the parent changes that means it died
  	// and we were reparented to something else so we should just kill ourself
  	// and not cause problems for someone else.
  	if unix.Getppid() != l.parentPid {
  		return unix.Kill(unix.Getpid(), unix.SIGKILL)
  	}
  	// Check for the arg before waiting to make sure it exists and it is
  	// returned as a create time error.
  	name, err := exec.LookPath(l.config.Args[0])
  	if err != nil {
  		return err
  	}
  	// Close the pipe to signal that we have completed our init.
  	l.pipe.Close()
  	//-----------------------------------------------------------
  	// Per the original notes, everything below only runs once
  	// `runc start {container_id}` has been executed.
  	//-----------------------------------------------------------
  	// Wait for the FIFO to be opened on the other side before exec-ing the
  	// user process. We open it through /proc/self/fd/$fd, because the fd that
  	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
  	// re-open an O_PATH fd through /proc.
  	fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
  	if err != nil {
  		return newSystemErrorWithCause(err, "open exec fifo")
  	}
  	if _, err := unix.Write(fd, []byte("0")); err != nil {
  		return newSystemErrorWithCause(err, "write 0 exec fifo")
  	}
  	// Close the O_PATH fifofd fd before exec because the kernel resets
  	// dumpable in the wrong order. This has been fixed in newer kernels, but
  	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
  	// N.B. the core issue itself (passing dirfds to the host filesystem) has
  	// since been resolved.
  	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
  	unix.Close(l.fifoFd)
  	// Set seccomp as close to execve as possible, so as few syscalls take
  	// place afterward (reducing the amount of syscalls that users need to
  	// enable in their seccomp profiles).
  	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
  		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  			return newSystemErrorWithCause(err, "init seccomp")
  		}
  	}
  	s := l.config.SpecState
  	s.Pid = unix.Getpid()
  	s.Status = specs.StateCreated
  	if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
  		return err
  	}
  	// Replace this process image with the user's program.
  	if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
  		return newSystemErrorWithCause(err, "exec user process")
  	}
  	return nil
  }