1、runc使用
2、runc执行过程
根据用户的config.json构建逻辑container,然后调用container.Start()方法,此处重点关注container.Start()方法
2.1、创建exec fifo
exec fifo用于监听用户执行start命令,从而真正启动用户进程。execFifoFilename = "exec.fifo",root默认为/run/runc
// 若为1号进程,则需要创建exec fifo// 当在容器内在起一个进程则不需要func (c *linuxContainer) createExecFifo() error {rootuid, err := c.Config().HostRootUID()if err != nil {return err}rootgid, err := c.Config().HostRootGID()if err != nil {return err}fifoName := filepath.Join(c.root, execFifoFilename)if _, err := os.Stat(fifoName); err == nil {return fmt.Errorf("exec fifo %s already exists", fifoName)}oldMask := unix.Umask(0000)if err := unix.Mkfifo(fifoName, 0622); err != nil {unix.Umask(oldMask)return err}unix.Umask(oldMask)return os.Chown(fifoName, rootuid, rootgid)}
2.2、初始化父进程(create操作对应的为initProcess)
initProcess负责对runc init进行控制和信号处理
2.2.1、构建ParentProcess
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {// 构建通信pipeparentInitPipe, childInitPipe, err := utils.NewSockPair("init")if err != nil {return nil, newSystemErrorWithCause(err, "creating new init pipe")}messageSockPair := filePair{parentInitPipe, childInitPipe}// 构建日志pipe// parent负责读,child负责写parentLogPipe, childLogPipe, err := os.Pipe()if err != nil {return nil, fmt.Errorf("Unable to create the log pipe: %s", err)}logFilePair := filePair{parentLogPipe, childLogPipe}// 此处就是封装runc init cmd,并且将用户二进制路径作为参数传递到runc init命令中cmd := c.commandTemplate(p, childInitPipe, childLogPipe)// 若不是第一次init,则封装为SetnsProcessif !p.Init {return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)}// We only set up fifoFd if we're not doing a `runc exec`. The historic// reason for this is that previously we would pass a dirfd that allowed// for container rootfs escape (and not doing it in `runc exec` avoided// that problem), but we no longer do that. However, there's no need to do// this for `runc exec` so we just keep it this way to be safe.if err := c.includeExecFifo(cmd); err != nil {return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")}return c.newInitProcess(p, cmd, messageSockPair, logFilePair)}
2.2.2、构建initProcess
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))nsMaps := make(map[configs.NamespaceType]string)for _, ns := range c.config.Namespaces {if ns.Path != "" {nsMaps[ns.Type] = ns.Path}}_, sharePidns := nsMaps[configs.NEWPID]data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)if err != nil {return nil, err}init := &initProcess{cmd: cmd,messageSockPair: messageSockPair,logFilePair: logFilePair,manager: c.cgroupManager,intelRdtManager: c.intelRdtManager,config: c.newInitConfig(p),container: c,process: p,bootstrapData: data,sharePidns: sharePidns,}c.initProcess = initreturn init, nil}
2.3、启动父进程(create操作对应的为initProcess)
启动父进程之前需要重定向子进程日志,即从parent端的日志pipe(parentLogPipe)读取日志,然后输出到标准输出
func (p *initProcess) start() (retErr error) {// 结束之后关闭parent的unix socketdefer p.messageSockPair.parent.Close()// 启动子进程,此处为runc init// 在factory创建逻辑容器的时候指定err := p.cmd.Start()p.process.ops = p// 父进程不需要操作子进程端的文件句柄,因此需要closep.messageSockPair.child.Close()p.logFilePair.child.Close()if err != nil {p.process.ops = nilreturn newSystemErrorWithCause(err, "starting init process command")}defer func() {if retErr != nil {// init might be killed by the kernel's OOM killer.oom, err := p.manager.OOMKillCount()if err != nil {logrus.WithError(err).Warn("unable to get oom kill count")} else if oom > 0 {// Does not matter what the particular error was,// its cause is most probably OOM, so report that.const oomError = "container init was OOM-killed (memory limit too low?)"if logrus.GetLevel() >= logrus.DebugLevel {// Only show the original error if debug is set,// as it is not generally very useful.retErr = newSystemErrorWithCause(retErr, oomError)} else {retErr = newSystemError(errors.New(oomError))}}// terminate the process to ensure we can remove cgroupsif err := ignoreTerminateErrors(p.terminate()); err != nil {logrus.WithError(err).Warn("unable to terminate initProcess")}p.manager.Destroy()if p.intelRdtManager != nil {p.intelRdtManager.Destroy()}}}()// Do this before syncing with child so that no children can escape the// cgroup. 
We don't need to worry about not doing this and not being root// because we'd be using the rootless cgroup manager in that case.if err := p.manager.Apply(p.pid()); err != nil {return newSystemErrorWithCause(err, "applying cgroup configuration for process")}if p.intelRdtManager != nil {if err := p.intelRdtManager.Apply(p.pid()); err != nil {return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")}}if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {return newSystemErrorWithCause(err, "copying bootstrap data to pipe")}childPid, err := p.getChildPid()if err != nil {return newSystemErrorWithCause(err, "getting the final child's pid from pipe")}// Save the standard descriptor names before the container process// can potentially move them (e.g., via dup2()). If we don't do this now,// we won't know at checkpoint time which file descriptor to look up.fds, err := getPipeFds(childPid)if err != nil {return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)}p.setExternalDescriptors(fds)// Now it's time to setup cgroup namesapceif p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {return newSystemErrorWithCause(err, "sending synchronization value to init process")}}// Wait for our first child to exitif err := p.waitForChildExit(childPid); err != nil {return newSystemErrorWithCause(err, "waiting for our first child to exit")}if err := p.createNetworkInterfaces(); err != nil {return newSystemErrorWithCause(err, "creating network interfaces")}if err := p.updateSpecState(); err != nil {return newSystemErrorWithCause(err, "updating the spec state")}if err := p.sendConfig(); err != nil {return newSystemErrorWithCause(err, "sending config to init process")}var (sentRun boolsentResume bool)ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {switch 
sync.Type {case procReady:// set rlimits, this has to be done here because we lose permissions// to raise the limits once we enter a user-namespaceif err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {return newSystemErrorWithCause(err, "setting rlimits for ready process")}// call prestart and CreateRuntime hooksif !p.config.Config.Namespaces.Contains(configs.NEWNS) {// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.if err := p.manager.Set(p.config.Config); err != nil {return newSystemErrorWithCause(err, "setting cgroup config for ready process")}if p.intelRdtManager != nil {if err := p.intelRdtManager.Set(p.config.Config); err != nil {return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")}}if p.config.Config.Hooks != nil {s, err := p.container.currentOCIState()if err != nil {return err}// initProcessStartTime hasn't been set yet.s.Pid = p.cmd.Process.Pids.Status = specs.StateCreatinghooks := p.config.Config.Hooksif err := hooks[configs.Prestart].RunHooks(s); err != nil {return err}if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {return err}}}// generate a timestamp indicating when the container was startedp.container.created = time.Now().UTC()p.container.state = &createdState{c: p.container,}// NOTE: If the procRun state has been synced and the// runc-create process has been killed for some reason,// the runc-init[2:stage] process will be leaky. 
And// the runc command also fails to parse root directory// because the container doesn't have state.json.//// In order to cleanup the runc-init[2:stage] by// runc-delete/stop, we should store the status before// procRun sync.state, uerr := p.container.updateState(p)if uerr != nil {return newSystemErrorWithCause(err, "store init state")}p.container.initProcessStartTime = state.InitProcessStartTime// Sync with child.if err := writeSync(p.messageSockPair.parent, procRun); err != nil {return newSystemErrorWithCause(err, "writing syncT 'run'")}sentRun = truecase procHooks:// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.if err := p.manager.Set(p.config.Config); err != nil {return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")}if p.intelRdtManager != nil {if err := p.intelRdtManager.Set(p.config.Config); err != nil {return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")}}if p.config.Config.Hooks != nil {s, err := p.container.currentOCIState()if err != nil {return err}// initProcessStartTime hasn't been set yet.s.Pid = p.cmd.Process.Pids.Status = specs.StateCreatinghooks := p.config.Config.Hooksif err := hooks[configs.Prestart].RunHooks(s); err != nil {return err}if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {return err}}// Sync with child.if err := writeSync(p.messageSockPair.parent, procResume); err != nil {return newSystemErrorWithCause(err, "writing syncT 'resume'")}sentResume = truedefault:return newSystemError(errors.New("invalid JSON payload from child"))}return nil})if !sentRun {return newSystemErrorWithCause(ierr, "container init")}if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))}if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {return 
newSystemErrorWithCause(err, "shutting down init pipe")}// Must be done after Shutdown so the child will exit and we can wait for it.if ierr != nil {p.wait()return ierr}return nil}
2.4、runc init进程
func (l *linuxStandardInit) Init() error {runtime.LockOSThread()defer runtime.UnlockOSThread()if !l.config.Config.NoNewKeyring {if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {return err}defer selinux.SetKeyLabel("")ringname, keepperms, newperms := l.getSessionRingParams()// Do not inherit the parent's session keyring.if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {// If keyrings aren't supported then it is likely we are on an// older kernel (or inside an LXC container). While we could bail,// the security feature we are using here is best-effort (it only// really provides marginal protection since VFS credentials are// the only significant protection of keyrings).//// TODO(cyphar): Log this so people know what's going on, once we// have proper logging in 'runc init'.if errors.Cause(err) != unix.ENOSYS {return errors.Wrap(err, "join session keyring")}} else {// Make session keyring searcheable. If we've gotten this far we// bail on any error -- we don't want to have a keyring with bad// permissions.if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {return errors.Wrap(err, "mod keyring permissions")}}}if err := setupNetwork(l.config); err != nil {return err}if err := setupRoute(l.config.Config); err != nil {return err}if err := prepareRootfs(l.pipe, l.config); err != nil {return err}// Set up the console. 
This has to be done *before* we finalize the rootfs,// but *after* we've given the user the chance to set up all of the mounts// they wanted.if l.config.CreateConsole {if err := setupConsole(l.consoleSocket, l.config, true); err != nil {return err}if err := system.Setctty(); err != nil {return errors.Wrap(err, "setctty")}}// 若开启了mount ns,dev设备需要重新mountif l.config.Config.Namespaces.Contains(configs.NEWNS) {if err := finalizeRootfs(l.config.Config); err != nil {return err}}// 设置主机名if hostname := l.config.Config.Hostname; hostname != "" {if err := unix.Sethostname([]byte(hostname)); err != nil {return errors.Wrap(err, "sethostname")}}// 配置AppArmorProfileif err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {return errors.Wrap(err, "apply apparmor profile")}for key, value := range l.config.Config.Sysctl {if err := writeSystemProperty(key, value); err != nil {return errors.Wrapf(err, "write sysctl key %s", key)}}// 设置只读路径和MaskPathsfor _, path := range l.config.Config.ReadonlyPaths {if err := readonlyPath(path); err != nil {return errors.Wrapf(err, "readonly path %s", path)}}for _, path := range l.config.Config.MaskPaths {if err := maskPath(path, l.config.Config.MountLabel); err != nil {return errors.Wrapf(err, "mask path %s", path)}}pdeath, err := system.GetParentDeathSignal()if err != nil {return errors.Wrap(err, "get pdeath signal")}if l.config.NoNewPrivileges {if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {return errors.Wrap(err, "set nonewprivileges")}}// Tell our parent that we're ready to Execv. 
This must be done before the// Seccomp rules have been applied, because we need to be able to read and// write to a socket.if err := syncParentReady(l.pipe); err != nil {return errors.Wrap(err, "sync ready")}if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {return errors.Wrap(err, "set process label")}defer selinux.SetExecLabel("")// Without NoNewPrivileges seccomp is a privileged operation, so we need to// do this before dropping capabilities; otherwise do it as late as possible// just before execve so as few syscalls take place after it as possible.if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {return err}}if err := finalizeNamespace(l.config); err != nil {return err}// finalizeNamespace can change user/group which clears the parent death// signal, so we restore it here.if err := pdeath.Restore(); err != nil {return errors.Wrap(err, "restore pdeath signal")}// Compare the parent from the initial start of the init process and make// sure that it did not change. if the parent changes that means it died// and we were reparented to something else so we should just kill ourself// and not cause problems for someone else.if unix.Getppid() != l.parentPid {return unix.Kill(unix.Getpid(), unix.SIGKILL)}// Check for the arg before waiting to make sure it exists and it is// returned as a create time error.name, err := exec.LookPath(l.config.Args[0])if err != nil {return err}// Close the pipe to signal that we have completed our init.l.pipe.Close()//-----------------------------------------------------------//----------以下需要runc start {container_id}才能执行----------//-----------------------------------------------------------// Wait for the FIFO to be opened on the other side before exec-ing the// user process. We open it through /proc/self/fd/$fd, because the fd that// was given to us was an O_PATH fd to the fifo itself. 
Linux allows us to// re-open an O_PATH fd through /proc.fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)if err != nil {return newSystemErrorWithCause(err, "open exec fifo")}if _, err := unix.Write(fd, []byte("0")); err != nil {return newSystemErrorWithCause(err, "write 0 exec fifo")}// Close the O_PATH fifofd fd before exec because the kernel resets// dumpable in the wrong order. This has been fixed in newer kernels, but// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.// N.B. the core issue itself (passing dirfds to the host filesystem) has// since been resolved.// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318unix.Close(l.fifoFd)// Set seccomp as close to execve as possible, so as few syscalls take// place afterward (reducing the amount of syscalls that users need to// enable in their seccomp profiles).if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {return newSystemErrorWithCause(err, "init seccomp")}}s := l.config.SpecStates.Pid = unix.Getpid()s.Status = specs.StateCreatedif err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {return err}// 切换镜像if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil {return newSystemErrorWithCause(err, "exec user process")}return nil}
