1、runc使用
2、runc执行过程
根据用户的config.json构建逻辑container,然后调用container.Start()方法,此处重点关注container.Start()方法
2.1、创建exec fifo
exec fifi用户监听用户执行start命令,真正启动用户进程用的 execFifoFilename = “exec.fifo” root默认为/run/runc
// 若为1号进程,则需要创建exec fifo
// 当在容器内在起一个进程则不需要
func (c *linuxContainer) createExecFifo() error {
rootuid, err := c.Config().HostRootUID()
if err != nil {
return err
}
rootgid, err := c.Config().HostRootGID()
if err != nil {
return err
}
fifoName := filepath.Join(c.root, execFifoFilename)
if _, err := os.Stat(fifoName); err == nil {
return fmt.Errorf("exec fifo %s already exists", fifoName)
}
oldMask := unix.Umask(0000)
if err := unix.Mkfifo(fifoName, 0622); err != nil {
unix.Umask(oldMask)
return err
}
unix.Umask(oldMask)
return os.Chown(fifoName, rootuid, rootgid)
}
2.2、初始化父进程(create操作对应的为initProcess)
initProcess负责对runc init进行控制和信号处理
2.2.1、构建ParentProcess
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
// 构建通信pipe
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
messageSockPair := filePair{parentInitPipe, childInitPipe}
// 构建日志pipe
// parent负责读,child负责写
parentLogPipe, childLogPipe, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
}
logFilePair := filePair{parentLogPipe, childLogPipe}
// 此处就是封装runc init cmd,并且将用户二进制路径作为参数传递到runc init命令中
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
// 若不是第一次init,则封装为SetnsProcess
if !p.Init {
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
}
// We only set up fifoFd if we're not doing a `runc exec`. The historic
// reason for this is that previously we would pass a dirfd that allowed
// for container rootfs escape (and not doing it in `runc exec` avoided
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
if err := c.includeExecFifo(cmd); err != nil {
return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
}
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
}
2.2.1、构建initProcess
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
init := &initProcess{
cmd: cmd,
messageSockPair: messageSockPair,
logFilePair: logFilePair,
manager: c.cgroupManager,
intelRdtManager: c.intelRdtManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
}
c.initProcess = init
return init, nil
}
2.3、启动父进程(create操作对应的为initProcess)
启动父进程之前需要重新向子进程日志,就是从parentPipe读取日志,然后打到标准输出
func (p *initProcess) start() (retErr error) {
// 结束之后关闭parent的unix socket
defer p.messageSockPair.parent.Close()
// 启动子进程,此处为runc init
// 在factory创建逻辑容器的时候指定
err := p.cmd.Start()
p.process.ops = p
// 父进程不需要操作子进程端的文件句柄,因此需要close
p.messageSockPair.child.Close()
p.logFilePair.child.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
defer func() {
if retErr != nil {
// init might be killed by the kernel's OOM killer.
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// Does not matter what the particular error was,
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = newSystemErrorWithCause(retErr, oomError)
} else {
retErr = newSystemError(errors.New(oomError))
}
}
// terminate the process to ensure we can remove cgroups
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
p.manager.Destroy()
if p.intelRdtManager != nil {
p.intelRdtManager.Destroy()
}
}
}()
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
}
}
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(childPid)
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
}
p.setExternalDescriptors(fds)
// Now it's time to setup cgroup namesapce
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
}
}
// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return newSystemErrorWithCause(err, "waiting for our first child to exit")
}
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.updateSpecState(); err != nil {
return newSystemErrorWithCause(err, "updating the spec state")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
sentRun bool
sentResume bool
)
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
switch sync.Type {
case procReady:
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart and CreateRuntime hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
}
}
if p.config.Config.Hooks != nil {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
}
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
}
}
}
// generate a timestamp indicating when the container was started
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
}
// NOTE: If the procRun state has been synced and the
// runc-create process has been killed for some reason,
// the runc-init[2:stage] process will be leaky. And
// the runc command also fails to parse root directory
// because the container doesn't have state.json.
//
// In order to cleanup the runc-init[2:stage] by
// runc-delete/stop, we should store the status before
// procRun sync.
state, uerr := p.container.updateState(p)
if uerr != nil {
return newSystemErrorWithCause(err, "store init state")
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
}
sentRun = true
case procHooks:
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
}
}
if p.config.Config.Hooks != nil {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
}
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
}
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
}
sentResume = true
default:
return newSystemError(errors.New("invalid JSON payload from child"))
}
return nil
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
}
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
2.4、runc init进程
func (l *linuxStandardInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("")
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
// If keyrings aren't supported then it is likely we are on an
// older kernel (or inside an LXC container). While we could bail,
// the security feature we are using here is best-effort (it only
// really provides marginal protection since VFS credentials are
// the only significant protection of keyrings).
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
}
} else {
// Make session keyring searcheable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return errors.Wrap(err, "mod keyring permissions")
}
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
if err := setupRoute(l.config.Config); err != nil {
return err
}
if err := prepareRootfs(l.pipe, l.config); err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return errors.Wrap(err, "setctty")
}
}
// 若开启了mount ns,dev设备需要重新mount
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
// 设置主机名
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return errors.Wrap(err, "sethostname")
}
}
// 配置AppArmorProfile
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return errors.Wrap(err, "apply apparmor profile")
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return errors.Wrapf(err, "write sysctl key %s", key)
}
}
// 设置只读路径和MaskPaths
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return errors.Wrapf(err, "readonly path %s", path)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return errors.Wrapf(err, "mask path %s", path)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return errors.Wrap(err, "get pdeath signal")
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return errors.Wrap(err, "set nonewprivileges")
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return errors.Wrap(err, "sync ready")
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return errors.Wrap(err, "set process label")
}
defer selinux.SetExecLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return errors.Wrap(err, "restore pdeath signal")
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
// and we were reparented to something else so we should just kill ourself
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Close the pipe to signal that we have completed our init.
l.pipe.Close()
//-----------------------------------------------------------
//----------以下需要runc start {container_id}才能执行----------
//-----------------------------------------------------------
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo")
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
s := l.config.SpecState
s.Pid = unix.Getpid()
s.Status = specs.StateCreated
if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
return err
}
// 切换镜像
if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}