create.go#Action(入口)

1、检查命令行参数个数必须恰好为1个,即容器id
2、读取当前目录下的config.json,JSON反序列化为spec对象
3、启动容器startContainer,传入操作类型为CREATE
4、根据startContainer的返回值,退出

  1. Action: func(context *cli.Context) error {
  2. if err := checkArgs(context, 1, exactArgs); err != nil {
  3. return err
  4. }
  5. if err := revisePidFile(context); err != nil {
  6. return err
  7. }
  8. spec, err := setupSpec(context)
  9. if err != nil {
  10. return err
  11. }
  12. status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
  13. if err != nil {
  14. return err
  15. }
  16. // exit with the container's exit status so any external supervisor is
  17. // notified of the exit with the correct exit status.
  18. os.Exit(status)
  19. return nil
  20. },

1) utils.go#setupSpec(加载spec对象)

  1. // setupSpec performs initial setup based on the cli.Context for the container
  2. func setupSpec(context *cli.Context) (*specs.Spec, error) {
  3. bundle := context.String("bundle")
  4. if bundle != "" {
  5. if err := os.Chdir(bundle); err != nil {
  6. return nil, err
  7. }
  8. }
  9. // ********************************** NOTICE ********************************** //
  10. spec, err := loadSpec(specConfig)
  11. // ********************************** NOTICE ********************************** //
  12. if err != nil {
  13. return nil, err
  14. }
  15. return spec, nil
  16. }

1.1) spec.go#loadSpec(加载config.json)

  1. // loadSpec loads the specification from the provided path.
  2. func loadSpec(cPath string) (spec *specs.Spec, err error) {
  3. cf, err := os.Open(cPath)
  4. if err != nil {
  5. if os.IsNotExist(err) {
  6. return nil, fmt.Errorf("JSON specification file %s not found", cPath)
  7. }
  8. return nil, err
  9. }
  10. defer cf.Close()
  11. if err = json.NewDecoder(cf).Decode(&spec); err != nil {
  12. return nil, err
  13. }
  14. return spec, validateProcessSpec(spec.Process)
  15. }
  16. func validateProcessSpec(spec *specs.Process) error {
  17. if spec.Cwd == "" {
  18. return fmt.Errorf("Cwd property must not be empty")
  19. }
  20. if !filepath.IsAbs(spec.Cwd) {
  21. return fmt.Errorf("Cwd must be an absolute path")
  22. }
  23. if len(spec.Args) == 0 {
  24. return fmt.Errorf("args must not be empty")
  25. }
  26. return nil
  27. }

2) utils_linux.go#startContainer(create、run、restore入口)

1、检查容器id是否传入
2、调用createContainer方法创建一个容器
3、构造一个runner对象,并调用其run方法。

  1. type CtAct uint8 // CtAct selects the action startContainer hands to the runner.
  2. const (
  3. CT_ACT_CREATE CtAct = iota + 1 // passed by the create command; iota + 1 makes this 1, leaving 0 unassigned
  4. CT_ACT_RUN // value 2; presumably passed by the run command
  5. CT_ACT_RESTORE // value 3; presumably passed by the restore command
  6. )
  7. func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
  8. id := context.Args().First()
  9. if id == "" {
  10. return -1, errEmptyID
  11. }
  12. notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
  13. if notifySocket != nil {
  14. notifySocket.setupSpec(context, spec)
  15. }
  16. // ********************************** NOTICE ********************************** //
  17. container, err := createContainer(context, id, spec)
  18. // ********************************** NOTICE ********************************** //
  19. if err != nil {
  20. return -1, err
  21. }
  22. if notifySocket != nil {
  23. err := notifySocket.setupSocket()
  24. if err != nil {
  25. return -1, err
  26. }
  27. }
  28. // Support on-demand socket activation by passing file descriptors into the container init process.
  29. listenFDs := []*os.File{}
  30. if os.Getenv("LISTEN_FDS") != "" {
  31. listenFDs = activation.Files(false)
  32. }
  33. r := &runner{
  34. enableSubreaper: !context.Bool("no-subreaper"),
  35. shouldDestroy: true,
  36. container: container,
  37. listenFDs: listenFDs,
  38. notifySocket: notifySocket,
  39. consoleSocket: context.String("console-socket"),
  40. detach: context.Bool("detach"),
  41. pidFile: context.String("pid-file"),
  42. preserveFDs: context.Int("preserve-fds"),
  43. action: action,
  44. criuOpts: criuOpts,
  45. init: true,
  46. }
  47. // ********************************** NOTICE ********************************** //
  48. return r.run(spec.Process)
  49. // ********************************** NOTICE ********************************** //
  50. }

2.1) utils_linux.go#createContainer

1、将spec对象转为config对象
2、获取container的工厂对象
3、调用工厂的create方法

  1. func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
  2. rootlessCg, err := shouldUseRootlessCgroupManager(context)
  3. if err != nil {
  4. return nil, err
  5. }
  6. // ********************************** NOTICE ********************************** //
  7. config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
  8. CgroupName: id,
  9. UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
  10. NoPivotRoot: context.Bool("no-pivot"),
  11. NoNewKeyring: context.Bool("no-new-keyring"),
  12. Spec: spec,
  13. RootlessEUID: os.Geteuid() != 0,
  14. RootlessCgroups: rootlessCg,
  15. })
  16. // ********************************** NOTICE ********************************** //
  17. if err != nil {
  18. return nil, err
  19. }
  20. // ********************************** NOTICE ********************************** //
  21. factory, err := loadFactory(context)
  22. // ********************************** NOTICE ********************************** //
  23. if err != nil {
  24. return nil, err
  25. }
  26. // ********************************** NOTICE ********************************** //
  27. return factory.Create(id, config)
  28. // ********************************** NOTICE ********************************** //
  29. }

2.1.1) libcontainer/specconv/spec_linux.go#CreateLibcontainerConfig(spec转config)

spec

  1. // Spec is the base configuration for the container; loadSpec decodes it from the JSON specification file.
  2. type Spec struct {
  3. // Version of the Open Container Initiative Runtime Specification with which the bundle complies.
  4. Version string `json:"ociVersion"`
  5. // Process configures the container process. It is a pointer, so it stays nil when the JSON has no "process" key.
  6. Process *Process `json:"process,omitempty"`
  7. // Root configures the container's root filesystem. CreateLibcontainerConfig rejects a nil Root.
  8. Root *Root `json:"root,omitempty"`
  9. // Hostname configures the container's hostname.
  10. Hostname string `json:"hostname,omitempty"`
  11. // Mounts configures additional mounts (on top of Root).
  12. Mounts []Mount `json:"mounts,omitempty"`
  13. // Hooks configures callbacks for container lifecycle events.
  14. Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris"`
  15. // Annotations contains arbitrary metadata for the container.
  16. Annotations map[string]string `json:"annotations,omitempty"`
  17. // Linux is platform-specific configuration for Linux based containers. It may be nil; consumers check before use.
  18. Linux *Linux `json:"linux,omitempty" platform:"linux"`
  19. // Solaris is platform-specific configuration for Solaris based containers.
  20. Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"`
  21. // Windows is platform-specific configuration for Windows based containers.
  22. Windows *Windows `json:"windows,omitempty" platform:"windows"`
  23. // VM specifies configuration for virtual-machine-based containers.
  24. VM *VM `json:"vm,omitempty" platform:"vm"`
  25. }
  26. // Process contains information to start a specific application inside the container.
  27. type Process struct {
  28. // Terminal creates an interactive terminal for the container.
  29. Terminal bool `json:"terminal,omitempty"`
  30. // ConsoleSize specifies the size of the console.
  31. ConsoleSize *Box `json:"consoleSize,omitempty"`
  32. // User specifies user information for the process.
  33. User User `json:"user"`
  34. // Args specifies the binary and arguments for the application to execute.
  35. Args []string `json:"args"`
  36. // Env populates the process environment for the process.
  37. Env []string `json:"env,omitempty"`
  38. // Cwd is the current working directory for the process. It must be an
  39. // absolute path, interpreted relative to the container's root.
  40. Cwd string `json:"cwd"`
  41. // Capabilities are Linux capabilities that are kept for the process.
  42. Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
  43. // Rlimits specifies rlimit options to apply to the process.
  44. Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
  45. // NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
  46. NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
  47. // ApparmorProfile specifies the apparmor profile for the container.
  48. ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
  49. // OOMScoreAdj specifies an oom_score_adj for the container; nil means it was not set in the spec.
  50. OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"`
  51. // SelinuxLabel specifies the selinux context that the container process is run as.
  52. SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
  53. }
  54. // Linux contains platform-specific configuration for Linux based containers.
  55. type Linux struct {
  56. // UIDMappings specifies user mappings for supporting user namespaces.
  57. UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty"`
  58. // GIDMappings specifies group mappings for supporting user namespaces.
  59. GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty"`
  60. // Sysctl is a set of key-value pairs that are set for the container on start.
  61. Sysctl map[string]string `json:"sysctl,omitempty"`
  62. // Resources contains cgroup information for handling resource constraints
  63. // for the container.
  64. Resources *LinuxResources `json:"resources,omitempty"`
  65. // CgroupsPath specifies the path to cgroups that are created and/or joined by the container.
  66. // The path is expected to be relative to the cgroups mountpoint.
  67. // If resources are specified, the cgroups at CgroupsPath will be updated based on resources.
  68. CgroupsPath string `json:"cgroupsPath,omitempty"`
  69. // Namespaces contains the namespaces that are created and/or joined by the container.
  70. Namespaces []LinuxNamespace `json:"namespaces,omitempty"`
  71. // Devices is a list of device nodes that are created for the container.
  72. Devices []LinuxDevice `json:"devices,omitempty"`
  73. // Seccomp specifies the seccomp security settings for the container.
  74. Seccomp *LinuxSeccomp `json:"seccomp,omitempty"`
  75. // RootfsPropagation is the rootfs mount propagation mode for the container.
  76. RootfsPropagation string `json:"rootfsPropagation,omitempty"`
  77. // MaskedPaths masks over the provided paths inside the container.
  78. MaskedPaths []string `json:"maskedPaths,omitempty"`
  79. // ReadonlyPaths sets the provided paths as RO inside the container.
  80. ReadonlyPaths []string `json:"readonlyPaths,omitempty"`
  81. // MountLabel specifies the selinux context for the mounts in the container.
  82. MountLabel string `json:"mountLabel,omitempty"`
  83. // IntelRdt contains Intel Resource Director Technology (RDT) information for
  84. // handling resource constraints (e.g., L3 cache, memory bandwidth) for the container.
  85. IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"`
  86. }
  1. type CreateOpts struct { // CreateOpts bundles the inputs of CreateLibcontainerConfig.
  2. CgroupName string // cgroup name used when the spec sets no cgroupsPath; createContainer passes the container id
  3. UseSystemdCgroup bool // selects the systemd "slice:prefix:name" handling in createCgroupConfig
  4. NoPivotRoot bool // copied to Config.NoPivotRoot
  5. NoNewKeyring bool // copied to Config.NoNewKeyring
  6. Spec *specs.Spec // the spec to convert; dereferenced without a nil check
  7. RootlessEUID bool // copied to Config.RootlessEUID; createContainer sets it from os.Geteuid() != 0
  8. RootlessCgroups bool // copied to Config.RootlessCgroups
  9. }
  10. // CreateLibcontainerConfig creates a new libcontainer configuration from a
  11. // given specification and a cgroup name
  12. func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
  13. // runc's cwd will always be the bundle path
  14. rcwd, err := os.Getwd()
  15. if err != nil {
  16. return nil, err
  17. }
  18. cwd, err := filepath.Abs(rcwd)
  19. if err != nil {
  20. return nil, err
  21. }
  22. spec := opts.Spec
  23. if spec.Root == nil {
  24. return nil, fmt.Errorf("Root must be specified")
  25. }
  26. rootfsPath := spec.Root.Path
  27. if !filepath.IsAbs(rootfsPath) {
  28. rootfsPath = filepath.Join(cwd, rootfsPath)
  29. }
  30. labels := []string{}
  31. for k, v := range spec.Annotations {
  32. labels = append(labels, fmt.Sprintf("%s=%s", k, v))
  33. }
  34. config := &configs.Config{
  35. Rootfs: rootfsPath,
  36. NoPivotRoot: opts.NoPivotRoot,
  37. Readonlyfs: spec.Root.Readonly,
  38. Hostname: spec.Hostname,
  39. Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
  40. NoNewKeyring: opts.NoNewKeyring,
  41. RootlessEUID: opts.RootlessEUID,
  42. RootlessCgroups: opts.RootlessCgroups,
  43. }
  44. exists := false
  45. for _, m := range spec.Mounts {
  46. // ********************************** NOTICE ********************************** //
  47. config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
  48. // ********************************** NOTICE ********************************** //
  49. }
  50. // ********************************** NOTICE ********************************** //
  51. if err := createDevices(spec, config); err != nil {
  52. // ********************************** NOTICE ********************************** //
  53. return nil, err
  54. }
  55. // ********************************** NOTICE ********************************** //
  56. c, err := createCgroupConfig(opts)
  57. // ********************************** NOTICE ********************************** //
  58. if err != nil {
  59. return nil, err
  60. }
  61. config.Cgroups = c
  62. // set linux-specific config
  63. if spec.Linux != nil {
  64. if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
  65. return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
  66. }
  67. if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
  68. return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
  69. }
  70. for _, ns := range spec.Linux.Namespaces {
  71. t, exists := namespaceMapping[ns.Type]
  72. if !exists {
  73. return nil, fmt.Errorf("namespace %q does not exist", ns)
  74. }
  75. if config.Namespaces.Contains(t) {
  76. return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
  77. }
  78. config.Namespaces.Add(t, ns.Path)
  79. }
  80. if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
  81. config.Networks = []*configs.Network{
  82. {
  83. Type: "loopback",
  84. },
  85. }
  86. }
  87. if config.Namespaces.Contains(configs.NEWUSER) {
  88. if err := setupUserNamespace(spec, config); err != nil {
  89. return nil, err
  90. }
  91. }
  92. config.MaskPaths = spec.Linux.MaskedPaths
  93. config.ReadonlyPaths = spec.Linux.ReadonlyPaths
  94. config.MountLabel = spec.Linux.MountLabel
  95. config.Sysctl = spec.Linux.Sysctl
  96. if spec.Linux.Seccomp != nil {
  97. seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
  98. if err != nil {
  99. return nil, err
  100. }
  101. config.Seccomp = seccomp
  102. }
  103. if spec.Linux.IntelRdt != nil {
  104. config.IntelRdt = &configs.IntelRdt{}
  105. if spec.Linux.IntelRdt.L3CacheSchema != "" {
  106. config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
  107. }
  108. if spec.Linux.IntelRdt.MemBwSchema != "" {
  109. config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
  110. }
  111. }
  112. }
  113. if spec.Process != nil {
  114. config.OomScoreAdj = spec.Process.OOMScoreAdj
  115. if spec.Process.SelinuxLabel != "" {
  116. config.ProcessLabel = spec.Process.SelinuxLabel
  117. }
  118. if spec.Process.Capabilities != nil {
  119. config.Capabilities = &configs.Capabilities{
  120. Bounding: spec.Process.Capabilities.Bounding,
  121. Effective: spec.Process.Capabilities.Effective,
  122. Permitted: spec.Process.Capabilities.Permitted,
  123. Inheritable: spec.Process.Capabilities.Inheritable,
  124. Ambient: spec.Process.Capabilities.Ambient,
  125. }
  126. }
  127. }
  128. createHooks(spec, config)
  129. config.Version = specs.Version
  130. return config, nil
  131. }

config

  1. // Config defines configuration options for executing a process inside a contained environment.
  2. type Config struct {
  3. // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs.
  4. // This is a common option when the container is running in ramdisk.
  5. NoPivotRoot bool `json:"no_pivot_root"`
  6. // ParentDeathSignal specifies the signal that is sent to the container's process in the case
  7. // that the parent process dies.
  8. ParentDeathSignal int `json:"parent_death_signal"`
  9. // Path to a directory containing the container's root filesystem.
  10. Rootfs string `json:"rootfs"`
  11. // Readonlyfs will remount the container's rootfs as readonly where only externally mounted
  12. // bind mounts are writable.
  13. Readonlyfs bool `json:"readonlyfs"`
  14. // Specifies the mount propagation flags to be applied to /.
  15. RootPropagation int `json:"rootPropagation"`
  16. // Mounts specify additional source and destination paths that will be mounted inside the container's
  17. // rootfs and mount namespace if specified.
  18. Mounts []*Mount `json:"mounts"`
  19. // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
  20. Devices []*Device `json:"devices"`
  21. MountLabel string `json:"mount_label"` // CreateLibcontainerConfig copies this from the spec's Linux.MountLabel
  22. // Hostname optionally sets the container's hostname if provided.
  23. Hostname string `json:"hostname"`
  24. // Namespaces specifies the container's namespaces that it should set up when cloning the init process.
  25. // If a namespace is not provided that namespace is shared from the container's parent process.
  26. Namespaces Namespaces `json:"namespaces"`
  27. // Capabilities specify the capabilities to keep when executing the process inside the container.
  28. // All capabilities not specified will be dropped from the process's capability mask.
  29. Capabilities *Capabilities `json:"capabilities"`
  30. // Networks specifies the container's network setup to be created.
  31. Networks []*Network `json:"networks"`
  32. // Routes can be specified to create entries in the route table as the container is started.
  33. Routes []*Route `json:"routes"`
  34. // Cgroups specifies specific cgroup settings for the various subsystems that the container is
  35. // placed into to limit the resources the container has available.
  36. Cgroups *Cgroup `json:"cgroups"`
  37. // AppArmorProfile specifies the profile to apply to the process running in the container and is
  38. // changed at the time the process is execed.
  39. AppArmorProfile string `json:"apparmor_profile,omitempty"`
  40. // ProcessLabel specifies the label to apply to the process running in the container. It is
  41. // commonly used by selinux.
  42. ProcessLabel string `json:"process_label,omitempty"`
  43. // Rlimits specifies the resource limits, such as max open files, to set in the container.
  44. // If Rlimits are not set, the container will inherit rlimits from the parent process.
  45. Rlimits []Rlimit `json:"rlimits,omitempty"`
  46. // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
  47. // for a process. Valid values are in the range [-1000, 1000], where processes with
  48. // higher scores are preferred for being killed. If it is unset then we don't touch the current
  49. // value.
  50. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
  51. OomScoreAdj *int `json:"oom_score_adj,omitempty"`
  52. // UidMappings is an array of User ID mappings for User Namespaces.
  53. UidMappings []IDMap `json:"uid_mappings"`
  54. // GidMappings is an array of Group ID mappings for User Namespaces.
  55. GidMappings []IDMap `json:"gid_mappings"`
  56. // MaskPaths specifies paths within the container's rootfs to mask over with a bind
  57. // mount pointing to /dev/null so as to prevent reads of the file.
  58. MaskPaths []string `json:"mask_paths"`
  59. // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
  60. // so that these files cannot be written to.
  61. ReadonlyPaths []string `json:"readonly_paths"`
  62. // Sysctl is a map of properties and their values. It is the equivalent of using
  63. // sysctl -w my.property.name value in Linux.
  64. Sysctl map[string]string `json:"sysctl"`
  65. // Seccomp allows actions to be taken whenever a syscall is made within the container.
  66. // A number of rules are given, each having an action to be taken if a syscall matches it.
  67. // A default action to be taken if no rules match is also given.
  68. Seccomp *Seccomp `json:"seccomp"`
  69. // NoNewPrivileges controls whether processes in the container can gain additional privileges.
  70. NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
  71. // Hooks are a collection of actions to perform at various container lifecycle events.
  72. // CommandHooks are serialized to JSON, but other hooks are not.
  73. Hooks *Hooks
  74. // Version is the version of opencontainer specification that is supported.
  75. Version string `json:"version"`
  76. // Labels are user defined metadata that is stored in the config and populated on the state.
  77. Labels []string `json:"labels"`
  78. // NoNewKeyring will not allocate a new session keyring for the container. It will use the
  79. // caller's keyring in this case.
  80. NoNewKeyring bool `json:"no_new_keyring"`
  81. // IntelRdt specifies settings for Intel RDT group that the container is placed into
  82. // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available.
  83. IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
  84. // RootlessEUID is set when the runc was launched with non-zero EUID.
  85. // Note that RootlessEUID is set to false when launched with EUID=0 in userns.
  86. // When RootlessEUID is set, runc creates a new userns for the container.
  87. // (config.json needs to contain userns settings)
  88. RootlessEUID bool `json:"rootless_euid,omitempty"`
  89. // RootlessCgroups is set when unlikely to have the full access to cgroups.
  90. // When RootlessCgroups is set, cgroups errors are ignored.
  91. RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
  92. }

2.1.1.1) libcontainer/specconv/spec_linux.go#createLibcontainerMount

  1. func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
  2. flags, pgflags, data, ext := parseMountOptions(m.Options)
  3. source := m.Source
  4. device := m.Type
  5. if flags&unix.MS_BIND != 0 {
  6. if device == "" {
  7. device = "bind"
  8. }
  9. if !filepath.IsAbs(source) {
  10. source = filepath.Join(cwd, m.Source)
  11. }
  12. }
  13. return &configs.Mount{
  14. Device: device,
  15. Source: source,
  16. Destination: m.Destination,
  17. Data: data,
  18. Flags: flags,
  19. PropagationFlags: pgflags,
  20. Extensions: ext,
  21. }
  22. }

2.1.1.2) libcontainer/specconv/spec_linux.go#createDevices

  1. func createDevices(spec *specs.Spec, config *configs.Config) error {
  2. // add whitelisted devices
  3. config.Devices = []*configs.Device{
  4. {
  5. Type: 'c',
  6. Path: "/dev/null",
  7. Major: 1,
  8. Minor: 3,
  9. FileMode: 0666,
  10. Uid: 0,
  11. Gid: 0,
  12. },
  13. {
  14. Type: 'c',
  15. Path: "/dev/random",
  16. Major: 1,
  17. Minor: 8,
  18. FileMode: 0666,
  19. Uid: 0,
  20. Gid: 0,
  21. },
  22. {
  23. Type: 'c',
  24. Path: "/dev/full",
  25. Major: 1,
  26. Minor: 7,
  27. FileMode: 0666,
  28. Uid: 0,
  29. Gid: 0,
  30. },
  31. {
  32. Type: 'c',
  33. Path: "/dev/tty",
  34. Major: 5,
  35. Minor: 0,
  36. FileMode: 0666,
  37. Uid: 0,
  38. Gid: 0,
  39. },
  40. {
  41. Type: 'c',
  42. Path: "/dev/zero",
  43. Major: 1,
  44. Minor: 5,
  45. FileMode: 0666,
  46. Uid: 0,
  47. Gid: 0,
  48. },
  49. {
  50. Type: 'c',
  51. Path: "/dev/urandom",
  52. Major: 1,
  53. Minor: 9,
  54. FileMode: 0666,
  55. Uid: 0,
  56. Gid: 0,
  57. },
  58. }
  59. // merge in additional devices from the spec
  60. if spec.Linux != nil {
  61. for _, d := range spec.Linux.Devices {
  62. var uid, gid uint32
  63. var filemode os.FileMode = 0666
  64. if d.UID != nil {
  65. uid = *d.UID
  66. }
  67. if d.GID != nil {
  68. gid = *d.GID
  69. }
  70. dt, err := stringToDeviceRune(d.Type)
  71. if err != nil {
  72. return err
  73. }
  74. if d.FileMode != nil {
  75. filemode = *d.FileMode
  76. }
  77. device := &configs.Device{
  78. Type: dt,
  79. Path: d.Path,
  80. Major: d.Major,
  81. Minor: d.Minor,
  82. FileMode: filemode,
  83. Uid: uid,
  84. Gid: gid,
  85. }
  86. config.Devices = append(config.Devices, device)
  87. }
  88. }
  89. return nil
  90. }

2.1.1.3) libcontainer/specconv/spec_linux.go#createCgroupConfig

  1. func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
  2. var (
  3. myCgroupPath string
  4. spec = opts.Spec
  5. useSystemdCgroup = opts.UseSystemdCgroup
  6. name = opts.CgroupName
  7. )
  8. c := &configs.Cgroup{
  9. Resources: &configs.Resources{},
  10. }
  11. if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
  12. myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
  13. if useSystemdCgroup {
  14. myCgroupPath = spec.Linux.CgroupsPath
  15. }
  16. }
  17. if useSystemdCgroup {
  18. if myCgroupPath == "" {
  19. c.Parent = "system.slice"
  20. c.ScopePrefix = "runc"
  21. c.Name = name
  22. } else {
  23. // Parse the path from expected "slice:prefix:name"
  24. // for e.g. "system.slice:docker:1234"
  25. parts := strings.Split(myCgroupPath, ":")
  26. if len(parts) != 3 {
  27. return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
  28. }
  29. c.Parent = parts[0]
  30. c.ScopePrefix = parts[1]
  31. c.Name = parts[2]
  32. }
  33. } else {
  34. if myCgroupPath == "" {
  35. c.Name = name
  36. }
  37. c.Path = myCgroupPath
  38. }
  39. // In rootless containers, any attempt to make cgroup changes is likely to fail.
  40. // libcontainer will validate this but ignores the error.
  41. c.Resources.AllowedDevices = allowedDevices
  42. if spec.Linux != nil {
  43. r := spec.Linux.Resources
  44. if r == nil {
  45. return c, nil
  46. }
  47. for i, d := range spec.Linux.Resources.Devices {
  48. var (
  49. t = "a"
  50. major = int64(-1)
  51. minor = int64(-1)
  52. )
  53. if d.Type != "" {
  54. t = d.Type
  55. }
  56. if d.Major != nil {
  57. major = *d.Major
  58. }
  59. if d.Minor != nil {
  60. minor = *d.Minor
  61. }
  62. if d.Access == "" {
  63. return nil, fmt.Errorf("device access at %d field cannot be empty", i)
  64. }
  65. dt, err := stringToCgroupDeviceRune(t)
  66. if err != nil {
  67. return nil, err
  68. }
  69. dd := &configs.Device{
  70. Type: dt,
  71. Major: major,
  72. Minor: minor,
  73. Permissions: d.Access,
  74. Allow: d.Allow,
  75. }
  76. c.Resources.Devices = append(c.Resources.Devices, dd)
  77. }
  78. if r.Memory != nil {
  79. if r.Memory.Limit != nil {
  80. c.Resources.Memory = *r.Memory.Limit
  81. }
  82. if r.Memory.Reservation != nil {
  83. c.Resources.MemoryReservation = *r.Memory.Reservation
  84. }
  85. if r.Memory.Swap != nil {
  86. c.Resources.MemorySwap = *r.Memory.Swap
  87. }
  88. if r.Memory.Kernel != nil {
  89. c.Resources.KernelMemory = *r.Memory.Kernel
  90. }
  91. if r.Memory.KernelTCP != nil {
  92. c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
  93. }
  94. if r.Memory.Swappiness != nil {
  95. c.Resources.MemorySwappiness = r.Memory.Swappiness
  96. }
  97. if r.Memory.DisableOOMKiller != nil {
  98. c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
  99. }
  100. }
  101. if r.CPU != nil {
  102. if r.CPU.Shares != nil {
  103. c.Resources.CpuShares = *r.CPU.Shares
  104. }
  105. if r.CPU.Quota != nil {
  106. c.Resources.CpuQuota = *r.CPU.Quota
  107. }
  108. if r.CPU.Period != nil {
  109. c.Resources.CpuPeriod = *r.CPU.Period
  110. }
  111. if r.CPU.RealtimeRuntime != nil {
  112. c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
  113. }
  114. if r.CPU.RealtimePeriod != nil {
  115. c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
  116. }
  117. if r.CPU.Cpus != "" {
  118. c.Resources.CpusetCpus = r.CPU.Cpus
  119. }
  120. if r.CPU.Mems != "" {
  121. c.Resources.CpusetMems = r.CPU.Mems
  122. }
  123. }
  124. if r.Pids != nil {
  125. c.Resources.PidsLimit = r.Pids.Limit
  126. }
  127. if r.BlockIO != nil {
  128. if r.BlockIO.Weight != nil {
  129. c.Resources.BlkioWeight = *r.BlockIO.Weight
  130. }
  131. if r.BlockIO.LeafWeight != nil {
  132. c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
  133. }
  134. if r.BlockIO.WeightDevice != nil {
  135. for _, wd := range r.BlockIO.WeightDevice {
  136. var weight, leafWeight uint16
  137. if wd.Weight != nil {
  138. weight = *wd.Weight
  139. }
  140. if wd.LeafWeight != nil {
  141. leafWeight = *wd.LeafWeight
  142. }
  143. weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
  144. c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
  145. }
  146. }
  147. if r.BlockIO.ThrottleReadBpsDevice != nil {
  148. for _, td := range r.BlockIO.ThrottleReadBpsDevice {
  149. rate := td.Rate
  150. throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
  151. c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
  152. }
  153. }
  154. if r.BlockIO.ThrottleWriteBpsDevice != nil {
  155. for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
  156. rate := td.Rate
  157. throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
  158. c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
  159. }
  160. }
  161. if r.BlockIO.ThrottleReadIOPSDevice != nil {
  162. for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
  163. rate := td.Rate
  164. throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
  165. c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
  166. }
  167. }
  168. if r.BlockIO.ThrottleWriteIOPSDevice != nil {
  169. for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
  170. rate := td.Rate
  171. throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
  172. c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
  173. }
  174. }
  175. }
  176. for _, l := range r.HugepageLimits {
  177. c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
  178. Pagesize: l.Pagesize,
  179. Limit: l.Limit,
  180. })
  181. }
  182. if r.Network != nil {
  183. if r.Network.ClassID != nil {
  184. c.Resources.NetClsClassid = *r.Network.ClassID
  185. }
  186. for _, m := range r.Network.Priorities {
  187. c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
  188. Interface: m.Name,
  189. Priority: int64(m.Priority),
  190. })
  191. }
  192. }
  193. }
  194. // append the default allowed devices to the end of the list
  195. c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
  196. return c, nil
  197. }
  // stringToCgroupDeviceRune maps a cgroup device type string to its rune.
  // Only "a", "b" and "c" are accepted; any other input yields rune 0 and an
  // error naming the rejected value.
  func stringToCgroupDeviceRune(s string) (rune, error) {
  	if s == "a" || s == "b" || s == "c" {
  		// Each accepted value is a single ASCII byte, so the first byte is
  		// the whole rune.
  		return rune(s[0]), nil
  	}
  	return 0, fmt.Errorf("invalid cgroup device type %q", s)
  }

2.1.1.4) libcontainer/specconv/spec_linux.go#setupUserNamespace

  1. func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
  2. create := func(m specs.LinuxIDMapping) configs.IDMap {
  3. return configs.IDMap{
  4. HostID: int(m.HostID),
  5. ContainerID: int(m.ContainerID),
  6. Size: int(m.Size),
  7. }
  8. }
  9. if spec.Linux != nil {
  10. for _, m := range spec.Linux.UIDMappings {
  11. config.UidMappings = append(config.UidMappings, create(m))
  12. }
  13. for _, m := range spec.Linux.GIDMappings {
  14. config.GidMappings = append(config.GidMappings, create(m))
  15. }
  16. }
  17. rootUID, err := config.HostRootUID()
  18. if err != nil {
  19. return err
  20. }
  21. rootGID, err := config.HostRootGID()
  22. if err != nil {
  23. return err
  24. }
  25. for _, node := range config.Devices {
  26. node.Uid = uint32(rootUID)
  27. node.Gid = uint32(rootGID)
  28. }
  29. return nil
  30. }

2.1.1.5) libcontainer/specconv/spec_linux.go#createHooks

  1. func createHooks(rspec *specs.Spec, config *configs.Config) {
  2. config.Hooks = &configs.Hooks{}
  3. if rspec.Hooks != nil {
  4. for _, h := range rspec.Hooks.Prestart {
  5. cmd := createCommandHook(h)
  6. config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
  7. }
  8. for _, h := range rspec.Hooks.Poststart {
  9. cmd := createCommandHook(h)
  10. config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
  11. }
  12. for _, h := range rspec.Hooks.Poststop {
  13. cmd := createCommandHook(h)
  14. config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
  15. }
  16. }
  17. }

2.1.2) utils_linux.go#loadFactory

会根据context中的信息来确定使用哪些:

  • cgroupManager
  • intelRdtManager
  • CriuPath
  • NewuidmapPath
  • NewgidmapPath

  var errEmptyID = errors.New("container id cannot be empty")

// loadFactory returns the configured factory instance for execing containers. func loadFactory(context *cli.Context) (libcontainer.Factory, error) { root := context.GlobalString(“root”) abs, err := filepath.Abs(root) if err != nil { return nil, err }

  1. // We default to cgroupfs, and can only use systemd if the system is a
  2. // systemd box.
  3. cgroupManager := libcontainer.Cgroupfs
  4. rootlessCg, err := shouldUseRootlessCgroupManager(context)
  5. if err != nil {
  6. return nil, err
  7. }
  8. if rootlessCg {
  9. cgroupManager = libcontainer.RootlessCgroupfs
  10. }
  11. if context.GlobalBool("systemd-cgroup") {
  12. if systemd.UseSystemd() {
  13. cgroupManager = libcontainer.SystemdCgroups
  14. } else {
  15. return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
  16. }
  17. }
  18. intelRdtManager := libcontainer.IntelRdtFs
  19. if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
  20. intelRdtManager = nil
  21. }
  22. // We resolve the paths for {newuidmap,newgidmap} from the context of runc,
  23. // to avoid doing a path lookup in the nsexec context. TODO: The binary
  24. // names are not currently configurable.
  25. newuidmap, err := exec.LookPath("newuidmap")
  26. if err != nil {
  27. newuidmap = ""
  28. }
  29. newgidmap, err := exec.LookPath("newgidmap")
  30. if err != nil {
  31. newgidmap = ""
  32. }

  // ** NOTICE ** //
  return libcontainer.New(abs, cgroupManager, intelRdtManager,
  libcontainer.CriuPath(context.GlobalString("criu")),
  libcontainer.NewuidmapPath(newuidmap),
  libcontainer.NewgidmapPath(newgidmap))
  // ** NOTICE ** //
  }

2.1.2.1) libcontainer/factory_linux.go#New

  4. // New returns a linux based container factory based in the root directory and
  5. // configures the factory with the provided option funcs.
  // The factory starts with InitPath "/proc/self/exe", InitArgs {os.Args[0], "init"},
  // CriuPath "criu" and the cgroupfs manager; the options are then applied in order
  // and the first option error aborts construction.
  6. func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
  7. if root != "" { // create the root directory (mode 0700) only when one was given
  8. if err := os.MkdirAll(root, 0700); err != nil {
  9. return nil, newGenericError(err, SystemError)
  10. }
  11. }
  12. l := &LinuxFactory{
  13. Root: root,
  14. InitPath: "/proc/self/exe",
  15. InitArgs: []string{os.Args[0], "init"},
  16. Validator: validate.New(),
  17. CriuPath: "criu",
  18. }
  19. Cgroupfs(l) // default cgroup manager, installed before the caller's options run
  20. for _, opt := range options {
  21. if opt == nil { // nil options are skipped rather than called
  22. continue
  23. }
  24. if err := opt(l); err != nil {
  25. return nil, err
  26. }
  27. }
  28. return l, nil
  29. }
  30. // Cgroupfs is an options func to configure a LinuxFactory to return containers
  31. // that use the native cgroups filesystem implementation to create and manage
  32. // cgroups.
  // It only assigns l.NewCgroupsManager and always returns nil.
  33. func Cgroupfs(l *LinuxFactory) error {
  34. l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { // constructor stored on the factory
  35. return &fs.Manager{
  36. Cgroups: config,
  37. Paths: paths,
  38. }
  39. }
  40. return nil
  41. }

2.1.3) libcontainer/factory_linux.go#Create

  // Create validates the factory root, the container id and the config, creates
  // the per-container directory under l.Root, and returns a linuxContainer whose
  // state is stoppedState.
  // It fails with ConfigInvalid for an empty root or an invalid config, with
  // IdInUse when the container directory already exists, and with SystemError
  // when stat, mkdir or chown on that directory fails.
  1. func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
  2. if l.Root == "" {
  3. return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
  4. }
  5. if err := l.validateID(id); err != nil {
  6. return nil, err
  7. }
  8. if err := l.Validator.Validate(config); err != nil {
  9. return nil, newGenericError(err, ConfigInvalid)
  10. }
  11. containerRoot, err := securejoin.SecureJoin(l.Root, id)
  12. if err != nil {
  13. return nil, err
  14. }
  15. if _, err := os.Stat(containerRoot); err == nil { // directory already present: the id is taken
  16. return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
  17. } else if !os.IsNotExist(err) { // any stat error other than "not exist" is fatal
  18. return nil, newGenericError(err, SystemError)
  19. }
  20. if err := os.MkdirAll(containerRoot, 0711); err != nil {
  21. return nil, newGenericError(err, SystemError)
  22. }
  23. if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
  24. return nil, newGenericError(err, SystemError)
  25. }
  26. c := &linuxContainer{
  27. id: id,
  28. root: containerRoot,
  29. config: config,
  30. initPath: l.InitPath,
  31. initArgs: l.InitArgs,
  32. criuPath: l.CriuPath,
  33. newuidmapPath: l.NewuidmapPath,
  34. newgidmapPath: l.NewgidmapPath,
  35. cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
  36. }
  37. if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { // Intel RDT manager only when CAT or MBA is enabled
  38. c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
  39. }
  40. c.state = &stoppedState{c: c}
  41. return c, nil
  42. }
  43. // stoppedState represents a container in a stopped/destroyed state.
  44. type stoppedState struct {
  45. c *linuxContainer // the container this state belongs to
  46. }

2.2) utils_linux.go#runner.run(在当前container中运行一个Process)

runner#action为create。

  // runner bundles the target container and the options that its run method
  // uses to launch a process in that container.
  1. type runner struct {
  2. init bool // forwarded to newProcess as the process's Init flag
  3. enableSubreaper bool // passed to newSignalHandler
  4. shouldDestroy bool
  5. detach bool // run also treats the create action as detached
  6. listenFDs []*os.File // appended to ExtraFiles and announced via LISTEN_FDS
  7. preserveFDs int // number of additional fds added to ExtraFiles
  8. pidFile string // when non-empty, run writes a pid file here
  9. consoleSocket string // socket path handed to setupIO
  10. container libcontainer.Container
  11. action CtAct // CT_ACT_CREATE, CT_ACT_RESTORE or CT_ACT_RUN
  12. notifySocket *notifySocket
  13. criuOpts *libcontainer.CriuOpts // used only by the restore action
  14. }

传入的Process是spec的Process。

1、首先根据spec里面process的配置信息调用newProcess创建process对象。
2、其次将listen fd加入process的环境变量和需要在新进程保持打开的文件列表中。
3、调用setupIO来处理io和tty相关配置,对于create来说,这里就是修改当前进程的io,chown用户/组权限。
4、创建一个signalHandler来处理tty和signal。
5、调用container.Start(process)来启动process进程–即进入container的阶段。

  // run launches the process described by config inside r.container according
  // to r.action and returns its exit status.
  // Any failure before or during start destroys the container and returns -1
  // with the error. When detached (r.detach, or the create action) it returns
  // 0, nil after signal forwarding without destroying the container; otherwise
  // it destroys the container and returns the forwarded status and error.
  1. func (r *runner) run(config *specs.Process) (int, error) {
  2. if err := r.checkTerminal(config); err != nil {
  3. r.destroy()
  4. return -1, err
  5. }
  6. // ********************************** NOTICE ********************************** //
  7. // Convert the specs.Process into a libcontainer.Process
  8. process, err := newProcess(*config, r.init)
  9. // ********************************** NOTICE ********************************** //
  10. if err != nil {
  11. r.destroy()
  12. return -1, err
  13. }
  14. if len(r.listenFDs) > 0 {
  15. process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
  16. process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
  17. }
  18. baseFd := 3 + len(process.ExtraFiles)
  19. for i := baseFd; i < baseFd+r.preserveFDs; i++ {
  20. process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
  21. }
  22. rootuid, err := r.container.Config().HostRootUID()
  23. if err != nil {
  24. r.destroy()
  25. return -1, err
  26. }
  27. rootgid, err := r.container.Config().HostRootGID()
  28. if err != nil {
  29. r.destroy()
  30. return -1, err
  31. }
  32. var (
  33. detach = r.detach || (r.action == CT_ACT_CREATE) // create is always treated as detached
  34. )
  35. // Setting up IO is a two stage process. We need to modify process to deal
  36. // with detaching containers, and then we get a tty after the container has
  37. // started.
  38. handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
  39. // ********************************** NOTICE ********************************** //
  40. tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
  41. // ********************************** NOTICE ********************************** //
  42. if err != nil {
  43. r.destroy()
  44. return -1, err
  45. }
  46. defer tty.Close()
  47. switch r.action {
  48. case CT_ACT_CREATE:
  49. // ********************************** NOTICE ********************************** //
  50. err = r.container.Start(process)
  51. // ********************************** NOTICE ********************************** //
  52. case CT_ACT_RESTORE:
  53. err = r.container.Restore(process, r.criuOpts)
  54. case CT_ACT_RUN:
  55. err = r.container.Run(process)
  56. default:
  57. panic("Unknown action")
  58. }
  59. if err != nil {
  60. r.destroy()
  61. return -1, err
  62. }
  63. if err := tty.waitConsole(); err != nil {
  64. r.terminate(process)
  65. r.destroy()
  66. return -1, err
  67. }
  68. if err = tty.ClosePostStart(); err != nil {
  69. r.terminate(process)
  70. r.destroy()
  71. return -1, err
  72. }
  73. if r.pidFile != "" {
  74. if err = createPidFile(r.pidFile, process); err != nil {
  75. r.terminate(process)
  76. r.destroy()
  77. return -1, err
  78. }
  79. }
  80. status, err := handler.forward(process, tty, detach)
  81. if err != nil {
  82. r.terminate(process)
  83. }
  84. if detach { // detached: the forward error is dropped and the container is left in place
  85. return 0, nil
  86. }
  87. r.destroy()
  88. return status, err
  89. }
  90. // newProcess returns a new libcontainer Process with the arguments from the
  91. // spec and stdio from the current process.
  // It copies console size, capabilities, additional gids and rlimits when the
  // spec provides them, and returns an error if any rlimit cannot be converted
  // by createLibContainerRlimit.
  92. func newProcess(p specs.Process, init bool) (*libcontainer.Process, error) {
  93. lp := &libcontainer.Process{
  94. Args: p.Args,
  95. Env: p.Env,
  96. // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
  97. User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
  98. Cwd: p.Cwd,
  99. Label: p.SelinuxLabel,
  100. NoNewPrivileges: &p.NoNewPrivileges, // points into the by-value copy p, not the caller's spec
  101. AppArmorProfile: p.ApparmorProfile,
  102. Init: init,
  103. }
  104. if p.ConsoleSize != nil {
  105. lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
  106. lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
  107. }
  108. if p.Capabilities != nil {
  109. lp.Capabilities = &configs.Capabilities{}
  110. lp.Capabilities.Bounding = p.Capabilities.Bounding
  111. lp.Capabilities.Effective = p.Capabilities.Effective
  112. lp.Capabilities.Inheritable = p.Capabilities.Inheritable
  113. lp.Capabilities.Permitted = p.Capabilities.Permitted
  114. lp.Capabilities.Ambient = p.Capabilities.Ambient
  115. }
  116. for _, gid := range p.User.AdditionalGids { // gids are stored as decimal strings
  117. lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
  118. }
  119. for _, rlimit := range p.Rlimits {
  120. rl, err := createLibContainerRlimit(rlimit)
  121. if err != nil {
  122. return nil, err
  123. }
  124. lp.Rlimits = append(lp.Rlimits, rl)
  125. }
  126. return lp, nil
  127. }
  128. // setupIO modifies the given process config according to the options.
  // With createTTY it clears the process stdio and sets process.ConsoleSocket:
  // to one end of a new socket pair when not detached (the other end is read by
  // recvtty in a goroutine), or to a connection dialed to sockpath when detached.
  // Without createTTY it either inherits runc's stdio (detached) or sets up
  // pipes via setupProcessPipes.
  129. func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
  130. if createTTY {
  131. process.Stdin = nil
  132. process.Stdout = nil
  133. process.Stderr = nil
  134. t := &tty{}
  135. if !detach {
  136. parent, child, err := utils.NewSockPair("console")
  137. if err != nil {
  138. return nil, err
  139. }
  140. process.ConsoleSocket = child
  141. t.postStart = append(t.postStart, parent, child)
  142. t.consoleC = make(chan error, 1)
  143. go func() {
  // NOTE(review): on error this sends twice into a channel of capacity 1, so
  // the second send blocks unless the receiver reads twice — confirm.
  144. if err := t.recvtty(process, parent); err != nil {
  145. t.consoleC <- err
  146. }
  147. t.consoleC <- nil
  148. }()
  149. } else {
  150. // the caller of runc will handle receiving the console master
  151. conn, err := net.Dial("unix", sockpath)
  152. if err != nil {
  153. return nil, err
  154. }
  155. uc, ok := conn.(*net.UnixConn)
  156. if !ok {
  157. return nil, fmt.Errorf("casting to UnixConn failed")
  158. }
  159. t.postStart = append(t.postStart, uc)
  160. socket, err := uc.File()
  161. if err != nil {
  162. return nil, err
  163. }
  164. t.postStart = append(t.postStart, socket)
  165. process.ConsoleSocket = socket
  166. }
  167. return t, nil
  168. }
  169. // when runc will detach the caller provides the stdio to runc via runc's 0,1,2
  170. // and the container's process inherits runc's stdio.
  171. if detach {
  172. if err := inheritStdio(process); err != nil {
  173. return nil, err
  174. }
  175. return &tty{}, nil
  176. }
  177. return setupProcessPipes(process, rootuid, rootgid)
  178. }

2.2.1) libcontainer/container_linux.go#linuxContainer.Start

  // Start launches process in the container while holding c.m.
  // For an init process it creates the exec FIFO first and deletes it again if
  // c.start fails; the error from c.start is returned unchanged.
  1. func (c *linuxContainer) Start(process *Process) error {
  2. c.m.Lock()
  3. defer c.m.Unlock()
  4. if process.Init {
  5. if err := c.createExecFifo(); err != nil {
  6. return err
  7. }
  8. }
  9. // ********************************** NOTICE ********************************** //
  10. if err := c.start(process); err != nil {
  11. // ********************************** NOTICE ********************************** //
  12. if process.Init {
  13. c.deleteExecFifo()
  14. }
  15. return err
  16. }
  17. return nil
  18. }

2.2.1.1) libcontainer/container_linux.go#linuxContainer.start

image.png

  // start builds the parent process for process and starts it. On a start
  // failure the parent is terminated and a wrapped system error is returned.
  // For an init process it then switches the container to createdState, records
  // the init start time from updateState, and runs the Poststart hooks; a
  // failing hook terminates the parent and aborts with an error.
  1. func (c *linuxContainer) start(process *Process) error {
  2. // ********************************** NOTICE ********************************** //
  3. // Build the parent process (it is started below via parent.start)
  4. parent, err := c.newParentProcess(process)
  5. // ********************************** NOTICE ********************************** //
  6. if err != nil {
  7. return newSystemErrorWithCause(err, "creating new parent process")
  8. }
  9. // ********************************** NOTICE ********************************** //
  10. if err := parent.start(); err != nil {
  11. // ********************************** NOTICE ********************************** //
  12. // terminate the process to ensure that it properly is reaped.
  13. if err := ignoreTerminateErrors(parent.terminate()); err != nil {
  14. logrus.Warn(err)
  15. }
  16. return newSystemErrorWithCause(err, "starting container process")
  17. }
  18. // generate a timestamp indicating when the container was started
  19. c.created = time.Now().UTC()
  20. if process.Init {
  21. c.state = &createdState{
  22. c: c,
  23. }
  24. // ********************************** NOTICE ********************************** //
  25. // Update the container state
  26. state, err := c.updateState(parent)
  27. // ********************************** NOTICE ********************************** //
  28. if err != nil {
  29. return err
  30. }
  31. c.initProcessStartTime = state.InitProcessStartTime
  32. if c.config.Hooks != nil {
  33. s, err := c.currentOCIState()
  34. if err != nil {
  35. return err
  36. }
  37. for i, hook := range c.config.Hooks.Poststart {
  38. if err := hook.Run(s); err != nil {
  39. if err := ignoreTerminateErrors(parent.terminate()); err != nil {
  40. logrus.Warn(err)
  41. }
  42. return newSystemErrorWithCausef(err, "running poststart hook %d", i)
  43. }
  44. }
  45. }
  46. }
  47. return nil
  48. }

2.2.1.1.1) libcontainer/container_linux.go#linuxContainer.newParentProcess

创建一个initProcess,里面既有init进程的信息,也有spec里面指定的process的信息。

1、创建一对pipe——parentPipe和childPipe,打开rootDir。
2、创建一个command,命令为runc init自身(通过/proc/self/exe软链接实现);标准io为当前进程的;工作目录为Rootfs;用ExtraFiles在新进程中保持打开childPipe和rootDir,并添加对应的环境变量。
3、调用newInitProcess进一步将parent process和command封装为initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid映射等配置信息用bootstrapData封装为一个io.Reader等。

parentProcess是一个接口类型。
image.png

  // newParentProcess creates the "init" socket pair and the command template
  // for p, then wraps them in a parentProcess: a setns process when p is not an
  // init process, otherwise an init process whose command also carries the exec
  // FIFO.
  1. func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
  2. parentPipe, childPipe, err := utils.NewSockPair("init")
  3. if err != nil {
  4. return nil, newSystemErrorWithCause(err, "creating new init pipe")
  5. }
  6. // ********************************** NOTICE ********************************** //
  7. cmd, err := c.commandTemplate(p, childPipe)
  8. // ********************************** NOTICE ********************************** //
  9. if err != nil {
  10. return nil, newSystemErrorWithCause(err, "creating new command template")
  11. }
  12. if !p.Init {
  13. return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
  14. }
  15. // We only set up fifoFd if we're not doing a `runc exec`. The historic
  16. // reason for this is that previously we would pass a dirfd that allowed
  17. // for container rootfs escape (and not doing it in `runc exec` avoided
  18. // that problem), but we no longer do that. However, there's no need to do
  19. // this for `runc exec` so we just keep it this way to be safe.
  20. if err := c.includeExecFifo(cmd); err != nil {
  21. return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
  22. }
  23. // ********************************** NOTICE ********************************** //
  24. return c.newInitProcess(p, cmd, parentPipe, childPipe)
  25. // ********************************** NOTICE ********************************** //
  26. }

2.2.1.1.1.1) libcontainer/container_linux.go#linuxContainer.commandTemplate

这里的initPath、initArgs是:
InitPath: "/proc/self/exe",
InitArgs: []string{os.Args[0], "init"},

  // commandTemplate builds the exec.Cmd that re-executes c.initPath with
  // c.initArgs. The command uses p's stdio, runs in the container rootfs
  // directory, and carries p.ExtraFiles, the optional console socket and
  // childPipe as extra files. The fd numbers of the console socket and the init
  // pipe are exported through _LIBCONTAINER_CONSOLE and _LIBCONTAINER_INITPIPE.
  // The returned error is always nil in this version.
  1. func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
  2. cmd := exec.Command(c.initPath, c.initArgs[1:]...)
  3. cmd.Args[0] = c.initArgs[0] // argv[0] comes from initArgs, not from initPath
  4. cmd.Stdin = p.Stdin
  5. cmd.Stdout = p.Stdout
  6. cmd.Stderr = p.Stderr
  7. cmd.Dir = c.config.Rootfs
  8. if cmd.SysProcAttr == nil {
  9. cmd.SysProcAttr = &syscall.SysProcAttr{}
  10. }
  11. cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
  12. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
  13. if p.ConsoleSocket != nil {
  14. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
  15. cmd.Env = append(cmd.Env,
  16. fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), // fd number of the file just appended
  17. )
  18. }
  19. cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
  20. cmd.Env = append(cmd.Env,
  21. fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), // fd number of childPipe
  22. )
  23. // NOTE: when running a container with no PID namespace and the parent process spawning the container is
  24. // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
  25. // even with the parent still running.
  26. if c.config.ParentDeathSignal > 0 {
  27. cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
  28. }
  29. return cmd, nil
  30. }

2.2.1.1.1.2) libcontainer/container_linux.go#linuxContainer.newInitProcess

调用newInitProcess进一步将parent process和command封装为initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid映射等配置信息用bootstrapData封装为一个io.Reader等。

  // newInitProcess tags cmd with _LIBCONTAINER_INITTYPE, collects the namespaces
  // that have an explicit path, encodes the bootstrap data, and assembles the
  // initProcess. The result is also stored in c.initProcess.
  // It returns an error only when bootstrapData fails.
  1. func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
  2. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
  3. nsMaps := make(map[configs.NamespaceType]string)
  4. for _, ns := range c.config.Namespaces {
  5. if ns.Path != "" { // only namespaces with a path are recorded
  6. nsMaps[ns.Type] = ns.Path
  7. }
  8. }
  9. _, sharePidns := nsMaps[configs.NEWPID] // true when the PID namespace has a path
  10. // ********************************** NOTICE ********************************** //
  11. data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
  12. // ********************************** NOTICE ********************************** //
  13. if err != nil {
  14. return nil, err
  15. }
  16. init := &initProcess{
  17. cmd: cmd,
  18. childPipe: childPipe,
  19. parentPipe: parentPipe,
  20. manager: c.cgroupManager,
  21. intelRdtManager: c.intelRdtManager,
  22. // ********************************** NOTICE ********************************** //
  23. config: c.newInitConfig(p),
  24. // ********************************** NOTICE ********************************** //
  25. container: c,
  26. process: p,
  27. bootstrapData: data,
  28. sharePidns: sharePidns,
  29. }
  30. c.initProcess = init
  31. return init, nil
  32. }

2.2.1.1.1.2.1) libcontainer/container_linux.go#linuxContainer.bootstrapData(发给init进程的数据)
  1. // bootstrapData encodes the necessary data in netlink binary format
  2. // as a io.Reader.
  3. // Consumer can write the data to a bootstrap program
  4. // such as one that uses nsenter package to bootstrap the container's
  5. // init process correctly, i.e. with correct namespaces, uid/gid
  6. // mapping etc.
  // The message always carries the clone flags and the rootless flag; namespace
  // paths, uid/gid mappings (with the mapping-tool paths and the setgroup flag)
  // and oom_score_adj are added only when configured. It returns an error when
  // ordering the namespace paths or encoding an ID mapping fails.
  7. func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
  8. // create the netlink message
  9. r := nl.NewNetlinkRequest(int(InitMsg), 0)
  10. // write cloneFlags
  11. r.AddData(&Int32msg{
  12. Type: CloneFlagsAttr,
  13. Value: uint32(cloneFlags),
  14. })
  15. // write custom namespace paths
  16. if len(nsMaps) > 0 {
  17. nsPaths, err := c.orderNamespacePaths(nsMaps)
  18. if err != nil {
  19. return nil, err
  20. }
  21. r.AddData(&Bytemsg{
  22. Type: NsPathsAttr,
  23. Value: []byte(strings.Join(nsPaths, ",")),
  24. })
  25. }
  26. // write uid/gid mappings only when we are not joining an existing user ns
  27. _, joinExistingUser := nsMaps[configs.NEWUSER]
  28. if !joinExistingUser {
  29. // write uid mappings
  30. if len(c.config.UidMappings) > 0 {
  31. if c.config.RootlessEUID && c.newuidmapPath != "" {
  32. r.AddData(&Bytemsg{
  33. Type: UidmapPathAttr,
  34. Value: []byte(c.newuidmapPath),
  35. })
  36. }
  37. b, err := encodeIDMapping(c.config.UidMappings)
  38. if err != nil {
  39. return nil, err
  40. }
  41. r.AddData(&Bytemsg{
  42. Type: UidmapAttr,
  43. Value: b,
  44. })
  45. }
  46. // write gid mappings
  47. if len(c.config.GidMappings) > 0 {
  48. b, err := encodeIDMapping(c.config.GidMappings)
  49. if err != nil {
  50. return nil, err
  51. }
  52. r.AddData(&Bytemsg{
  53. Type: GidmapAttr,
  54. Value: b,
  55. })
  56. if c.config.RootlessEUID && c.newgidmapPath != "" {
  57. r.AddData(&Bytemsg{
  58. Type: GidmapPathAttr,
  59. Value: []byte(c.newgidmapPath),
  60. })
  61. }
  62. if requiresRootOrMappingTool(c.config) {
  63. r.AddData(&Boolmsg{
  64. Type: SetgroupAttr,
  65. Value: true,
  66. })
  67. }
  68. }
  69. }
  70. if c.config.OomScoreAdj != nil {
  71. // write oom_score_adj
  72. r.AddData(&Bytemsg{
  73. Type: OomScoreAdjAttr,
  74. Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
  75. })
  76. }
  77. // write rootless
  78. r.AddData(&Boolmsg{
  79. Type: RootlessEUIDAttr,
  80. Value: c.config.RootlessEUID,
  81. })
  82. return bytes.NewReader(r.Serialize()), nil
  83. }

2.2.1.1.1.2.2) libcontainer/container_linux.go#linuxContainer.newInitConfig(Process转initConfig)
  // newInitConfig builds the initConfig for process. Container-level values
  // for NoNewPrivileges, AppArmorProfile, ProcessLabel and Rlimits are used as
  // defaults and replaced by the process-level values when those are set
  // (non-nil, non-empty string, or non-empty slice). CreateConsole is true when
  // the process has a console socket.
  1. func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
  2. cfg := &initConfig{
  3. Config: c.config,
  4. Args: process.Args,
  5. Env: process.Env,
  6. User: process.User,
  7. AdditionalGroups: process.AdditionalGroups,
  8. Cwd: process.Cwd,
  9. Capabilities: process.Capabilities,
  10. PassedFilesCount: len(process.ExtraFiles),
  11. ContainerId: c.ID(),
  12. NoNewPrivileges: c.config.NoNewPrivileges,
  13. RootlessEUID: c.config.RootlessEUID,
  14. RootlessCgroups: c.config.RootlessCgroups,
  15. AppArmorProfile: c.config.AppArmorProfile,
  16. ProcessLabel: c.config.ProcessLabel,
  17. Rlimits: c.config.Rlimits,
  18. }
  19. if process.NoNewPrivileges != nil { // process-level overrides start here
  20. cfg.NoNewPrivileges = *process.NoNewPrivileges
  21. }
  22. if process.AppArmorProfile != "" {
  23. cfg.AppArmorProfile = process.AppArmorProfile
  24. }
  25. if process.Label != "" {
  26. cfg.ProcessLabel = process.Label
  27. }
  28. if len(process.Rlimits) > 0 {
  29. cfg.Rlimits = process.Rlimits
  30. }
  31. cfg.CreateConsole = process.ConsoleSocket != nil
  32. cfg.ConsoleWidth = process.ConsoleWidth
  33. cfg.ConsoleHeight = process.ConsoleHeight
  34. return cfg
  35. }

2.2.1.1.2) 【parent】libcontainer/process_linux.go#initProcess.start

1、异步启动cmd.Start()(等同于调用runc init)来启动init进程。
2、将spec中process指定的ops指定为initProcess。
3、将前面创建bootstrapData从parentPipe传出去(init进程会从childPipe接收到这些数据,reverse出写入的内容,进行namespace相关的配置)
4、调用execSetns()(下面列出的代码中对应的是getChildPid()和waitForChildExit()),这个方法名看似是进行namespace的配置,实际上则是等待上面init进程的执行,并在parentPipe等待并解析出从childPipe传回的pid(即nsexec中clone出的子进程的pid),找到该pid对应的进程,并将cmd.Process对应的进程替换为该进程。
5、为checkpoint做准备,保存cmd.Process进程的标准IO文件描述符。
6、应用cgroup配置
7、创建容器中的network interface。
8、将容器的配置文件内容spec从parentPipe发送给init进程。

9、下面与init进程进行同步,一个for循环状态机,通过解析parentPipe传回的sync Type来执行相应的操作。按正常的时间顺序,如下:
procReady,继续配置cgroup(Set与Apply的区别?)、oom、rlimits;如果配置中没有mount namespace(Why?),则执行prestart钩子;往parentPipe写入procRun状态。
procHooks,执行prestart钩子,往parentPipe写入procResume状态。(这个应该不是标准create的流程,resume?)
procError,just error and exit
10、进行一些是否成功run和resume的判断,进行错误处理。
11、关闭parentPipe,返回nil or err。

  // start launches the init command and drives it through setup. In order it:
  // applies cgroup / Intel RDT settings to the command's pid, writes the
  // bootstrap data to parentPipe, reads back the child pid and applies the same
  // settings to it, waits for the first child to exit, creates the network
  // interfaces and sends the config. It then answers the child's sync messages
  // (procReady -> procRun, procHooks -> procResume), running the Prestart hooks
  // along the way. parentPipe is closed on return.
  //
  // NOTE(review): the deferred cleanups test the function-level err, but many
  // returns below use an err shadowed by `if err := ...`; those paths appear to
  // skip the Destroy calls — confirm.
  1. func (p *initProcess) start() error {
  2. defer p.parentPipe.Close()
  3. err := p.cmd.Start()
  4. p.process.ops = p
  5. p.childPipe.Close()
  6. if err != nil {
  7. p.process.ops = nil
  8. return newSystemErrorWithCause(err, "starting init process command")
  9. }
  10. // Do this before syncing with child so that no children can escape the
  11. // cgroup. We don't need to worry about not doing this and not being root
  12. // because we'd be using the rootless cgroup manager in that case.
  13. if err := p.manager.Apply(p.pid()); err != nil {
  14. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  15. }
  16. if p.intelRdtManager != nil {
  17. if err := p.intelRdtManager.Apply(p.pid()); err != nil {
  18. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  19. }
  20. }
  21. defer func() {
  22. if err != nil {
  23. // TODO: should not be the responsibility to call here
  24. p.manager.Destroy()
  25. if p.intelRdtManager != nil {
  26. p.intelRdtManager.Destroy()
  27. }
  28. }
  29. }()
  30. if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
  31. return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
  32. }
  33. childPid, err := p.getChildPid()
  34. if err != nil {
  35. return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
  36. }
  37. // Save the standard descriptor names before the container process
  38. // can potentially move them (e.g., via dup2()). If we don't do this now,
  39. // we won't know at checkpoint time which file descriptor to look up.
  40. fds, err := getPipeFds(childPid)
  41. if err != nil {
  42. return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
  43. }
  44. p.setExternalDescriptors(fds)
  45. // Do this before syncing with child so that no children
  46. // can escape the cgroup
  47. if err := p.manager.Apply(childPid); err != nil {
  48. return newSystemErrorWithCause(err, "applying cgroup configuration for process")
  49. }
  50. if p.intelRdtManager != nil {
  51. if err := p.intelRdtManager.Apply(childPid); err != nil {
  52. return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
  53. }
  54. }
  55. // Now it's time to setup cgroup namespace
  56. if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
  57. if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
  58. return newSystemErrorWithCause(err, "sending synchronization value to init process")
  59. }
  60. }
  61. // Wait for our first child to exit
  62. if err := p.waitForChildExit(childPid); err != nil {
  63. return newSystemErrorWithCause(err, "waiting for our first child to exit")
  64. }
  65. defer func() {
  66. if err != nil {
  67. // TODO: should not be the responsibility to call here
  68. p.manager.Destroy()
  69. }
  70. }()
  71. if err := p.createNetworkInterfaces(); err != nil {
  72. return newSystemErrorWithCause(err, "creating network interfaces")
  73. }
  74. if err := p.sendConfig(); err != nil {
  75. return newSystemErrorWithCause(err, "sending config to init process")
  76. }
  77. var (
  78. sentRun bool
  79. sentResume bool
  80. )
  81. ierr := parseSync(p.parentPipe, func(sync *syncT) error {
  82. switch sync.Type {
  83. case procReady:
  84. // set rlimits, this has to be done here because we lose permissions
  85. // to raise the limits once we enter a user-namespace
  86. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  87. return newSystemErrorWithCause(err, "setting rlimits for ready process")
  88. }
  89. // call prestart hooks
  90. if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
  91. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  92. if err := p.manager.Set(p.config.Config); err != nil {
  93. return newSystemErrorWithCause(err, "setting cgroup config for ready process")
  94. }
  95. if p.intelRdtManager != nil {
  96. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  97. return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
  98. }
  99. }
  100. if p.config.Config.Hooks != nil {
  101. s, err := p.container.currentOCIState()
  102. if err != nil {
  103. return err
  104. }
  105. // initProcessStartTime hasn't been set yet.
  106. s.Pid = p.cmd.Process.Pid
  107. s.Status = "creating"
  108. for i, hook := range p.config.Config.Hooks.Prestart {
  109. if err := hook.Run(s); err != nil {
  110. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  111. }
  112. }
  113. }
  114. }
  115. // Sync with child.
  116. if err := writeSync(p.parentPipe, procRun); err != nil {
  117. return newSystemErrorWithCause(err, "writing syncT 'run'")
  118. }
  119. sentRun = true
  120. case procHooks:
  121. // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
  122. if err := p.manager.Set(p.config.Config); err != nil {
  123. return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
  124. }
  125. if p.intelRdtManager != nil {
  126. if err := p.intelRdtManager.Set(p.config.Config); err != nil {
  127. return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
  128. }
  129. }
  130. if p.config.Config.Hooks != nil {
  131. s, err := p.container.currentOCIState()
  132. if err != nil {
  133. return err
  134. }
  135. // initProcessStartTime hasn't been set yet.
  136. s.Pid = p.cmd.Process.Pid
  137. s.Status = "creating"
  138. for i, hook := range p.config.Config.Hooks.Prestart {
  139. if err := hook.Run(s); err != nil {
  140. return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  141. }
  142. }
  143. }
  144. // Sync with child.
  145. if err := writeSync(p.parentPipe, procResume); err != nil {
  146. return newSystemErrorWithCause(err, "writing syncT 'resume'")
  147. }
  148. sentResume = true
  149. default:
  150. return newSystemError(fmt.Errorf("invalid JSON payload from child"))
  151. }
  152. return nil
  153. })
  154. if !sentRun { // the child never reached procReady, so report the sync error as an init failure
  155. return newSystemErrorWithCause(ierr, "container init")
  156. }
  157. if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
  158. return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
  159. }
  160. if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
  161. return newSystemErrorWithCause(err, "shutting down init pipe")
  162. }
  163. // Must be done after Shutdown so the child will exit and we can wait for it.
  164. if ierr != nil {
  165. p.wait()
  166. return ierr
  167. }
  168. return nil
  169. }

2.2.1.1.2.1) libcontainer/process_linux.go#initProcess.createNetworkInterfaces

  // createNetworkInterfaces walks the networks of the container config, looks
  // up the strategy for each network type, creates the interface for the init
  // process's pid, and appends the resulting network to p.config.Networks.
  // It stops at the first strategy lookup or creation error.
  1. func (p *initProcess) createNetworkInterfaces() error {
  2. for _, config := range p.config.Config.Networks {
  3. strategy, err := getStrategy(config.Type)
  4. if err != nil {
  5. return err
  6. }
  7. n := &network{
  8. Network: *config, // copied by value from the container config entry
  9. }
  10. if err := strategy.create(n, p.pid()); err != nil {
  11. return err
  12. }
  13. p.config.Networks = append(p.config.Networks, n)
  14. }
  15. return nil
  16. }

2.2.1.1.2.2) libcontainer/process_linux.go#initProcess.sendConfig

  // sendConfig writes p.config as JSON to parentPipe and returns the error
  // from utils.WriteJSON.
  1. func (p *initProcess) sendConfig() error {
  2. // send the config to the container's init process, we don't use JSON Encode
  3. // here because there might be a problem in JSON decoder in some cases, see:
  4. // https://github.com/docker/docker/issues/14203#issuecomment-174177790
  5. return utils.WriteJSON(p.parentPipe, p.config)
  6. }

2.2.1.1.3) 【child】init.go#Action(容器init进程,与上一步不是顺序关系,而是异步通信)

2.2.1.1.3.1) libcontainer/nsenter/nsenter.go#nsexec

在init.go文件中有一行:
_ "github.com/opencontainers/runc/libcontainer/nsenter"

  1. // +build linux,!gccgo
  2. package nsenter
  3. /*
  4. #cgo CFLAGS: -Wall
  5. extern void nsexec();
  6. void __attribute__((constructor)) init(void) {
  7. nsexec();
  8. }
  9. */
  10. import "C"

nsenter
The nsenter package registers a special init constructor that is called before
the Go runtime has a chance to boot. This provides us the ability to setns on
existing namespaces and avoid the issues that the Go runtime has with multiple
threads. This constructor will be called if this package is registered,
imported, in your go application.
The nsenter package imports "C" and therefore uses
cgo. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
called the preamble, is used as a header when compiling the C parts of the package.
So every time we import package nsenter, the C code function nsexec() would be
called. And package nsenter is only imported in init.go, so every time the runc
init command is invoked, that C code is run.
Because nsexec() must be run before the Go runtime in order to use the
Linux kernel namespace, you must import this library into a package if
you plan to use libcontainer directly. Otherwise Go will not execute
the nsexec() constructor, which means that the re-exec will not cause
the namespaces to be joined. You can import it like this:

  1. import _ "github.com/opencontainers/runc/libcontainer/nsenter"

nsexec() will first get the file descriptor number for the init pipe
from the environment variable _LIBCONTAINER_INITPIPE (which was opened
by the parent and kept open across the fork-exec of the nsexec() init
process). The init pipe is used to read bootstrap data (namespace paths,
clone flags, uid and gid mappings, and the console path) from the parent
process. nsexec() will then call setns(2) to join the namespaces
provided in the bootstrap data (if available), clone(2) a child process
with the provided clone flags, update the user and group ID mappings, do
some further miscellaneous setup steps, and then send the PID of the
child process to the parent of the nsexec() “caller”. Finally,
the parent nsexec() will exit and the child nsexec() process will
return to allow the Go runtime to take over.
NOTE: We do both setns(2) and clone(2) even if we don’t have any
CLONE_NEW* clone flags because we must fork a new process in order to
enter the PID namespace.

2.2.1.1.3.1.1) libcontainer/nsenter/nsexec.c

1、拿到childPipe(根据环境变量)
2、读取bootstrapData,C中的数据结构如下:

  1. struct nlconfig_t {
  2. char *data;
  3. /* Process settings. */
  4. uint32_t cloneflags;
  5. char *oom_score_adj;
  6. size_t oom_score_adj_len;
  7. /* User namespace settings. */
  8. char *uidmap;
  9. size_t uidmap_len;
  10. char *gidmap;
  11. size_t gidmap_len;
  12. char *namespaces;
  13. size_t namespaces_len;
  14. uint8_t is_setgroup;
  15. /* Rootless container settings. */
  16. uint8_t is_rootless_euid; /* boolean */
  17. char *uidmappath;
  18. size_t uidmappath_len;
  19. char *gidmappath;
  20. size_t gidmappath_len;
  21. };

3、阅读runc中实现命名空间隔离的文件nsexec.c可以发现:由于SELinux、内核版本等问题,它并没有简单地只用clone()来实现,而是配合其他namespace API一起实现。
Namespace API提供了三种系统调用接口:
● clone():创建新的进程,并可通过CLONE_NEW*标志让新进程运行在新建的namespace中
● setns():让调用进程加入一个已经存在的namespace
● unshare():让调用进程脱离当前的namespace,并进入新创建的namespace
nsexec.c分别使用这三种接口,对于一般新建命名空间,使用unshare()实现;对于已有的命名空间,使用setns()实现。

1257598468-5c2a0fc3a1d5a_articlex.png

  1. void nsexec(void)
  2. {
  3. int pipenum;
  4. jmp_buf env;
  5. int sync_child_pipe[2], sync_grandchild_pipe[2];
  6. struct nlconfig_t config = { 0 };
  7. /*
  8. * If we don't have an init pipe, just return to the go routine.
  9. * We'll only get an init pipe for start or exec.
  10. */
  11. pipenum = initpipe();
  12. if (pipenum == -1)
  13. return;
  14. /* Parse all of the netlink configuration. */
  15. nl_parse(pipenum, &config);
  16. /* Set oom_score_adj. This has to be done before !dumpable because
  17. * /proc/self/oom_score_adj is not writeable unless you're an privileged
  18. * user (if !dumpable is set). All children inherit their parent's
  19. * oom_score_adj value on fork(2) so this will always be propagated
  20. * properly.
  21. */
  22. update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
  23. /*
  24. * Make the process non-dumpable, to avoid various race conditions that
  25. * could cause processes in namespaces we're joining to access host
  26. * resources (or potentially execute code).
  27. *
  28. * However, if the number of namespaces we are joining is 0, we are not
  29. * going to be switching to a different security context. Thus setting
  30. * ourselves to be non-dumpable only breaks things (like rootless
  31. * containers), which is the recommendation from the kernel folks.
  32. */
  33. if (config.namespaces) {
  34. if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  35. bail("failed to set process as non-dumpable");
  36. }
  37. /* Pipe so we can tell the child when we've finished setting up. */
  38. if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
  39. bail("failed to setup sync pipe between parent and child");
  40. /*
  41. * We need a new socketpair to sync with grandchild so we don't have
  42. * race condition with child.
  43. */
  44. if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
  45. bail("failed to setup sync pipe between parent and grandchild");
  46. /* TODO: Currently we aren't dealing with child deaths properly. */
  47. /*
  48. * Okay, so this is quite annoying.
  49. *
  50. * In order for this unsharing code to be more extensible we need to split
  51. * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
  52. * would be if we did clone(CLONE_NEWUSER) and the other namespaces
  53. * separately, but because of SELinux issues we cannot really do that. But
  54. * we cannot just dump the namespace flags into clone(...) because several
  55. * usecases (such as rootless containers) require more granularity around
  56. * the namespace setup. In addition, some older kernels had issues where
  57. * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
  58. * handle this while also dealing with SELinux so we choose SELinux support
  59. * over broken kernel support).
  60. *
  61. * However, if we unshare(2) the user namespace *before* we clone(2), then
  62. * all hell breaks loose.
  63. *
  64. * The parent no longer has permissions to do many things (unshare(2) drops
  65. * all capabilities in your old namespace), and the container cannot be set
  66. * up to have more than one {uid,gid} mapping. This is obviously less than
  67. * ideal. In order to fix this, we have to first clone(2) and then unshare.
  68. *
  69. * Unfortunately, it's not as simple as that. We have to fork to enter the
  70. * PID namespace (the PID namespace only applies to children). Since we'll
  71. * have to double-fork, this clone_parent() call won't be able to get the
  72. * PID of the _actual_ init process (without doing more synchronisation than
  73. * I can deal with at the moment). So we'll just get the parent to send it
  74. * for us, the only job of this process is to update
  75. * /proc/pid/{setgroups,uid_map,gid_map}.
  76. *
  77. * And as a result of the above, we also need to setns(2) in the first child
  78. * because if we join a PID namespace in the topmost parent then our child
  79. * will be in that namespace (and it will not be able to give us a PID value
  80. * that makes sense without resorting to sending things with cmsg).
  81. *
  82. * This also deals with an older issue caused by dumping cloneflags into
  83. * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
  84. * we have to unshare(2) before clone(2) in order to do this. This was fixed
  85. * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
  86. * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
  87. * aware, the last mainline kernel which had this bug was Linux 3.12.
  88. * However, we cannot comment on which kernels the broken patch was
  89. * backported to.
  90. *
  91. * -- Aleksa "what has my life come to?" Sarai
  92. */
  93. switch (setjmp(env)) {
  94. /*
  95. * Stage 0: We're in the parent. Our job is just to create a new child
  96. * (stage 1: JUMP_CHILD) process and write its uid_map and
  97. * gid_map. That process will go on to create a new process, then
  98. * it will send us its PID which we will send to the bootstrap
  99. * process.
  100. */
  101. case JUMP_PARENT:{
  102. int len;
  103. pid_t child, first_child = -1;
  104. bool ready = false;
  105. /* For debugging. */
  106. prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
  107. /* Start the process of getting a container. */
  108. child = clone_parent(&env, JUMP_CHILD);
  109. if (child < 0)
  110. bail("unable to fork: child_func");
  111. /*
  112. * State machine for synchronisation with the children.
  113. *
  114. * Father only return when both child and grandchild are
  115. * ready, so we can receive all possible error codes
  116. * generated by children.
  117. */
  118. while (!ready) {
  119. enum sync_t s;
  120. int ret;
  121. syncfd = sync_child_pipe[1];
  122. close(sync_child_pipe[0]);
  123. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  124. bail("failed to sync with child: next state");
  125. switch (s) {
  126. case SYNC_ERR:
  127. /* We have to mirror the error code of the child. */
  128. if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  129. bail("failed to sync with child: read(error code)");
  130. exit(ret);
  131. case SYNC_USERMAP_PLS:
  132. /*
  133. * Enable setgroups(2) if we've been asked to. But we also
  134. * have to explicitly disable setgroups(2) if we're
  135. * creating a rootless container for single-entry mapping.
  136. * i.e. config.is_setgroup == false.
  137. * (this is required since Linux 3.19).
  138. *
  139. * For rootless multi-entry mapping, config.is_setgroup shall be true and
  140. * newuidmap/newgidmap shall be used.
  141. */
  142. if (config.is_rootless_euid && !config.is_setgroup)
  143. update_setgroups(child, SETGROUPS_DENY);
  144. /* Set up mappings. */
  145. update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
  146. update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
  147. s = SYNC_USERMAP_ACK;
  148. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  149. kill(child, SIGKILL);
  150. bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
  151. }
  152. break;
  153. case SYNC_RECVPID_PLS:{
  154. first_child = child;
  155. /* Get the init_func pid. */
  156. if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
  157. kill(first_child, SIGKILL);
  158. bail("failed to sync with child: read(childpid)");
  159. }
  160. /* Send ACK. */
  161. s = SYNC_RECVPID_ACK;
  162. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  163. kill(first_child, SIGKILL);
  164. kill(child, SIGKILL);
  165. bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
  166. }
  167. /* Send the init_func pid back to our parent.
  168. *
  169. * Send the init_func pid and the pid of the first child back to our parent.
  170. * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
  171. * It becomes the responsibility of our parent to reap the first child.
  172. */
  173. len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
  174. if (len < 0) {
  175. kill(child, SIGKILL);
  176. bail("unable to generate JSON for child pid");
  177. }
  178. }
  179. break;
  180. case SYNC_CHILD_READY:
  181. ready = true;
  182. break;
  183. default:
  184. bail("unexpected sync value: %u", s);
  185. }
  186. }
  187. /* Now sync with grandchild. */
  188. ready = false;
  189. while (!ready) {
  190. enum sync_t s;
  191. int ret;
  192. syncfd = sync_grandchild_pipe[1];
  193. close(sync_grandchild_pipe[0]);
  194. s = SYNC_GRANDCHILD;
  195. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  196. kill(child, SIGKILL);
  197. bail("failed to sync with child: write(SYNC_GRANDCHILD)");
  198. }
  199. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  200. bail("failed to sync with child: next state");
  201. switch (s) {
  202. case SYNC_ERR:
  203. /* We have to mirror the error code of the child. */
  204. if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  205. bail("failed to sync with child: read(error code)");
  206. exit(ret);
  207. case SYNC_CHILD_READY:
  208. ready = true;
  209. break;
  210. default:
  211. bail("unexpected sync value: %u", s);
  212. }
  213. }
  214. exit(0);
  215. }
  216. /*
  217. * Stage 1: We're in the first child process. Our job is to join any
  218. * provided namespaces in the netlink payload and unshare all
  219. * of the requested namespaces. If we've been asked to
  220. * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
  221. * our user mappings for us. Then, we create a new child
  222. * (stage 2: JUMP_INIT) for PID namespace. We then send the
  223. * child's PID to our parent (stage 0).
  224. */
  225. case JUMP_CHILD:{
  226. pid_t child;
  227. enum sync_t s;
  228. /* We're in a child and thus need to tell the parent if we die. */
  229. syncfd = sync_child_pipe[0];
  230. close(sync_child_pipe[1]);
  231. /* For debugging. */
  232. prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
  233. /*
  234. * We need to setns first. We cannot do this earlier (in stage 0)
  235. * because of the fact that we forked to get here (the PID of
  236. * [stage 2: JUMP_INIT]) would be meaningless). We could send it
  237. * using cmsg(3) but that's just annoying.
  238. */
  239. if (config.namespaces)
  240. join_namespaces(config.namespaces);
  241. /*
  242. * Deal with user namespaces first. They are quite special, as they
  243. * affect our ability to unshare other namespaces and are used as
  244. * context for privilege checks.
  245. *
  246. * We don't unshare all namespaces in one go. The reason for this
  247. * is that, while the kernel documentation may claim otherwise,
  248. * there are certain cases where unsharing all namespaces at once
  249. * will result in namespace objects being owned incorrectly.
  250. * Ideally we should just fix these kernel bugs, but it's better to
  251. * be safe than sorry, and fix them separately.
  252. *
  253. * A specific case of this is that the SELinux label of the
  254. * internal kern-mount that mqueue uses will be incorrect if the
  255. * UTS namespace is cloned before the USER namespace is mapped.
  256. * I've also heard of similar problems with the network namespace
  257. * in some scenarios. This also mirrors how LXC deals with this
  258. * problem.
  259. */
  260. if (config.cloneflags & CLONE_NEWUSER) {
  261. if (unshare(CLONE_NEWUSER) < 0)
  262. bail("failed to unshare user namespace");
  263. config.cloneflags &= ~CLONE_NEWUSER;
  264. /*
  265. * We don't have the privileges to do any mapping here (see the
  266. * clone_parent rant). So signal our parent to hook us up.
  267. */
  268. /* Switching is only necessary if we joined namespaces. */
  269. if (config.namespaces) {
  270. if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
  271. bail("failed to set process as dumpable");
  272. }
  273. s = SYNC_USERMAP_PLS;
  274. if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  275. bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
  276. /* ... wait for mapping ... */
  277. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  278. bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
  279. if (s != SYNC_USERMAP_ACK)
  280. bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
  281. /* Switching is only necessary if we joined namespaces. */
  282. if (config.namespaces) {
  283. if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  284. bail("failed to set process as dumpable");
  285. }
  286. /* Become root in the namespace proper. */
  287. if (setresuid(0, 0, 0) < 0)
  288. bail("failed to become root in user namespace");
  289. }
  290. /*
  291. * Unshare all of the namespaces. Now, it should be noted that this
  292. * ordering might break in the future (especially with rootless
  293. * containers). But for now, it's not possible to split this into
  294. * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
  295. *
  296. * Note that we don't merge this with clone() because there were
  297. * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
  298. * was broken, so we'll just do it the long way anyway.
  299. */
  300. if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
  301. bail("failed to unshare namespaces");
  302. /*
  303. * TODO: What about non-namespace clone flags that we're dropping here?
  304. *
  305. * We fork again because of PID namespace, setns(2) or unshare(2) don't
  306. * change the PID namespace of the calling process, because doing so
  307. * would change the caller's idea of its own PID (as reported by getpid()),
  308. * which would break many applications and libraries, so we must fork
  309. * to actually enter the new PID namespace.
  310. */
  311. child = clone_parent(&env, JUMP_INIT);
  312. if (child < 0)
  313. bail("unable to fork: init_func");
  314. /* Send the child to our parent, which knows what it's doing. */
  315. s = SYNC_RECVPID_PLS;
  316. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  317. kill(child, SIGKILL);
  318. bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
  319. }
  320. if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
  321. kill(child, SIGKILL);
  322. bail("failed to sync with parent: write(childpid)");
  323. }
  324. /* ... wait for parent to get the pid ... */
  325. if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
  326. kill(child, SIGKILL);
  327. bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
  328. }
  329. if (s != SYNC_RECVPID_ACK) {
  330. kill(child, SIGKILL);
  331. bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
  332. }
  333. s = SYNC_CHILD_READY;
  334. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  335. kill(child, SIGKILL);
  336. bail("failed to sync with parent: write(SYNC_CHILD_READY)");
  337. }
  338. /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
  339. exit(0);
  340. }
  341. /*
  342. * Stage 2: We're the final child process, and the only process that will
  343. * actually return to the Go runtime. Our job is to just do the
  344. * final cleanup steps and then return to the Go runtime to allow
  345. * init_linux.go to run.
  346. */
  347. case JUMP_INIT:{
  348. /*
  349. * We're inside the child now, having jumped from the
  350. * start_child() code after forking in the parent.
  351. */
  352. enum sync_t s;
  353. /* We're in a child and thus need to tell the parent if we die. */
  354. syncfd = sync_grandchild_pipe[0];
  355. close(sync_grandchild_pipe[1]);
  356. close(sync_child_pipe[0]);
  357. close(sync_child_pipe[1]);
  358. /* For debugging. */
  359. prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
  360. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  361. bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
  362. if (s != SYNC_GRANDCHILD)
  363. bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
  364. if (setsid() < 0)
  365. bail("setsid failed");
  366. if (setuid(0) < 0)
  367. bail("setuid failed");
  368. if (setgid(0) < 0)
  369. bail("setgid failed");
  370. if (!config.is_rootless_euid && config.is_setgroup) {
  371. if (setgroups(0, NULL) < 0)
  372. bail("setgroups failed");
  373. }
  374. /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
  375. if (config.cloneflags & CLONE_NEWCGROUP) {
  376. uint8_t value;
  377. if (read(pipenum, &value, sizeof(value)) != sizeof(value))
  378. bail("read synchronisation value failed");
  379. if (value == CREATECGROUPNS) {
  380. if (unshare(CLONE_NEWCGROUP) < 0)
  381. bail("failed to unshare cgroup namespace");
  382. } else
  383. bail("received unknown synchronisation value");
  384. }
  385. s = SYNC_CHILD_READY;
  386. if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  387. bail("failed to sync with patent: write(SYNC_CHILD_READY)");
  388. /* Close sync pipes. */
  389. close(sync_grandchild_pipe[0]);
  390. /* Free netlink data. */
  391. nl_free(&config);
  392. /* Finish executing, let the Go runtime take over. */
  393. return;
  394. }
  395. default:
  396. bail("unexpected jump value");
  397. }
  398. /* Should never be reached. */
  399. bail("should never be reached");
  400. }

2.2.1.1.3.2) init.go#Action

  1. Action: func(context *cli.Context) error {
  2. factory, _ := libcontainer.New("")
  3. if err := factory.StartInitialization(); err != nil {
  4. // as the error is sent back to the parent there is no need to log
  5. // or write it to stderr because the parent process will handle this
  6. os.Exit(1)
  7. }
  8. panic("libcontainer: container init failed to exec")
  9. },
  1. // StartInitialization is an internal API to libcontainer used during the reexec of the
  2. // container.
  3. //
  4. // Errors:
  5. // Pipe connection error
  6. // System error
  7. StartInitialization() error

2.2.1.1.3.3) libcontainer/factory_linux.go#LinuxFactory.StartInitialization

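1、从环境变量中解析出init pipe的fd(_LIBCONTAINER_INITPIPE,即childPipe)、exec fifo的fd(_LIBCONTAINER_FIFOFD,仅standard类型的init才有)、console socket的fd(_LIBCONTAINER_CONSOLE,可选)以及initType(_LIBCONTAINER_INITTYPE,从下面的newContainerInit可以看到有standard和setns两种),并清除当前进程的所有环境变量。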
2、设置一个trap以及panic recover,如果初始化容器失败,会往childPipe中写入procError。
3、调用newContainerInit创建一个init对象(两种类型,standard or setns,下面以standard为例),首先从childPipe中获取config配置文件,从配置文件中读取环境变量并设置到当前进程。构造一个linuxStandardInit对象,主要包括pipe、consoleSocket、parentPid、fifoFd和config等字段。
4、调用linuxStandardInit对象的Init方法进行初始化。

  1. // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
  2. // This is a low level implementation detail of the reexec and should not be consumed externally
  3. func (l *LinuxFactory) StartInitialization() (err error) {
  4. var (
  5. pipefd, fifofd int
  6. consoleSocket *os.File
  7. envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
  8. envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
  9. envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
  10. )
  11. // Get the INITPIPE.
  12. pipefd, err = strconv.Atoi(envInitPipe)
  13. if err != nil {
  14. return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
  15. }
  16. var (
  17. pipe = os.NewFile(uintptr(pipefd), "pipe")
  18. it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
  19. )
  20. defer pipe.Close()
  21. // Only init processes have FIFOFD.
  22. fifofd = -1
  23. if it == initStandard {
  24. if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
  25. return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
  26. }
  27. }
  28. if envConsole != "" {
  29. console, err := strconv.Atoi(envConsole)
  30. if err != nil {
  31. return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
  32. }
  33. consoleSocket = os.NewFile(uintptr(console), "console-socket")
  34. defer consoleSocket.Close()
  35. }
  36. // clear the current process's environment to clean any libcontainer
  37. // specific env vars.
  38. os.Clearenv()
  39. defer func() {
  40. // We have an error during the initialization of the container's init,
  41. // send it back to the parent process in the form of an initError.
  42. if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
  43. fmt.Fprintln(os.Stderr, err)
  44. return
  45. }
  46. if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
  47. fmt.Fprintln(os.Stderr, err)
  48. return
  49. }
  50. }()
  51. defer func() {
  52. if e := recover(); e != nil {
  53. err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
  54. }
  55. }()
  56. // ********************************** NOTICE ********************************** //
  57. i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
  58. // ********************************** NOTICE ********************************** //
  59. if err != nil {
  60. return err
  61. }
  62. // ********************************** NOTICE ********************************** //
  63. // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
  64. return i.Init()
  65. // ********************************** NOTICE ********************************** //
  66. }

2.2.1.1.3.3.1) libcontainer/init_linux.go#newContainerInit

1、从pipe中读出initConfig
2、构造一个linuxStandardInit对象。

  1. func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
  2. var config *initConfig
  3. if err := json.NewDecoder(pipe).Decode(&config); err != nil {
  4. return nil, err
  5. }
  6. if err := populateProcessEnvironment(config.Env); err != nil {
  7. return nil, err
  8. }
  9. switch t {
  10. case initSetns:
  11. return &linuxSetnsInit{
  12. pipe: pipe,
  13. consoleSocket: consoleSocket,
  14. config: config,
  15. }, nil
  16. case initStandard:
  17. return &linuxStandardInit{
  18. pipe: pipe,
  19. consoleSocket: consoleSocket,
  20. parentPid: unix.Getppid(),
  21. config: config,
  22. fifoFd: fifoFd,
  23. }, nil
  24. }
  25. return nil, fmt.Errorf("unknown init type %q", t)
  26. }
  27. type linuxStandardInit struct {
  28. pipe *os.File
  29. consoleSocket *os.File
  30. parentPid int
  31. fifoFd int
  32. config *initConfig
  33. }

2.2.1.1.3.1.1.1) libcontainer/standard_init_linux.go#linuxStandardInit.Init

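1、首先是针对session keyring的一些配置:除非配置了NoNewKeyring,否则调用keys.JoinSessionKeyring加入一个新的session keyring,避免继承父进程的session keyring,随后调用keys.ModKeyringPerm修改其权限。这里的session指的是内核密钥管理机制(keyrings)中与会话关联的密钥环,而不是网络会话。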
2、配置console和tty。如果配置文件中指定有Console字段,则从该字段中获取tty的slave路径创建一个linuxConsole对象,调用其dupStdio打开slave设备,将其fd复制(dup3)到当前进程的标准IO。如果console对象创建好以后,便调用ioctl的TIOCSCTTY分配控制终端,这里应该是和4.3+BSD系统保持兼容。(关于tty和console的进一步内容,有时间转发一篇更详细的或者自己总结一篇也行,对这一部分也挺感兴趣)
3、调用setupNetwork配置容器的网络。奇怪网络不是在前面配置过了吗,还是调用同样的函数。。。存疑?
4、调用setupRoute配置容器的静态路由信息。
5、selinux,调用label.Init()检查selinux是否被启动以及是否检查过,并将结果存入全局变量。此处的label并非是用户label,而是selinux相关的processLabel。
6、调用prepareRootfs配置设备、挂载点以及文件系统;如果设置了mount namespace,则在console配置完成后再调用finalizeRootfs完成rootfs的收尾设置。
7、根据需要配置hostname、apparmor、processLabel、sysctl、readonlyPath、maskPath。这些都是一些feature,对容器启动本身没有太多影响。
8、获取父进程退出信号(parent death signal,即父进程退出时内核发给当前进程的信号)的当前设置。
9、通过管道与父进程进行同步,先发出procReady再等待procRun。
10、初始化seccomp。
11、调用finalizeNamespace根据config配置将需要的特权capabilities加入白名单,设置user namespace,关闭不需要的文件描述符。
12、恢复parent death信号(finalizeNamespace可能切换用户/组,从而清除该设置),并检查当前父进程pid是否为我们原来记录的。不是的话,kill ourself
13、检查config里面需要执行的命令是否存在。注意:create虽然不会执行命令,但是会检查命令路径是否正确,该错误类型也会在create期间返回。

  • 到此,与父进程之间的同步已经完成,关闭pipe。
  • 尝试以只写方式打开fifo管道,并往管道中写入“0” 。该操作会一直保持阻塞,直到管道的另一端以读方式打开,并读取内容。至此,create操作流程已经结束。ref : FIFO管道

14、下面实际上是start的时候才会触发的操作了,阻塞清除后,根据config配置初始化seccomp,并调用syscall.Exec执行config里面指定的命令(执行的从parent传过来的initConfig中的Args数组)。

如果是start,那么runc会以只读方式打开fifo管道并读取内容;如果读到的长度大于0,说明读到了Create流程中最后写入的“0”,同时也解除了Create流程中init进程的阻塞,使其继续执行最后调用用户进程的部分。
image.png

  1. func (l *linuxStandardInit) Init() error {
  2. runtime.LockOSThread()
  3. defer runtime.UnlockOSThread()
  4. if !l.config.Config.NoNewKeyring {
  5. ringname, keepperms, newperms := l.getSessionRingParams()
  6. // Do not inherit the parent's session keyring.
  7. if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
  8. // If keyrings aren't supported then it is likely we are on an
  9. // older kernel (or inside an LXC container). While we could bail,
  10. // the security feature we are using here is best-effort (it only
  11. // really provides marginal protection since VFS credentials are
  12. // the only significant protection of keyrings).
  13. //
  14. // TODO(cyphar): Log this so people know what's going on, once we
  15. // have proper logging in 'runc init'.
  16. if errors.Cause(err) != unix.ENOSYS {
  17. return errors.Wrap(err, "join session keyring")
  18. }
  19. } else {
  20. // Make session keyring searcheable. If we've gotten this far we
  21. // bail on any error -- we don't want to have a keyring with bad
  22. // permissions.
  23. if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
  24. return errors.Wrap(err, "mod keyring permissions")
  25. }
  26. }
  27. }
  28. if err := setupNetwork(l.config); err != nil {
  29. return err
  30. }
  31. if err := setupRoute(l.config.Config); err != nil {
  32. return err
  33. }
  34. label.Init()
  35. if err := prepareRootfs(l.pipe, l.config); err != nil {
  36. return err
  37. }
  38. // Set up the console. This has to be done *before* we finalize the rootfs,
  39. // but *after* we've given the user the chance to set up all of the mounts
  40. // they wanted.
  41. if l.config.CreateConsole {
  42. if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
  43. return err
  44. }
  45. if err := system.Setctty(); err != nil {
  46. return errors.Wrap(err, "setctty")
  47. }
  48. }
  49. // Finish the rootfs setup.
  50. if l.config.Config.Namespaces.Contains(configs.NEWNS) {
  51. if err := finalizeRootfs(l.config.Config); err != nil {
  52. return err
  53. }
  54. }
  55. if hostname := l.config.Config.Hostname; hostname != "" {
  56. if err := unix.Sethostname([]byte(hostname)); err != nil {
  57. return errors.Wrap(err, "sethostname")
  58. }
  59. }
  60. if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
  61. return errors.Wrap(err, "apply apparmor profile")
  62. }
  63. for key, value := range l.config.Config.Sysctl {
  64. if err := writeSystemProperty(key, value); err != nil {
  65. return errors.Wrapf(err, "write sysctl key %s", key)
  66. }
  67. }
  68. for _, path := range l.config.Config.ReadonlyPaths {
  69. if err := readonlyPath(path); err != nil {
  70. return errors.Wrapf(err, "readonly path %s", path)
  71. }
  72. }
  73. for _, path := range l.config.Config.MaskPaths {
  74. if err := maskPath(path, l.config.Config.MountLabel); err != nil {
  75. return errors.Wrapf(err, "mask path %s", path)
  76. }
  77. }
  78. pdeath, err := system.GetParentDeathSignal()
  79. if err != nil {
  80. return errors.Wrap(err, "get pdeath signal")
  81. }
  82. if l.config.NoNewPrivileges {
  83. if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
  84. return errors.Wrap(err, "set nonewprivileges")
  85. }
  86. }
  87. // Tell our parent that we're ready to Execv. This must be done before the
  88. // Seccomp rules have been applied, because we need to be able to read and
  89. // write to a socket.
  90. if err := syncParentReady(l.pipe); err != nil {
  91. return errors.Wrap(err, "sync ready")
  92. }
  93. if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
  94. return errors.Wrap(err, "set process label")
  95. }
  96. defer label.SetProcessLabel("")
  97. // Without NoNewPrivileges seccomp is a privileged operation, so we need to
  98. // do this before dropping capabilities; otherwise do it as late as possible
  99. // just before execve so as few syscalls take place after it as possible.
  100. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
  101. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  102. return err
  103. }
  104. }
  105. if err := finalizeNamespace(l.config); err != nil {
  106. return err
  107. }
  108. // finalizeNamespace can change user/group which clears the parent death
  109. // signal, so we restore it here.
  110. if err := pdeath.Restore(); err != nil {
  111. return errors.Wrap(err, "restore pdeath signal")
  112. }
  113. // Compare the parent from the initial start of the init process and make
  114. // sure that it did not change. if the parent changes that means it died
  115. // and we were reparented to something else so we should just kill ourself
  116. // and not cause problems for someone else.
  117. if unix.Getppid() != l.parentPid {
  118. return unix.Kill(unix.Getpid(), unix.SIGKILL)
  119. }
  120. // Check for the arg before waiting to make sure it exists and it is
  121. // returned as a create time error.
  122. name, err := exec.LookPath(l.config.Args[0])
  123. if err != nil {
  124. return err
  125. }
  126. // Close the pipe to signal that we have completed our init.
  127. l.pipe.Close()
  128. // Wait for the FIFO to be opened on the other side before exec-ing the
  129. // user process. We open it through /proc/self/fd/$fd, because the fd that
  130. // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
  131. // re-open an O_PATH fd through /proc.
  132. fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
  133. if err != nil {
  134. return newSystemErrorWithCause(err, "open exec fifo")
  135. }
  136. if _, err := unix.Write(fd, []byte("0")); err != nil {
  137. return newSystemErrorWithCause(err, "write 0 exec fifo")
  138. }
  139. // Close the O_PATH fifofd fd before exec because the kernel resets
  140. // dumpable in the wrong order. This has been fixed in newer kernels, but
  141. // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
  142. // N.B. the core issue itself (passing dirfds to the host filesystem) has
  143. // since been resolved.
  144. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
  145. unix.Close(l.fifoFd)
  146. // Set seccomp as close to execve as possible, so as few syscalls take
  147. // place afterward (reducing the amount of syscalls that users need to
  148. // enable in their seccomp profiles).
  149. if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
  150. if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
  151. return newSystemErrorWithCause(err, "init seccomp")
  152. }
  153. }
  154. if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
  155. return newSystemErrorWithCause(err, "exec user process")
  156. }
  157. return nil
  158. }

2.2.1.1.3.1.1.1.1) libcontainer/rootfs_linux.go#prepareRootfs
  1. // prepareRootfs sets up the devices, mount points, and filesystems for use
  2. // inside a new mount namespace. It doesn't set anything as read-only. You must call
  3. // finalizeRootfs after this function to finish setting up the rootfs.
  4. func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { // pipe is handed to syncParentHooks; iConfig supplies Config and Cwd; returns nil or the first error hit
  5. config := iConfig.Config
  6. if err := prepareRoot(config); err != nil {
  7. return newSystemErrorWithCause(err, "preparing rootfs")
  8. }
  9. hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP) // true when the configured namespaces include NEWCGROUP; forwarded to every mountToRootfs call
  10. setupDev := needsSetupDev(config) // computed once; gates device creation below and reOpenDevNull after the jail step
  11. for _, m := range config.Mounts { // per mount: premount commands, the mount itself, then postmount commands; any failure aborts
  12. for _, precmd := range m.PremountCmds {
  13. if err := mountCmd(precmd); err != nil {
  14. return newSystemErrorWithCause(err, "running premount command")
  15. }
  16. }
  17. if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
  18. return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
  19. }
  20. for _, postcmd := range m.PostmountCmds {
  21. if err := mountCmd(postcmd); err != nil {
  22. return newSystemErrorWithCause(err, "running postmount command")
  23. }
  24. }
  25. }
  26. if setupDev { // device nodes, ptmx and /dev symlinks are created only after all mounts succeeded
  27. if err := createDevices(config); err != nil {
  28. return newSystemErrorWithCause(err, "creating device nodes")
  29. }
  30. if err := setupPtmx(config); err != nil {
  31. return newSystemErrorWithCause(err, "setting up ptmx")
  32. }
  33. if err := setupDevSymlinks(config.Rootfs); err != nil {
  34. return newSystemErrorWithCause(err, "setting up /dev symlinks")
  35. }
  36. }
  37. // Signal the parent to run the pre-start hooks.
  38. // The hooks are run after the mounts are set up, but before we switch to the new
  39. // root, so that the old root is still available in the hooks for any mount
  40. // manipulations.
  41. // Note that iConfig.Cwd is not guaranteed to exist here.
  42. if err := syncParentHooks(pipe); err != nil {
  43. return err // returned as-is, without a newSystemError wrapper
  44. }
  45. // The reason these operations are done here rather than in finalizeRootfs
  46. // is because the console-handling code gets quite sticky if we have to set
  47. // up the console before doing the pivot_root(2). This is because the
  48. // Console API has to also work with the ExecIn case, which means that the
  49. // API must be able to deal with being inside as well as outside the
  50. // container. It's just cleaner to do this here (at the expense of the
  51. // operation not being perfectly split).
  52. if err := unix.Chdir(config.Rootfs); err != nil {
  53. return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
  54. }
  55. if config.NoPivotRoot { // pick the jail mechanism: msMoveRoot when NoPivotRoot is set
  56. err = msMoveRoot(config.Rootfs)
  57. } else if config.Namespaces.Contains(configs.NEWNS) { // pivotRoot when NEWNS is among the configured namespaces
  58. err = pivotRoot(config.Rootfs)
  59. } else { // otherwise fall back to chroot
  60. err = chroot(config.Rootfs)
  61. }
  62. if err != nil { // err is the named result assigned by whichever branch ran above
  63. return newSystemErrorWithCause(err, "jailing process inside rootfs")
  64. }
  65. if setupDev {
  66. if err := reOpenDevNull(); err != nil {
  67. return newSystemErrorWithCause(err, "reopening /dev/null inside container")
  68. }
  69. }
  70. if cwd := iConfig.Cwd; cwd != "" { // the working directory is created only when one is configured
  71. // Note that spec.Process.Cwd can contain an unclean value such as "../../../../foo/bar...".
  72. // However, it is safe to call os.MkdirAll directly because we are already inside the jail here.
  73. if err := os.MkdirAll(cwd, 0755); err != nil {
  74. return err // returned as-is, without a newSystemError wrapper
  75. }
  76. }
  77. return nil
  78. }

2.2.1.1.4) libcontainer/container_linux.go#linuxContainer.updateState

  1. func (c *linuxContainer) updateState(process parentProcess) (*State, error) { // updateState recomputes the container state, persists it via saveState and returns it
  2. if process != nil { // a nil process leaves c.initProcess untouched
  3. c.initProcess = process
  4. }
  5. state, err := c.currentState() // the state is computed after c.initProcess has (possibly) been replaced
  6. if err != nil {
  7. return nil, err
  8. }
  9. err = c.saveState(state) // persist before returning, so a non-nil result has already been written out
  10. if err != nil {
  11. return nil, err
  12. }
  13. return state, nil
  14. }
  15. func (c *linuxContainer) saveState(s *State) error { // saveState writes s as JSON to the file named stateFilename inside c.root
  16. f, err := os.Create(filepath.Join(c.root, stateFilename)) // os.Create truncates an existing file; NOTE(review): the write is in place, so a concurrent reader could presumably observe a partial file — confirm how callers serialize access
  17. if err != nil {
  18. return err
  19. }
  20. defer f.Close() // the error from Close is discarded
  21. return utils.WriteJSON(f, s) // the result of utils.WriteJSON is the function's return value
  22. }
  23. const stateFilename = "state.json" // name of the file that saveState creates under c.root