the OCI runtime will proxy the socket into the container to receive ready notification.

podman原理:

通过python测试NOTIFY_SOCKET. (service文件可以放到 /usr/lib/systemd/system/ 下测试)
https://stackoverflow.com/questions/63540832/how-to-run-a-service-running-in-a-container-in-systemd-including-systemd-notify
image.png

podman是自己建立一套NOTIFY_SOCKET的机制,没有复用runc的能力。

conmon通过socket读取 /var/lib/containers/storage/overlay-containers/c2184b9d7b043500a54374fe2a19ede77a1d9e06e51fc45df74e2675f2a745ba/userdata/notify/notify.sock,同时该socket文件会mount到容器里的/run/notify/notify.sock(容器内进程会写这个socket,因为这个socket会设置为NOTIFY_SOCKET)

当conmon读取到READY=1后,会向/run/systemd/notify写入READY=1

podman会把MAINPID设置为conmon进程的PID

  // Excerpt from inside a larger function: unless the container's sd-notify mode
  // is "ignore", send systemd a MAINPID= message carrying the conmon PID.
  1. if c.config.SdNotifyMode != define.SdNotifyModeIgnore {
  2. payload := fmt.Sprintf("MAINPID=%d", c.state.ConmonPID)
  3. if c.config.SdNotifyMode == define.SdNotifyModeConmon {
  // In "conmon" mode the ready marker is appended to the same payload on its own line.
  // NOTE(review): daemon.SdNotifyReady is presumably "READY=1" - confirm.
  4. payload += "\n"
  5. payload += daemon.SdNotifyReady
  6. }
  // A failed notification is only logged; it does not abort the caller.
  // NOTE(review): `sent` presumably reports whether a notify socket was available - confirm.
  7. if sent, err := daemon.SdNotify(false, payload); err != nil {
  8. logrus.Errorf("Notifying systemd of Conmon PID: %s", err.Error())
  9. } else if sent {
  10. logrus.Debugf("Notify sent successfully")
  11. }
  12. }

runc原理:

runc和podman是两套机制。

runc作为 OCI runtime,会启动一个goroutine来代理NOTIFY_SOCKET的处理。

notify_socket.go

  // notifySocket proxies a systemd readiness notification from a container to
  // the host's NOTIFY_SOCKET.
  type notifySocket struct {
  	// socket listens on socketPath outside the container; a program inside
  	// the container sends "READY=1" to it.
  	socket *net.UnixConn
  	// host is the socket named by the NOTIFY_SOCKET environment variable (a
  	// path outside the container). Once a "READY=" line has been read from
  	// socket, that line and the MAINPID are sent here.
  	host string
  	// socketPath is the socket file backing socket. Outside the container it
  	// is /$rootfs/notify/notify.sock; the file is bind-mounted into the
  	// container at /run/notify/notify.sock.
  	socketPath string
  }

  // run forwards the container's readiness notification to the host socket.
  //
  // It waits for a datagram on n.socket that contains a line starting with
  // "READY=", writes that line (newline-terminated) to n.host, and then writes
  // "MAINPID=<pid1>\n" so the receiver monitors pid1. It returns nil without
  // forwarding anything when n.socket is nil or when /proc/<pid1> can no
  // longer be stat'ed. Errors from dialing or writing to n.host are returned
  // unchanged.
  func (n *notifySocket) run(pid1 int) error {
  	if n.socket == nil {
  		return nil
  	}
  	hostAddr := net.UnixAddr{Name: n.host, Net: "unixgram"}
  	client, err := net.DialUnix("unixgram", nil, &hostAddr)
  	if err != nil {
  		return err
  	}
  	// The host connection is only needed for the duration of this call.
  	defer client.Close()

  	ticker := time.NewTicker(100 * time.Millisecond)
  	defer ticker.Stop()

  	// Receive the "READY=..." line sent by the program inside the container.
  	// The channel is buffered so the reader goroutine can hand over its
  	// result and exit even if run has already returned because pid1 is gone.
  	readyCh := make(chan []byte, 1)
  	go func() {
  		// NOTE(review): this goroutine stays blocked in Read until n.socket
  		// is closed; confirm the owner of the socket closes it.
  		buf := make([]byte, 4096)
  		for {
  			r, err := n.socket.Read(buf)
  			if err != nil {
  				return
  			}
  			// The state string arrives as a single datagram, so partial
  			// messages are not a concern. The datagram may carry several
  			// newline-separated assignments, so each line is checked on
  			// its own rather than only the start of the datagram.
  			for _, line := range bytes.Split(buf[:r], []byte{'\n'}) {
  				if bytes.HasPrefix(line, []byte("READY=")) {
  					readyCh <- line
  					return
  				}
  			}
  		}
  	}()

  	// Send "READY=...\n" followed by "MAINPID=<pid1>\n" to the host socket.
  	for {
  		select {
  		case <-ticker.C:
  			// Stop waiting once pid1 no longer has a /proc entry.
  			if _, err := os.Stat(filepath.Join("/proc", strconv.Itoa(pid1))); err != nil {
  				return nil
  			}
  		case ready := <-readyCh:
  			msg := make([]byte, 0, len(ready)+1)
  			msg = append(msg, ready...)
  			msg = append(msg, '\n')
  			if _, err := client.Write(msg); err != nil {
  				return err
  			}
  			// Now tell the receiver to use the container's pid 1 as the
  			// MAINPID to monitor.
  			mainPID := "MAINPID=" + strconv.Itoa(pid1) + "\n"
  			if _, err := client.Write([]byte(mainPID)); err != nil {
  				return err
  			}
  			return nil
  		}
  	}
  }