the OCI runtime will proxy the socket into the container to receive ready notification.
podman原理:
通过python测试NOTIFY_SOCKET. (service文件可以放到 /usr/lib/systemd/system/ 下测试)
https://stackoverflow.com/questions/63540832/how-to-run-a-service-running-in-a-container-in-systemd-including-systemd-notify
podman自己实现了一套NOTIFY_SOCKET机制,没有复用runc的能力。
conmon通过socket读取 /var/lib/containers/storage/overlay-containers/c2184b9d7b043500a54374fe2a19ede77a1d9e06e51fc45df74e2675f2a745ba/userdata/notify/notify.sock,同时该socket文件会mount到容器里的/run/notify/notify.sock(容器内进程会写这个socket,因为这个socket会设置为NOTIFY_SOCKET)
当conmon从该socket读取到READY=1后,会向/run/systemd/notify写入READY=1。
podman会把MAINPID设置为conmon进程的PID
// Unless sd-notify handling is set to "ignore", report the conmon PID to
// systemd as MAINPID. In "conmon" mode the ready state is appended to the
// same payload.
if c.config.SdNotifyMode != define.SdNotifyModeIgnore {
	msg := fmt.Sprintf("MAINPID=%d", c.state.ConmonPID)
	if c.config.SdNotifyMode == define.SdNotifyModeConmon {
		msg += "\n" + daemon.SdNotifyReady
	}

	sent, err := daemon.SdNotify(false, msg)
	switch {
	case err != nil:
		// A failed notification is only logged, not propagated.
		logrus.Errorf("Notifying systemd of Conmon PID: %s", err.Error())
	case sent:
		logrus.Debugf("Notify sent successfully")
	}
}
runc原理:
runc和podman是两套机制。
runc作为 OCI runtime,会启动一个goroutine代理NOTIFY_SOCKET的处理。
notify_socket.go
// notifySocket holds the sockets and paths used to relay a container's
// readiness notification to the host-side NOTIFY_SOCKET.
type notifySocket struct {
socket *net.UnixConn // listens on socketPath outside the container; the program inside the container sends "READY=1" to this socket
host string // socket taken from the NOTIFY_SOCKET environment variable (a path outside the container); once "READY=" has been read from socket, "READY=" and the pid are sent to it
socketPath string // socket file used by notifySocket.socket; outside the container it is /$rootfs/notify/notify.sock, and it is bind-mounted into the container at /run/notify/notify.sock
}
func (n *notifySocket) run(pid1 int) error {
if n.socket == nil {
return nil
}
notifySocketHostAddr := net.UnixAddr{Name: n.host, Net: "unixgram"}
client, err := net.DialUnix("unixgram", nil, ¬ifySocketHostAddr)
if err != nil {
return err
}
ticker := time.NewTicker(time.Millisecond * 100)
defer ticker.Stop()
// 获取容器内程序发送的 "READY=1"
fileChan := make(chan []byte)
go func() {
for {
buf := make([]byte, 4096)
r, err := n.socket.Read(buf)
if err != nil {
return
}
got := buf[0:r]
// systemd-ready sends a single datagram with the state string as payload,
// so we don't need to worry about partial messages.
for _, line := range bytes.Split(got, []byte{'\n'}) {
if bytes.HasPrefix(got, []byte("READY=")) {
fileChan <- line
return
}
}
}
}()
// 向NOTIFY_SOCKET对应的socket发送 "READY=1\nMAINPID=$pid"
for {
select {
case <-ticker.C:
_, err := os.Stat(filepath.Join("/proc", strconv.Itoa(pid1)))
if err != nil {
return nil
}
case b := <-fileChan:
var out bytes.Buffer
_, err = out.Write(b)
if err != nil {
return err
}
_, err = out.Write([]byte{'\n'})
if err != nil {
return err
}
_, err = client.Write(out.Bytes())
if err != nil {
return err
}
// now we can inform systemd to use pid1 as the pid to monitor
// 使用容器的1号进程作为systemd的MAINPID
newPid := "MAINPID=" + strconv.Itoa(pid1)
_, err := client.Write([]byte(newPid + "\n"))
if err != nil {
return err
}
return nil
}
}
}