缘起
最近阅读<
本系列笔记拟采用golang练习之
案例需求(聊天服务器)
- 用户可以连接到服务器。
- 用户可以设定自己的用户名。
- 用户可以向服务器发送消息,同时服务器也会向其他用户广播该消息。
目标(Day 3)
- 在Day 2中, 已基本完成功能性编码, 测试也通过了, 今天尝试压测, 并诊断是否存在内存和routine泄漏
过程
- 修改单元测试代码, 并发客户端增加到50个, 每个客户端随机1-3秒发送一条消息
- 添加ILoggingService, 以更方便的采集全局日志
- 使用pprof包的api, 采集heap和goroutine的诊断日志
- 使用go tool pprof <诊断日志>, 观察是否存在内存和routine泄漏
- 发现单元测试总是failed, 总有若干tChatClient, 服务端未打印关闭日志
- 查看routine诊断日志, 发现存在routine泄漏
- 在tChatClient中添加更详细的函数调用日志, 发现是closeChan通道写入阻塞导致
- 修复routine泄漏的bug, 加大并发客户端到500个, 并进行多轮测试, 观察是否存在内存和routine泄漏
单元测试
- ChatServer_test.go
- 增大客户端并发数, 并添加pprof诊断
- 诊断ILoggingService.AllLogs(), 判断所有客户端是否完成连接/断开的闭环 ```go package chat_server
import ( “fmt” cs “learning/gooop/chat_server” “math/rand” “os” “runtime” “runtime/pprof” “strings” “sync” “testing” “time” )
func Test_ChatServer(t *testing.T) { fnAssertTrue := func(b bool, msg string) { if !b { t.Fatal(msg) } }
// create dump filememProfile, err := os.Create("/home/ioly/chat_server_mem.profile")if err != nil {t.Fatal(err)}cpuProfile, err := os.Create("/home/ioly/chat_server_cpu.profile")if err != nil {t.Fatal(err)}rtnProfile, err := os.Create("/home/ioly/chat_server_routine.profile")if err != nil {t.Fatal(err)}err = pprof.StartCPUProfile(cpuProfile)if err != nil {t.Fatal(err)}defer pprof.StopCPUProfile()port := 3333server := cs.NewChatServer()err = server.Open(port)if err != nil {t.Fatal(err)}clientCount := 500wg := &sync.WaitGroup{}rnd := rand.New(rand.NewSource(time.Now().UnixNano()))address := fmt.Sprintf("localhost:%v", port)for i := 0;i < clientCount;i++ {err, client := cs.DialChatClient(address)if err != nil {t.Fatal(err)}id := fmt.Sprintf("c%02d", i)//client.RecvHandler(func(client cs.IChatClient, msg cs.IMsg) {// t.Logf("%v recv: %v\n", id, msg)//})wg.Add(1)go func() {client.SetName(id)client.Send(&cs.NameMsg{id })n := 0duration := rnd.Intn(3) + 1for range time.Tick(time.Duration(duration) * time.Second) {client.Send(&cs.ChatMsg{id, fmt.Sprintf("msg %02d from %v", n, id) })n++if n > 10 {break}}client.Close()wg.Done()}()}// wait and set statusdone := []bool{false}go func() {wg.Wait()done[0] = true}()// print passed secondspassedSeconds := 0for range time.Tick(time.Second) {passedSeconds++t.Logf("%v seconds passed", passedSeconds)if done[0] {break}}time.Sleep(5*time.Second)server.Close()// dump heap fileruntime.GC()err = pprof.Lookup("heap").WriteTo(memProfile, 0)if err != nil {t.Fatal(err)}err = pprof.Lookup("goroutine").WriteTo(rtnProfile, 0)if err != nil {t.Fatal(err)}// check server logslogs := cs.Logging.AllLogs()fnHasLog := func(log string) bool {for _,it := range logs {if strings.Contains(it, log) {return true}}return false}for i := 0;i < clientCount;i++ {msg := fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=false", i)fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)msg = fmt.Sprintf("tChatServer.handleIncomingConn, clientCount=%v", i + 1)fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)msg = fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=true", i)fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)msg = fmt.Sprintf("tChatServer.handleClientClosed, c%02d", i)fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)}
}
<a name="6RyoQ"></a># 测试输出并发500, 仅末尾部分
tChatClient.beginWrite, write error, c496, serverFlag=true tChatClient.beginWrite, <- closeChan, c496, serverFlag=true tChatClient.postConnClosed, c496, serverFlag=true tChatServer.handleClientClosed, c496 tChatServer.handleClientClosed, c496, clientCount=1 tChatClient.closeConn, c498, serverFlag=false tChatClient.beginWrite, <- closeChan, c498, serverFlag=false tChatClient.beginRead, read error, c498, serverFlag=false tChatClient.postConnClosed, c498, serverFlag=false tChatClient.beginRead, read error, c498, serverFlag=true tChatClient.closeConn, c498, serverFlag=true tChatClient.beginWrite, <- closeChan, c498, serverFlag=true tChatClient.postConnClosed, c498, serverFlag=true tChatServer.handleClientClosed, c498 tChatServer.handleClientClosed, c498, clientCount=0 ChatServer_test.go:97: 34 seconds passed —- PASS: Test_ChatServer (39.50s) PASS ok command-line-arguments 39.535s
<a name="5WFK6"></a># 内存泄漏诊断无泄漏
$ go tool pprof ~/chat_server_mem.profile File: chat_server.test Type: inuse_space Time: Mar 9, 2021 at 8:56am (CST) Entering interactive mode (type “help” for commands, “o” for options) (pprof) top Showing nodes accounting for 8461.35kB, 100% of 8461.35kB total Showing top 10 nodes out of 15 flat flat% sum% cum cum% 5720.44kB 67.61% 67.61% 5720.44kB 67.61% time.startTimer 1184.27kB 14.00% 81.60% 1184.27kB 14.00% runtime/pprof.StartCPUProfile 1024.38kB 12.11% 93.71% 1024.38kB 12.11% runtime.malg 532.26kB 6.29% 100% 532.26kB 6.29% runtime/pprof.(profMap).lookup 0 0% 100% 1184.27kB 14.00% command-line-arguments.Test_ChatServer 0 0% 100% 5720.44kB 67.61% learning/gooop/chat_server.(tChatClient).beginWrite 0 0% 100% 1024.38kB 12.11% runtime.mstart 0 0% 100% 1024.38kB 12.11% runtime.newproc.func1 0 0% 100% 1024.38kB 12.11% runtime.newproc1 0 0% 100% 1024.38kB 12.11% runtime.systemstack (pprof) quit
<a name="cFdMf"></a># routine泄漏诊断(修复前)有泄漏时的诊断日志(并发50)
$ go tool pprof ~/chat_server_routine.profile File: chat_server.test Type: goroutine Time: Mar 9, 2021 at 9:12am (CST) Entering interactive mode (type “help” for commands, “o” for options) (pprof) top Showing nodes accounting for 4, 100% of 4 total Showing top 10 nodes out of 21 flat flat% sum% cum cum% 3 75.00% 75.00% 3 75.00% runtime.gopark 1 25.00% 100% 1 25.00% runtime/pprof.runtime_goroutineProfileWithLabels 0 0% 100% 1 25.00% command-line-arguments.Test_ChatServer 0 0% 100% 1 25.00% learning/gooop/chat_server.(tChatClient).beginWrite 0 0% 100% 1 25.00% learning/gooop/chat_server.(tChatClient).closeConn 0 0% 100% 1 25.00% main.main 0 0% 100% 1 25.00% runtime.chanrecv 0 0% 100% 1 25.00% runtime.chanrecv1 0 0% 100% 1 25.00% runtime.chansend 0 0% 100% 1 25.00% runtime.chansend1 (pprof)
可以看到closeConn函数阻塞```gofunc (me *tChatClient) closeConn() {if !atomic.CompareAndSwapInt32(&me.closeFlag, 0, 1) {return}Logging.Logf("tChatClient.closeConn, %v, serverFlag=%v", me.name, me.serverFlag)me.closeChan <- true}
从代码看, closeConn函数阻塞只有一种可能, 就是closeChan阻塞.
这个原因有点诡异, 因为已经加了atomic.CompareAndSwapInt32保护,
仍然导致阻塞, 唯一原因是并发环境下, closeChan尚未开始读取, 是无法写入的.
修复routine泄漏
就是给导致写阻塞的closeChan加上缓冲, 大小等于closeConn函数的可能调用入口数量
func openChatClient(conn net.Conn, serverFlag bool) IChatClient {it := &tChatClient{conn: conn,openFlag: 0,closeFlag: 0,serverFlag: serverFlag,closeChan: make(chan bool, 3), // 给closeChan添加缓冲区!!sendChan: make(chan IMsg, gMaxPendingSend),name: "anonymous",sendLogs: []IMsg{},dropLogs: []IMsg{},recvLogs: []IMsg{},}it.open()return it}
routine泄漏诊断(修复后)
修复后, 加大并发到500, 未发现明显routine泄漏(不存在业务代码导致挂起的routine)
$ go tool pprof ~/chat_server_routine.profileFile: chat_server.testType: goroutineTime: Mar 9, 2021 at 9:28am (CST)Entering interactive mode (type "help" for commands, "o" for options)(pprof) topShowing nodes accounting for 3, 100% of 3 totalShowing top 10 nodes out of 17flat flat% sum% cum cum%2 66.67% 66.67% 2 66.67% runtime.gopark1 33.33% 100% 1 33.33% runtime/pprof.runtime_goroutineProfileWithLabels0 0% 100% 1 33.33% command-line-arguments.Test_ChatServer0 0% 100% 1 33.33% main.main0 0% 100% 1 33.33% runtime.chanrecv0 0% 100% 1 33.33% runtime.chanrecv10 0% 100% 1 33.33% runtime.main0 0% 100% 1 33.33% runtime/pprof.(*Profile).WriteTo0 0% 100% 1 33.33% runtime/pprof.profileWriter0 0% 100% 1 33.33% runtime/pprof.writeGoroutine(pprof)
(end)
