缘起

最近阅读<> (刘金亮, 2021.1)
本系列笔记拟采用golang练习之

案例需求(聊天服务器)

  • 用户可以连接到服务器。
  • 用户可以设定自己的用户名。
  • 用户可以向服务器发送消息,同时服务器也会向其他用户广播该消息。

目标(Day 3)

  • 在Day 2中, 已基本完成功能性编码, 测试也通过了, 今天尝试压测, 并诊断是否存在内存和routine泄漏

过程

  • 修改单元测试代码, 并发客户端增加到50个, 每个客户端随机1-3秒发送一条消息
  • 添加ILoggingService, 以更方便的采集全局日志
  • 使用pprof包的api, 采集heap和goroutine的诊断日志
  • 使用go tool pprof <诊断日志>, 观察是否存在内存和routine泄漏
  • 发现单元测试总是failed, 总有若干tChatClient, 服务端未打印关闭日志
  • 查看routine诊断日志, 发现存在routine泄漏
  • 在tChatClient中添加更详细的函数调用日志, 发现是closeChan通道写入阻塞导致
  • 修复routine泄漏的bug, 加大并发客户端到500个, 并进行多轮测试, 观察是否存在内存和routine泄漏

单元测试

  • ChatServer_test.go
  • 增大客户端并发数, 并添加pprof诊断
  • 诊断ILoggingService.AllLogs(), 判断所有客户端是否完成连接/断开的闭环 ```go package chat_server

import ( “fmt” cs “learning/gooop/chat_server” “math/rand” “os” “runtime” “runtime/pprof” “strings” “sync” “testing” “time” )

func Test_ChatServer(t *testing.T) { fnAssertTrue := func(b bool, msg string) { if !b { t.Fatal(msg) } }

  1. // create dump file
  2. memProfile, err := os.Create("/home/ioly/chat_server_mem.profile")
  3. if err != nil {
  4. t.Fatal(err)
  5. }
  6. cpuProfile, err := os.Create("/home/ioly/chat_server_cpu.profile")
  7. if err != nil {
  8. t.Fatal(err)
  9. }
  10. rtnProfile, err := os.Create("/home/ioly/chat_server_routine.profile")
  11. if err != nil {
  12. t.Fatal(err)
  13. }
  14. err = pprof.StartCPUProfile(cpuProfile)
  15. if err != nil {
  16. t.Fatal(err)
  17. }
  18. defer pprof.StopCPUProfile()
  19. port := 3333
  20. server := cs.NewChatServer()
  21. err = server.Open(port)
  22. if err != nil {
  23. t.Fatal(err)
  24. }
  25. clientCount := 500
  26. wg := &sync.WaitGroup{}
  27. rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
  28. address := fmt.Sprintf("localhost:%v", port)
  29. for i := 0;i < clientCount;i++ {
  30. err, client := cs.DialChatClient(address)
  31. if err != nil {
  32. t.Fatal(err)
  33. }
  34. id := fmt.Sprintf("c%02d", i)
  35. //client.RecvHandler(func(client cs.IChatClient, msg cs.IMsg) {
  36. // t.Logf("%v recv: %v\n", id, msg)
  37. //})
  38. wg.Add(1)
  39. go func() {
  40. client.SetName(id)
  41. client.Send(&cs.NameMsg{id })
  42. n := 0
  43. duration := rnd.Intn(3) + 1
  44. for range time.Tick(time.Duration(duration) * time.Second) {
  45. client.Send(&cs.ChatMsg{id, fmt.Sprintf("msg %02d from %v", n, id) })
  46. n++
  47. if n > 10 {
  48. break
  49. }
  50. }
  51. client.Close()
  52. wg.Done()
  53. }()
  54. }
  55. // wait and set status
  56. done := []bool{false}
  57. go func() {
  58. wg.Wait()
  59. done[0] = true
  60. }()
  61. // print passed seconds
  62. passedSeconds := 0
  63. for range time.Tick(time.Second) {
  64. passedSeconds++
  65. t.Logf("%v seconds passed", passedSeconds)
  66. if done[0] {
  67. break
  68. }
  69. }
  70. time.Sleep(5*time.Second)
  71. server.Close()
  72. // dump heap file
  73. runtime.GC()
  74. err = pprof.Lookup("heap").WriteTo(memProfile, 0)
  75. if err != nil {
  76. t.Fatal(err)
  77. }
  78. err = pprof.Lookup("goroutine").WriteTo(rtnProfile, 0)
  79. if err != nil {
  80. t.Fatal(err)
  81. }
  82. // check server logs
  83. logs := cs.Logging.AllLogs()
  84. fnHasLog := func(log string) bool {
  85. for _,it := range logs {
  86. if strings.Contains(it, log) {
  87. return true
  88. }
  89. }
  90. return false
  91. }
  92. for i := 0;i < clientCount;i++ {
  93. msg := fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=false", i)
  94. fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)
  95. msg = fmt.Sprintf("tChatServer.handleIncomingConn, clientCount=%v", i + 1)
  96. fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)
  97. msg = fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=true", i)
  98. fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)
  99. msg = fmt.Sprintf("tChatServer.handleClientClosed, c%02d", i)
  100. fnAssertTrue(fnHasLog(msg), "expecting log: " + msg)
  101. }

}

  1. <a name="6RyoQ"></a>
  2. # 测试输出
  3. 并发500, 仅末尾部分

tChatClient.beginWrite, write error, c496, serverFlag=true tChatClient.beginWrite, <- closeChan, c496, serverFlag=true tChatClient.postConnClosed, c496, serverFlag=true tChatServer.handleClientClosed, c496 tChatServer.handleClientClosed, c496, clientCount=1 tChatClient.closeConn, c498, serverFlag=false tChatClient.beginWrite, <- closeChan, c498, serverFlag=false tChatClient.beginRead, read error, c498, serverFlag=false tChatClient.postConnClosed, c498, serverFlag=false tChatClient.beginRead, read error, c498, serverFlag=true tChatClient.closeConn, c498, serverFlag=true tChatClient.beginWrite, <- closeChan, c498, serverFlag=true tChatClient.postConnClosed, c498, serverFlag=true tChatServer.handleClientClosed, c498 tChatServer.handleClientClosed, c498, clientCount=0 ChatServer_test.go:97: 34 seconds passed —- PASS: Test_ChatServer (39.50s) PASS ok command-line-arguments 39.535s

  1. <a name="5WFK6"></a>
  2. # 内存泄漏诊断
  3. 无泄漏

$ go tool pprof ~/chat_server_mem.profile File: chat_server.test Type: inuse_space Time: Mar 9, 2021 at 8:56am (CST) Entering interactive mode (type “help” for commands, “o” for options) (pprof) top Showing nodes accounting for 8461.35kB, 100% of 8461.35kB total Showing top 10 nodes out of 15 flat flat% sum% cum cum% 5720.44kB 67.61% 67.61% 5720.44kB 67.61% time.startTimer 1184.27kB 14.00% 81.60% 1184.27kB 14.00% runtime/pprof.StartCPUProfile 1024.38kB 12.11% 93.71% 1024.38kB 12.11% runtime.malg 532.26kB 6.29% 100% 532.26kB 6.29% runtime/pprof.(profMap).lookup 0 0% 100% 1184.27kB 14.00% command-line-arguments.Test_ChatServer 0 0% 100% 5720.44kB 67.61% learning/gooop/chat_server.(tChatClient).beginWrite 0 0% 100% 1024.38kB 12.11% runtime.mstart 0 0% 100% 1024.38kB 12.11% runtime.newproc.func1 0 0% 100% 1024.38kB 12.11% runtime.newproc1 0 0% 100% 1024.38kB 12.11% runtime.systemstack (pprof) quit

  1. <a name="cFdMf"></a>
  2. # routine泄漏诊断(修复前)
  3. 有泄漏时的诊断日志(并发50)

$ go tool pprof ~/chat_server_routine.profile File: chat_server.test Type: goroutine Time: Mar 9, 2021 at 9:12am (CST) Entering interactive mode (type “help” for commands, “o” for options) (pprof) top Showing nodes accounting for 4, 100% of 4 total Showing top 10 nodes out of 21 flat flat% sum% cum cum% 3 75.00% 75.00% 3 75.00% runtime.gopark 1 25.00% 100% 1 25.00% runtime/pprof.runtime_goroutineProfileWithLabels 0 0% 100% 1 25.00% command-line-arguments.Test_ChatServer 0 0% 100% 1 25.00% learning/gooop/chat_server.(tChatClient).beginWrite 0 0% 100% 1 25.00% learning/gooop/chat_server.(tChatClient).closeConn 0 0% 100% 1 25.00% main.main 0 0% 100% 1 25.00% runtime.chanrecv 0 0% 100% 1 25.00% runtime.chanrecv1 0 0% 100% 1 25.00% runtime.chansend 0 0% 100% 1 25.00% runtime.chansend1 (pprof)

  1. 可以看到closeConn函数阻塞
  2. ```go
  3. func (me *tChatClient) closeConn() {
  4. if !atomic.CompareAndSwapInt32(&me.closeFlag, 0, 1) {
  5. return
  6. }
  7. Logging.Logf("tChatClient.closeConn, %v, serverFlag=%v", me.name, me.serverFlag)
  8. me.closeChan <- true
  9. }

从代码看, closeConn函数阻塞只有一种可能, 就是closeChan阻塞.
这个原因有点诡异, 因为已经加了atomic.CompareAndSwapInt32保护,
仍然导致阻塞, 唯一原因是并发环境下, closeChan尚未开始读取, 是无法写入的.

修复routine泄漏

就是给导致写阻塞的closeChan加上缓冲, 大小等于closeConn函数的可能调用入口数量

  1. func openChatClient(conn net.Conn, serverFlag bool) IChatClient {
  2. it := &tChatClient{
  3. conn: conn,
  4. openFlag: 0,
  5. closeFlag: 0,
  6. serverFlag: serverFlag,
  7. closeChan: make(chan bool, 3), // 给closeChan添加缓冲区!!
  8. sendChan: make(chan IMsg, gMaxPendingSend),
  9. name: "anonymous",
  10. sendLogs: []IMsg{},
  11. dropLogs: []IMsg{},
  12. recvLogs: []IMsg{},
  13. }
  14. it.open()
  15. return it
  16. }

routine泄漏诊断(修复后)

修复后, 加大并发到500, 未发现明显routine泄漏(不存在业务代码导致挂起的routine)

  1. $ go tool pprof ~/chat_server_routine.profile
  2. File: chat_server.test
  3. Type: goroutine
  4. Time: Mar 9, 2021 at 9:28am (CST)
  5. Entering interactive mode (type "help" for commands, "o" for options)
  6. (pprof) top
  7. Showing nodes accounting for 3, 100% of 3 total
  8. Showing top 10 nodes out of 17
  9. flat flat% sum% cum cum%
  10. 2 66.67% 66.67% 2 66.67% runtime.gopark
  11. 1 33.33% 100% 1 33.33% runtime/pprof.runtime_goroutineProfileWithLabels
  12. 0 0% 100% 1 33.33% command-line-arguments.Test_ChatServer
  13. 0 0% 100% 1 33.33% main.main
  14. 0 0% 100% 1 33.33% runtime.chanrecv
  15. 0 0% 100% 1 33.33% runtime.chanrecv1
  16. 0 0% 100% 1 33.33% runtime.main
  17. 0 0% 100% 1 33.33% runtime/pprof.(*Profile).WriteTo
  18. 0 0% 100% 1 33.33% runtime/pprof.profileWriter
  19. 0 0% 100% 1 33.33% runtime/pprof.writeGoroutine
  20. (pprof)

(end)