微信公众号:运维开发故事,作者:华仔

一、前言

为什么想到要用golang来编写 metrics 呢?这主要是我们的一个客户那里,k8s 网络使用了 ovs,并且做了 bond,即 bond0 和 bond1,每个 bond 下面 2 张网卡。在上了生产后,让我每天都要检查一下网卡是否正常,因为之前就有网卡 DOWN 了。而我呢,比较懒,不想手动去检查。想着通过 prometheus 最终展示到 grafana,我就在 grafana 上看看有没有处于异常的网卡就好了。其次呢,我最近刚好在学习 go,也想练练手;同时也问了一下研发同学,说很简单,叫我试试,遇到困难时也愿意帮助我。所以,我就打算试试了。

二、环境

组件 版本 备注
k8s v1.14
ovs v2.9.5
go 1.14.1

三、目标

目标就是要通过 prometheus 去拉取我的 ovs bond 的网卡状态指标,那么这里我需要编写一个 go 程序去获取我主机的 ovs bond 信息,并最终以 metrics 方式暴露供 prometheus 来拉取,在 grafana 上展示。示例如下:

  1. [root@test~]$ ovs-appctl bond/show |grep '^slave' |grep -v grep |awk '{print $2""$3}'
  2. a1-b1:enabled
  3. a2-b2:enabled
  4. a3-b3:enabled
  5. a4-b4:disabled
  6. curl http://$IP:$PORT/metrics
  7. ovs_bond_status{component="ovs"} 5
  8. ovs_bond_status{component="ovs",a1b1="enabled",a2b2="disabled",a3b3="enabled",a4b4="disabled"} 2

四、构想

  1. 由于要通过 prometheus 来抓取指标,所以 bond 信息肯定要以 metrics 格式进行暴露。metrics 格式可以参考 prometheus 官网。
  2. bond 有两个,每个下面有两张网卡,每张网卡的状态只有 enabled 和 disabled,因此用数字 0-4 来告诉用户有几张网卡 disabled 了,用数字 5 来表示命令执行有问题或没有 bond,需要人工介入。可以通过命令去获取 bond 信息,因此还是采取命令方式去获取。
  3. 要对执行命令获取的输出结果进行处理并放到 metrics 中去。注:metrics 的 label 名称只能由字母、数字和下划线组成(须匹配 [a-zA-Z_][a-zA-Z0-9_]*),不能包含【-】,因此需要先把网卡名中的【-】去掉。
  4. shell 命令返回的 bond 正确信息用 map 去接收,key 为网卡名,value 为网卡状态
  5. 可以参考client_golang/prometheus

五、实践

先执行 shell 命令去获取 bond 信息


[root@test~]$ ovs-appctl  bond/show |grep '^slave' |grep -v grep  |awk '{print $2""$3}'
a1-b1:enabled
a2-b2:enabled
a3-b3:enabled
a4-b4:disabled

要针对 shell 的输出结果进行处理

# 执行shell命令,并对输出进行处理,记录相关日志


// getBondStatus runs `ovs-appctl bond/show` through bash and returns the state
// of every bond slave it reports, keyed by slave name with all "-" characters
// removed (the keys are used as Prometheus label names, which cannot contain
// "-").
//
// On failure the returned map contains a single "msg" entry instead of slave
// data: "failure" when the command itself errors, "return null" when it
// produces no usable "<slave>:<state>" line. Callers detect failure by
// checking for the "msg" key.
func getBondStatus() (m map[string]string) {
    result, err := exec.Command("bash", "-c", "ovs-appctl bond/show | grep '^slave' | grep -v grep | awk '{print $2\"\"$3}'").Output()
    if err != nil {
        log.Error("result: ", string(result))
        log.Error("command failed: ", err.Error())
        return map[string]string{"msg": "failure"}
    }

    // Trim before testing for emptiness so that whitespace-only output is
    // reported as "no data" rather than parsed as a single empty line.
    ret := strings.TrimSpace(string(result))
    if ret == "" {
        log.Error("command exec failed, result is null")
        return map[string]string{"msg": "return null"}
    }

    lines := strings.Split(ret, "\n")
    m = make(map[string]string, len(lines))
    for _, line := range lines {
        // Each line is expected to look like "<slave>:<state>". Skip anything
        // else instead of indexing past the end of the split result.
        fields := strings.Split(line, ":")
        if len(fields) < 2 {
            log.Error("unexpected bond line, skipped: ", line)
            continue
        }
        m[strings.ReplaceAll(fields[0], "-", "")] = fields[1]
    }

    if len(m) == 0 {
        // Nothing parseable: report it the same way as empty output so the
        // caller does not mistake it for "zero slaves disabled".
        log.Error("command exec failed, result is null")
        return map[string]string{"msg": "return null"}
    }
    return m
}

定义 metrics 指标


// ovsCollector is a custom Prometheus collector that exposes the OVS bond
// status through a single metric descriptor.
type ovsCollector struct {

    // ovsMetric is the descriptor of the ovs_bond_status metric; it is built
    // by newOvsCollector and sent by Describe.
    ovsMetric *prometheus.Desc
}

// Describe sends the collector's only descriptor, ovsMetric, on ch.
func (c *ovsCollector) Describe(ch chan<- *prometheus.Desc) {
    ch <- c.ovsMetric
}


// vLable holds the variable label names of ovs_bond_status: one entry per bond
// slave, filled in by newOvsCollector and passed to prometheus.NewDesc.
var vLable = []string{}

// vValue is a package-level buffer for the label values (slave states) of
// ovs_bond_status.
var vValue = []string{}

// constLabel is the constant label set attached to ovs_bond_status.
var constLabel = prometheus.Labels{"component": "ovs"}


// newOvsCollector queries the bond state once and returns a collector whose
// ovs_bond_status descriptor has one variable label per slave found, plus the
// constant labels in constLabel.
//
// If the query fails (getBondStatus returns a "msg" entry) the failure is
// logged and the descriptor is created without variable labels.
func newOvsCollector() *ovsCollector {
    rm := getBondStatus()

    labels := make([]string, 0, len(rm))
    if msg, failed := rm["msg"]; failed {
        log.Error("command execute failed:", msg)
    } else {
        for name := range rm {
            labels = append(labels, name)
        }
    }

    // Replace the package-level slice instead of appending to it: appending
    // would leave duplicate label names behind if this constructor ever ran
    // twice, and duplicate names make the descriptor invalid.
    vLable = labels

    return &ovsCollector{
        ovsMetric: prometheus.NewDesc("ovs_bond_status",
            "Show ovs bond status", vLable,
            constLabel),
    }
}

指标对应值


// Collect queries the bond state and sends one ovs_bond_status sample on ch.
//
// The sample value is the number of slaves whose state is "disabled", or 5
// when the state could not be obtained. Label values are emitted in the order
// of vLable (the descriptor's variable label names); a slave that is missing
// from the current result, or every slave when the query failed, is reported
// as "unknown".
func (c *ovsCollector) Collect(ch chan<- prometheus.Metric) {
    rm := getBondStatus()

    // Use a per-call slice: scrapes may run concurrently, so a shared
    // package-level buffer would be a data race. Its length always equals the
    // number of variable labels, which MustNewConstMetric requires (it panics
    // on a mismatch).
    values := make([]string, len(vLable))

    if _, failed := rm["msg"]; failed {
        log.Error("command exec failed")
        for i := range values {
            values[i] = "unknown"
        }
        ch <- prometheus.MustNewConstMetric(c.ovsMetric, prometheus.GaugeValue, 5, values...)
        return
    }

    // Look states up by label name instead of ranging over the map: map
    // iteration order is random and would pair states with the wrong labels.
    for i, name := range vLable {
        state, ok := rm[name]
        if !ok {
            state = "unknown"
        }
        values[i] = state
    }

    var disabled float64
    for _, state := range rm {
        if state == "disabled" {
            disabled++
        }
    }

    // The count can go down as well as up, so it is exported as a gauge.
    ch <- prometheus.MustNewConstMetric(c.ovsMetric, prometheus.GaugeValue, disabled, values...)
}

程序入口

func main() {
    ovs := newOvsCollector()
    prometheus.MustRegister(ovs)

    http.Handle("/metrics", promhttp.Handler())

    log.Info("begin to server on port 8080")

    log.Fatal(http.ListenAndServe(":8080", nil))
}

完整代码

package main

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
    log "github.com/sirupsen/logrus"
    "net/http"
    "os/exec"
    "strings"
)


// ovsCollector is a custom Prometheus collector that exposes the OVS bond
// status through a single metric descriptor.
type ovsCollector struct {
    // ovsMetric is the descriptor of the ovs_bond_status metric; it is built
    // by newOvsCollector and sent by Describe.
    ovsMetric *prometheus.Desc
}

// Describe sends the collector's only descriptor, ovsMetric, on ch.
func (c *ovsCollector) Describe(ch chan<- *prometheus.Desc) {
    ch <- c.ovsMetric
}

// vLable holds the variable label names of ovs_bond_status: one entry per bond
// slave, filled in by newOvsCollector and passed to prometheus.NewDesc.
var vLable = []string{}
// vValue is a package-level buffer for the label values (slave states) of
// ovs_bond_status.
var vValue = []string{}
// constLabel is the constant label set attached to ovs_bond_status.
var constLabel = prometheus.Labels{"component": "ovs"}


// Collect queries the bond state and sends one ovs_bond_status sample on ch.
//
// The sample value is the number of slaves whose state is "disabled", or 5
// when the state could not be obtained. Label values are emitted in the order
// of vLable (the descriptor's variable label names); a slave that is missing
// from the current result, or every slave when the query failed, is reported
// as "unknown".
func (c *ovsCollector) Collect(ch chan<- prometheus.Metric) {
    rm := getBondStatus()

    // Use a per-call slice: scrapes may run concurrently, so a shared
    // package-level buffer would be a data race. Its length always equals the
    // number of variable labels, which MustNewConstMetric requires (it panics
    // on a mismatch).
    values := make([]string, len(vLable))

    if _, failed := rm["msg"]; failed {
        log.Error("command exec failed")
        for i := range values {
            values[i] = "unknown"
        }
        ch <- prometheus.MustNewConstMetric(c.ovsMetric, prometheus.GaugeValue, 5, values...)
        return
    }

    // Look states up by label name instead of ranging over the map: map
    // iteration order is random and would pair states with the wrong labels.
    for i, name := range vLable {
        state, ok := rm[name]
        if !ok {
            state = "unknown"
        }
        values[i] = state
    }

    var disabled float64
    for _, state := range rm {
        if state == "disabled" {
            disabled++
        }
    }

    // The count can go down as well as up, so it is exported as a gauge.
    ch <- prometheus.MustNewConstMetric(c.ovsMetric, prometheus.GaugeValue, disabled, values...)
}


// newOvsCollector queries the bond state once and returns a collector whose
// ovs_bond_status descriptor has one variable label per slave found, plus the
// constant labels in constLabel.
//
// If the query fails (getBondStatus returns a "msg" entry) the failure is
// logged and the descriptor is created without variable labels.
func newOvsCollector() *ovsCollector {
    rm := getBondStatus()

    labels := make([]string, 0, len(rm))
    if msg, failed := rm["msg"]; failed {
        log.Error("command execute failed:", msg)
    } else {
        for name := range rm {
            labels = append(labels, name)
        }
    }

    // Replace the package-level slice instead of appending to it: appending
    // would leave duplicate label names behind if this constructor ever ran
    // twice, and duplicate names make the descriptor invalid.
    vLable = labels

    return &ovsCollector{
        ovsMetric: prometheus.NewDesc("ovs_bond_status",
            "Show ovs bond status", vLable,
            constLabel),
    }
}

// getBondStatus runs `ovs-appctl bond/show` through bash and returns the state
// of every bond slave it reports, keyed by slave name with all "-" characters
// removed (the keys are used as Prometheus label names, which cannot contain
// "-").
//
// On failure the returned map contains a single "msg" entry instead of slave
// data: "failure" when the command itself errors, "return null" when it
// produces no usable "<slave>:<state>" line. Callers detect failure by
// checking for the "msg" key.
func getBondStatus() (m map[string]string) {
    result, err := exec.Command("bash", "-c", "ovs-appctl bond/show | grep '^slave' | grep -v grep | awk '{print $2\"\"$3}'").Output()
    if err != nil {
        log.Error("result: ", string(result))
        log.Error("command failed: ", err.Error())
        return map[string]string{"msg": "failure"}
    }

    // Trim before testing for emptiness so that whitespace-only output is
    // reported as "no data" rather than parsed as a single empty line.
    ret := strings.TrimSpace(string(result))
    if ret == "" {
        log.Error("command exec failed, result is null")
        return map[string]string{"msg": "return null"}
    }

    lines := strings.Split(ret, "\n")
    m = make(map[string]string, len(lines))
    for _, line := range lines {
        // Each line is expected to look like "<slave>:<state>". Skip anything
        // else instead of indexing past the end of the split result.
        fields := strings.Split(line, ":")
        if len(fields) < 2 {
            log.Error("unexpected bond line, skipped: ", line)
            continue
        }
        m[strings.ReplaceAll(fields[0], "-", "")] = fields[1]
    }

    if len(m) == 0 {
        // Nothing parseable: report it the same way as empty output so the
        // caller does not mistake it for "zero slaves disabled".
        log.Error("command exec failed, result is null")
        return map[string]string{"msg": "return null"}
    }
    return m
}

func main() {
    ovs := newOvsCollector()
    prometheus.MustRegister(ovs)

    http.Handle("/metrics", promhttp.Handler())

    log.Info("begin to server on port 8080")

    log.Fatal(http.ListenAndServe(":8080", nil))
}

六、部署

因为最终要部署到 k8s 环境中,
先构建镜像,参考如下 Dockerfile

# Build stage: compile main.go into the ovs_check binary with Go 1.14.1.
FROM golang:1.14.1 AS builder
WORKDIR /go/src
# Copy the whole build context into the working directory.
# NOTE(review): the program imports third-party packages; the context presumably
# carries a go.mod or vendor directory so the build can resolve them - confirm.
COPY ./ .
RUN go build -o ovs_check main.go

# Runtime stage: only the compiled binary is copied over from the build stage.
# NOTE(review): the exporter invokes bash and ovs-appctl at runtime, so this
# image must provide them or have them mounted in - confirm.
FROM centos:7.7
COPY --from=builder /go/src/ovs_check /xiyangxixia/ovs_check
ENTRYPOINT ["/xiyangxixia/ovs_check"]

我这里部署使用的 yaml 如下所示:

---
# DaemonSet that runs the ovs_bond exporter image as the ovs-agent pod in the
# kube-system namespace.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: ovs-agent
  namespace: kube-system
spec:
  minReadySeconds: 5
  selector:
    matchLabels:
      name: ovs-agent
  template:
    metadata:
      # Scrape hints: port 8080 and path /metrics match what the exporter serves.
      # NOTE(review): presumably consumed by a Prometheus scrape config that
      # honours prometheus.io/* annotations - confirm.
      annotations:

        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
      labels:
        name: ovs-agent
    spec:
      containers:
      - name: ovs-agent
        image: ovs_bond:v1
        imagePullPolicy: IfNotPresent
        # Requests equal limits for both CPU and memory.
        resources:
            limits:
              cpu: 100m
              memory: 200Mi
            requests:
              cpu: 100m
              memory: 200Mi
        securityContext:
          privileged: true
          procMount: Default
        volumeMounts:
        - mountPath: /lib/modules
          name: lib-modules
          readOnly: true
        # NOTE(review): presumably where ovs-appctl finds the OVS control
        # socket of the host - confirm.
        - mountPath: /var/run/openvswitch
          name: ovs-run
        # subPath mounts only the host's ovs-appctl binary out of /usr/bin/.
        - mountPath: /usr/bin/ovs-appctl
          name: ovs-bin
          subPath: ovs-appctl
      serviceAccountName: xiyangxixia
      hostPID: true
      hostIPC: true
      # Host directories backing the volume mounts above.
      volumes:
      - hostPath:
          path: /lib/modules
          type: ""
        name: lib-modules
      - hostPath:
          path: /var/run/openvswitch
          type: ""
        name: ovs-run
      - hostPath:
          path: /usr/bin/
          type: ""
        name: ovs-bin
  updateStrategy:
    type: RollingUpdate

七、测试

[root@test ~]$ kubectl get po -n kube-system -o wide  |grep ovs
ovs-agent-h8zc6    1/1     Running     0    2d14h   10.211.55.41   master-1   <none>           <none>
[root@test ~]$ curl 10.211.55.41:8080/metrics |grep ovs_bond


ovs_bond_status{component="ovs",a1b1="enabled",a2b2="enabled",a3b3="enabled",a4b4="enabled"} 0

八、总结

以上就是这篇文章的所有了,原谅我学艺不精只能粗糙的介绍一下。感谢一直以来关注公众号的朋友们!
原文链接:
https://blog.csdn.net/wanger5354/article/details/118725852?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1.pc_relevant_paycolumn_v3&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1.pc_relevant_paycolumn_v3