1 Grafana
Grafana 官网地址: https://grafana.com/
Grafana Dashboards 地址: https://grafana.com/grafana/dashboards
Grafana Plugins 地址: https://grafana.com/grafana/plugins
Docker Hub 地址: https://registry.hub.docker.com/r/grafana/grafana
# Fetch the Grafana image
docker pull grafana/grafana:latest
# Start Grafana in the background; the web UI is published on host port 3000
docker run --detach \
  --name grafana \
  --restart always \
  --publish 3000:3000 \
  grafana/grafana:latest
# 验证效果
打开网址 http://localhost:3000/
默认账号密码 admin/admin
2 Prometheus
Prometheus 官网地址: https://prometheus.io/
Prometheus 文档地址: https://prometheus.io/docs/introduction/overview/
Docker Hub 地址: https://registry.hub.docker.com/r/prom/prometheus
# Pull the image
docker pull prom/prometheus:latest
# Start the container. The two commands below are ALTERNATIVES: they use the
# same container name and the same host port, so run only one of them.
# Option A: mount the configuration file from /tmp
docker run -d \
  -p 9090:9090 \
  -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml \
  --name=prometheus \
  --restart=always \
  prom/prometheus:latest
# Option B: mount the configuration file from ./prom in the current directory
# (the mount argument is quoted so a working directory with spaces still works)
docker run -d \
  -p 9090:9090 \
  -v "$(pwd)/prom/prometheus.yml:/etc/prometheus/prometheus.yml" \
  --name=prometheus \
  --restart=always \
  prom/prometheus:latest
# 验证效果
打开网址 http://localhost:9090/
配置文件
配置文档: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
配置样例: https://github.com/prometheus/prometheus/blob/release-2.21/config/testdata/conf.good.yml
# Defaults shared by every scrape job below.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# One job per monitored component; each job overrides the `instance` label
# with a fixed, human-readable name.
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['192.168.100.15:9090']
        labels:
          instance: prometheus

  - job_name: 'node'
    static_configs:
      - targets: ['192.168.100.15:9100']
        labels:
          instance: node

  - job_name: 'dcgm'
    static_configs:
      - targets: ['192.168.100.15:9400']
        labels:
          instance: dcgm

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['192.168.100.15:8085']
        labels:
          instance: cadvisor

  - job_name: 'spring'
    # Spring Boot Actuator serves Prometheus-format metrics at
    # /actuator/prometheus (there is no /metrics sub-path under it).
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['192.168.100.7:8080']
        labels:
          instance: spring
3 PushGateway
Docker Hub 地址: https://hub.docker.com/r/prom/pushgateway
GitHub地址: https://github.com/prometheus/pushgateway
# Fetch the Pushgateway image
docker pull prom/pushgateway:latest
# Start Pushgateway in the background, published on host port 9091
docker run --detach \
  --name pushgateway \
  --restart always \
  --publish 9091:9091 \
  prom/pushgateway
# 验证效果
http://localhost:9091/metrics/job/<JOB_NAME>{/<LABEL_NAME>/<LABEL_VALUE>}
4 Exporter
4.1 cAdvisor 容器监控
Docker Hub 地址: https://registry.hub.docker.com/r/google/cadvisor
GitHub地址: https://github.com/google/cadvisor
Grafana Dashboard:
https://grafana.com/grafana/dashboards/893
https://grafana.com/grafana/dashboards/315
# Pull the image
docker pull google/cadvisor:latest
# Start the container in the background.
# Container port 8080 is published on host port 8085.
docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=8085:8080 \
  --detach=true \
  --name=cadvisor \
  --privileged \
  --device=/dev/kmsg \
  google/cadvisor:latest
# Verify: query the published host port (8085); the endpoint is /metrics
curl localhost:8085/metrics
4.2 node 主机监控(CPU/MEM/DISK/NET…)
Docker Hub 地址: https://registry.hub.docker.com/r/prom/node-exporter
GitHub地址: https://github.com/prometheus/node_exporter
Grafana Dashboard:
https://grafana.com/grafana/dashboards/1860
https://grafana.com/grafana/dashboards/11074
# Pull the image
docker pull prom/node-exporter:latest
# Start the container. It shares the host network and PID namespaces and sees
# the host root filesystem read-only under /host.
docker run -d \
  --net="host" \
  --pid="host" \
  -v "/:/host:ro,rslave" \
  --name=node \
  prom/node-exporter \
  --path.rootfs=/host
# Verify: the exporter serves its metrics at /metrics on port 9100
curl localhost:9100/metrics
4.3 dcgm 显卡监控(GPU)
Docker Hub 地址: https://registry.hub.docker.com/r/nvidia/dcgm-exporter
GitHub地址: https://github.com/NVIDIA/gpu-monitoring-tools
Grafana Dashboard: https://grafana.com/grafana/dashboards/12239
# Fetch the DCGM exporter image
docker pull nvidia/dcgm-exporter:latest
# Start the exporter in the background with access to all GPUs,
# published on host port 9400
docker run --detach \
  --name dcgm \
  --gpus all \
  --publish 9400:9400 \
  nvidia/dcgm-exporter:latest
# Verify that metrics are being served
curl localhost:9400/metrics
4.4 Spring Boot
Grafana Dashboard: https://grafana.com/grafana/dashboards/4701
5 Kubernetes部署
prometheus-cfg.yaml
# ConfigMap holding the Prometheus configuration file (key: prometheus.yml).
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: monitor-sa
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 1m
    scrape_configs:
      # Nodes: rewrite the discovered <host>:10250 address to <host>:9100.
      - job_name: 'kubernetes-node'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
            action: replace
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      # cAdvisor: scraped through the API server proxy path of each node.
      - job_name: 'kubernetes-node-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      # API server: keep only the default/kubernetes service, https port.
      - job_name: 'kubernetes-apiserver'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
      # Service endpoints: keep only services annotated prometheus.io/scrape=true;
      # scheme, path and port can be overridden through prometheus.io/* annotations.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
prometheus-deployment.yaml
# Deployment that runs a single Prometheus server pod.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server  # Deployment (controller) name
  namespace: monitor-sa  # namespace
  labels:
    app: prometheus  # label
spec:
  replicas: 1  # number of replicas
  selector:
    matchLabels:
      app: prometheus  # pod label
      component: server
    # matchExpressions:
    # - {key: app, operator: In, values: [prometheus]}
    # - {key: component, operator: In, values: [server]}
  template:  # pod template
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'  # whether this pod should be scraped
    spec:
      nodeName: k8s-node  # pin the pod to this node
      serviceAccountName: monitor  # service account used by the pod
      containers:
        - name: prometheus
          image: prom/prometheus:v2.2.1
          # Pull policy: use the local image if present, otherwise pull it
          # from the registry.
          imagePullPolicy: IfNotPresent
          command:
            - prometheus
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention=720h
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: /etc/prometheus/prometheus.yml
              name: prometheus-config
              subPath: prometheus.yml
            - mountPath: /prometheus/
              name: prometheus-storage-volume
      # Volumes mounted into the container paths listed under volumeMounts.
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
            items:
              - key: prometheus.yml
                path: prometheus.yml
                # File mode is an integer field; keep it as an unquoted octal.
                mode: 0644
        - name: prometheus-storage-volume
          hostPath:
            path: /data
            type: Directory
prometheus-svc.yaml
# NodePort Service exposing pods labelled app=prometheus, component=server
# on node port 30000.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
    - port: 9090
      targetPort: 9090
      nodePort: 30000
      protocol: TCP
  selector:
    app: prometheus
    component: server
# Apply all three manifests: ConfigMap, Deployment and Service.
# The manifests reference the monitor-sa namespace and the monitor
# ServiceAccount; neither is created by these commands.
kubectl apply -f prometheus-cfg.yaml
kubectl apply -f prometheus-deployment.yaml
kubectl apply -f prometheus-svc.yaml
# Check that the Prometheus pod has started
kubectl get pod -n monitor-sa
https://blog.csdn.net/liuchao666888/article/details/107636647