SkyWalking Kubernetes
https://github.com/apache/skywalking-kubernetes
SkyWalking Cloud on Kubernetes
https://github.com/apache/skywalking-swck
Helm方式部署
# Install SkyWalking (OAP + UI, Elasticsearch storage) from the Apache Helm repository.
export SKYWALKING_RELEASE_NAME=skywalking # change the release name according to your scenario
export SKYWALKING_RELEASE_NAMESPACE=default # change the namespace to where you want to install SkyWalking
export REPO=skywalking
# Every expansion is quoted so that a customised name containing spaces or glob
# characters is passed to helm as a single argument.
helm repo add "${REPO}" https://apache.jfrog.io/artifactory/skywalking-helm
helm install "${SKYWALKING_RELEASE_NAME}" "${REPO}/skywalking" -n "${SKYWALKING_RELEASE_NAMESPACE}" \
  --set oap.image.tag=9.1.0 \
  --set oap.storageType=elasticsearch \
  --set ui.image.tag=9.1.0 \
  --set elasticsearch.imageTag=6.8.6
[root@UR-20210425NAMA ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
elasticsearch-master-0 1/1 Running 0 7d19h
elasticsearch-master-1 1/1 Running 0 7d19h
elasticsearch-master-2 1/1 Running 0 7d19h
skywalking-es-init-ftvxn 0/1 Completed 0 7d19h
skywalking-oap-84464fc4cc-8hrb6 1/1 Running 6 7d19h
skywalking-oap-84464fc4cc-rxm4h 1/1 Running 6 7d19h
skywalking-ui-549dc5989f-4d6bd 1/1 Running 0 7d19h
[root@UR-20210425NAMA ~]# kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
elasticsearch-master ClusterIP 10.43.6.199 <none> 9200/TCP,9300/TCP 7d19h
elasticsearch-master-headless ClusterIP None <none> 9200/TCP,9300/TCP 7d19h
kubernetes ClusterIP 10.43.0.1 <none> 443/TCP 63d
skywalking-oap NodePort 10.43.218.36 <none> 11800:30196/TCP,12800:31981/TCP 7d19h
skywalking-ui NodePort 10.43.236.32 <none> 80:31286/TCP 7d19h
[root@UR-20210425NAMA ~]#
yaml文件
# NodePort Service that exposes the SkyWalking UI: service port 80 forwards to
# container port 8080 and is published on node port 31286.
# NOTE(review): this Service lives in `default`, while the skywalking-ui
# Deployment and the Ingress in this file use `infrastructure-prod`. The
# clusterIP/nodePort match the `kubectl get svc` output of the Helm demo
# cluster, so it was presumably exported from there — confirm the namespace
# before applying it next to the other manifests.
kind: Service
apiVersion: v1
metadata:
  name: skywalking-ui
  namespace: default
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: ui
    heritage: Helm
    release: skywalking
  annotations:
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: default
spec:
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
      nodePort: 31286
  # Selects the UI pods by the same three labels the Deployment template sets.
  selector:
    app: skywalking
    component: ui
    release: skywalking
  # Cluster-assigned address captured from a live object; presumably has to be
  # dropped or changed when re-creating the Service in another cluster.
  clusterIP: 10.43.236.32
  type: NodePort
  sessionAffinity: None
  externalTrafficPolicy: Cluster
---
# Deployment running one replica of the SkyWalking UI (image tag 8.8.1).
# The UI reaches the OAP REST endpoint through the `skywalking-oap` Service
# on port 12800 (SW_OAP_ADDRESS).
kind: Deployment
apiVersion: apps/v1
metadata:
  name: skywalking-ui
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: ui
    heritage: Helm
    release: skywalking
  annotations:
    deployment.kubernetes.io/revision: '4'
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  replicas: 1
  selector:
    matchLabels:
      app: skywalking
      component: ui
      release: skywalking
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: skywalking
        component: ui
        release: skywalking
      annotations:
        kubesphere.io/restartedAt: '2022-07-01T12:55:58.718Z'
    spec:
      containers:
        - name: ui
          image: 'hw-harbor.ur.com.cn/paas/skywalking-ui:8.8.1'
          ports:
            - name: page
              containerPort: 8080
              protocol: TCP
          env:
            - name: SW_OAP_ADDRESS
              value: 'http://skywalking-oap:12800'
          # Requests equal limits for both CPU and memory.
          resources:
            limits:
              cpu: '1'
              memory: 512Mi
            requests:
              cpu: '1'
              memory: 512Mi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: Always
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
---
# ClusterIP Service in front of the SkyWalking OAP pods.
# Port 11800 (`grpc`) and port 12800 (`rest`) are forwarded unchanged.
kind: Service
apiVersion: v1
metadata:
  name: skywalking-oap
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: oap
    heritage: Helm
    release: skywalking
  annotations:
    # Snapshot written by `kubectl apply`; kept verbatim from the exported object.
    kubectl.kubernetes.io/last-applied-configuration: >
      {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{"meta.helm.sh/release-name":"skywalking","meta.helm.sh/release-namespace":"infrastructure-prod"},"labels":{"app":"skywalking","app.kubernetes.io/managed-by":"Helm","chart":"skywalking-4.2.0","component":"oap","heritage":"Helm","release":"skywalking"},"name":"skywalking-oap","namespace":"infrastructure-prod"},"spec":{"clusterIP":"172.30.192.136","ports":[{"name":"grpc","port":11800,"protocol":"TCP","targetPort":11800},{"name":"rest","port":12800,"protocol":"TCP","targetPort":12800}],"selector":{"app":"skywalking","component":"oap","release":"skywalking"},"sessionAffinity":"None","type":"ClusterIP"}}
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  ports:
    - name: grpc
      protocol: TCP
      port: 11800
      targetPort: 11800
    - name: rest
      protocol: TCP
      port: 12800
      targetPort: 12800
  selector:
    app: skywalking
    component: oap
    release: skywalking
  # Cluster-assigned address captured from a live object; presumably
  # cluster-specific — verify before reusing elsewhere.
  clusterIP: 172.30.192.136
  type: ClusterIP
  sessionAffinity: None
---
# Deployment running three SkyWalking OAP replicas (image tag 8.8.1) against an
# external Elasticsearch cluster. An init container blocks start-up until
# Elasticsearch answers on port 9200, and the alarm rules are mounted from the
# `alarm-settings` ConfigMap.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: skywalking-oap
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: oap
    heritage: Helm
    release: skywalking
  annotations:
    deployment.kubernetes.io/revision: '23'
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  replicas: 3
  selector:
    matchLabels:
      app: skywalking
      component: oap
      release: skywalking
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: skywalking
        component: oap
        release: skywalking
      annotations:
        kubesphere.io/restartedAt: '2022-08-02T01:09:33.250Z'
    spec:
      volumes:
        # Projects only the alarm-settings.yml key of the ConfigMap.
        - name: alarm-settings
          configMap:
            name: alarm-settings
            items:
              - key: alarm-settings.yml
                path: alarm-settings.yml
            defaultMode: 420
      initContainers:
        # Polls 172.25.2.8:9200 up to 60 times, 5 s apart; exits 0 on the first
        # successful TCP connect and 1 if none succeeds.
        - name: wait-for-elasticsearch
          image: 'busybox:1.30'
          command:
            - sh
            - '-c'
            - >-
              for i in $(seq 1 60); do nc -z -w3 172.25.2.8 9200 && exit 0 ||
              sleep 5; done; exit 1
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      containers:
        - name: oap
          image: 'hw-harbor.ur.com.cn/paas/skywalking-oap-server:8.8.1'
          ports:
            - name: grpc
              containerPort: 11800
              protocol: TCP
            - name: rest
              containerPort: 12800
              protocol: TCP
          env:
            - name: TZ
              value: Asia/Shanghai
            - name: JAVA_OPTS
              value: '-Dmode=no-init -Xmx2g -Xms2g'
            # Cluster coordination: OAP peers are looked up in this namespace
            # using the label selector below.
            - name: SW_CLUSTER
              value: kubernetes
            - name: SW_CLUSTER_K8S_NAMESPACE
              value: infrastructure-prod
            - name: SW_NAMESPACE
              value: sw_prod
            - name: SW_CLUSTER_K8S_LABEL
              value: 'app=skywalking,release=skywalking,component=oap'
            - name: SKYWALKING_COLLECTOR_UID
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.uid
            # Storage: external Elasticsearch nodes.
            - name: SW_STORAGE
              value: elasticsearch
            - name: SW_STORAGE_ES_CLUSTER_NODES
              value: '172.25.2.8:9200,172.25.2.6:9200,172.25.2.7:9200,'
            # `xxx` is a redacted placeholder. NOTE(review): real credentials
            # are better injected from a Secret than written inline.
            - name: SW_ES_USER
              value: xxx
            - name: SW_ES_PASSWORD
              value: xxx
            - name: SW_STORAGE_ES_RECORD_DATA_TTL
              value: '7'
            - name: SW_STORAGE_ES_BULK_ACTIONS
              value: '5000'
            - name: SW_SUPERDATASET_STORAGE_DAY_STEP
              value: '-1'
            - name: SW_STORAGE_ES_FLUSH_INTERVAL
              value: '60'
            - name: SW_STORAGE_ES_INDEX_SHARDS_NUMBER
              value: '3'
            - name: SW_STORAGE_ES_QUERY_MAX_SIZE
              value: '7000'
            # JSON string of extra index settings (refresh interval and
            # translog tuning); kept byte-for-byte, including the escaping.
            - name: SW_STORAGE_ES_ADVANCED
              value: >-
                "{\"index.refresh_interval\":\"30s\",\"index.translog.flush_threshold_size\":\"500mb\",\"index.translog.sync_interval\":\"60s\",\"index.translog.durability\":\"async\"}"
          # Requests equal limits for both CPU and memory.
          resources:
            limits:
              cpu: '2'
              memory: 4Gi
            requests:
              cpu: '2'
              memory: 4Gi
          volumeMounts:
            # subPath mounts the single file over the image's own copy.
            - name: alarm-settings
              readOnly: true
              mountPath: /skywalking/config/alarm-settings.yml
              subPath: alarm-settings.yml
          # Both probes are TCP checks against the REST port with the same timings.
          livenessProbe:
            tcpSocket:
              port: 12800
            initialDelaySeconds: 15
            timeoutSeconds: 1
            periodSeconds: 20
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            tcpSocket:
              port: 12800
            initialDelaySeconds: 15
            timeoutSeconds: 1
            periodSeconds: 20
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: Always
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      serviceAccountName: skywalking-oap
      serviceAccount: skywalking-oap
      securityContext: {}
      # Soft preference to spread OAP pods across different nodes.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: skywalking
                    component: oap
                    release: skywalking
                topologyKey: kubernetes.io/hostname
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
---
# Ingress routing host skywalking.ur.com.cn (path `/`) to the skywalking-ui
# Service on port 80, bound to a load balancer through the
# `kubernetes.io/elb.*` annotations.
# NOTE(review): `extensions/v1beta1` Ingress is no longer served from
# Kubernetes 1.22 on; moving to `networking.k8s.io/v1` also changes the backend
# syntax. The `property` field below is not part of the upstream Ingress schema
# and looks vendor-specific, so the manifest is kept on the original API
# version — confirm against the target cluster before migrating.
kind: Ingress
apiVersion: extensions/v1beta1
metadata:
  name: skywalking-ingress
  namespace: infrastructure-prod
  labels:
    isExternal: 'true'
    zone: data
  annotations:
    kubernetes.io/elb.id: d4b32321-1e87-478b-a821-b9e4c4a542ce
    kubernetes.io/elb.ip: 172.25.98.42
    kubernetes.io/elb.port: '80'
    kubernetes.io/ingress.class: cce
spec:
  rules:
    - host: skywalking.ur.com.cn
      http:
        paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              serviceName: skywalking-ui
              servicePort: 80
            property:
              ingress.beta.kubernetes.io/url-match-mode: STARTS_WITH
webhooks告警配置
# ConfigMap holding the SkyWalking OAP alarm rules (alarm-settings.yml) plus the
# WeChat webhook that receives triggered alarms. The skywalking-oap Deployment
# mounts this key over /skywalking/config/alarm-settings.yml.
#
# Changes against the pasted original:
#   * service_resp_time_rule: the message now states the window the rule
#     actually uses (count 5 / period 15) instead of "3 minutes of last 10".
#   * The WeChat webhook key is replaced by a placeholder; a live key must not
#     be kept in notes or version control. Substitute the real key at deploy time.
#   * The commented-out webHooks example URL was missing the colon after `http`.
kind: ConfigMap
apiVersion: v1
metadata:
  name: alarm-settings
  namespace: infrastructure-prod
  annotations:
    kubesphere.io/creator: linmingwang
data:
  alarm-settings.yml: |-
    rules:
      # Rule unique name, must be ended with `_rule`.
      service_resp_time_rule:
        metrics-name: service_resp_time
        op: ">"
        threshold: 1000
        period: 15
        count: 5
        silence-period: 15
        message: Response time of service {name} is more than 1000ms in 5 minutes of last 15 minutes.
      service_sla_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_sla
        op: "<"
        threshold: 8000
        # The length of time to evaluate the metrics
        period: 10
        # How many times after the metrics match the condition, will trigger alarm
        count: 2
        # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
        silence-period: 3
        message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
      service_resp_time_percentile_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_percentile
        op: ">"
        threshold: 1000,1000,1000,1000,1000
        period: 10
        count: 3
        silence-period: 5
        message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
      service_instance_resp_time_rule:
        metrics-name: service_instance_resp_time
        op: ">"
        threshold: 1000
        period: 10
        count: 2
        silence-period: 5
        message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
      database_access_resp_time_rule:
        metrics-name: database_access_resp_time
        threshold: 1000
        op: ">"
        period: 10
        count: 2
        message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
      endpoint_relation_resp_time_rule:
        metrics-name: endpoint_relation_resp_time
        threshold: 1000
        op: ">"
        period: 10
        count: 2
        message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
    wechatHooks:
      textTemplate: |-
        {
          "msgtype": "text",
          "text": {
            "content": "Skywalking(生产环境): \n %s."
          }
        }
      webhooks:
        - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<WECHAT_WEBHOOK_KEY>
    #webHooks:
    #  - http://172.0.0.1
配置 agent
通过 initContainer 将初始化容器镜像里的 agent 拷贝到目标容器里
使用 JAVA_OPTS 方式运行 skywalking-agent.jar 服务,skywalking 配置通过 pipeline 流水线变量传递
https://skywalking.apache.org/downloads/
# Deployment template for a Java service instrumented with the SkyWalking agent.
# `$NAME` / `${NAME}` tokens are placeholders; presumably the CI pipeline
# substitutes them before `kubectl apply` — TODO confirm the rendering tool.
#
# How the agent gets into the pod:
#   1. the `ur-paas-init` init container copies /agent/ into the shared
#      emptyDir volume `ur-share` (mounted at /share);
#   2. the application container mounts the same volume and is started through
#      /home/run.sh (from the `scripts` ConfigMap), which loads
#      /share/agent/skywalking/skywalking-agent.jar via -javaagent.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: kubesphere
    component: $APP_NAME
    tier: backend
  # Service name
  name: $APP_NAME
  # Project (namespace) name
  namespace: $NAMESPACE
spec:
  progressDeadlineSeconds: 600
  replicas: 2
  selector:
    matchLabels:
      app: kubesphere
      component: $APP_NAME
      tier: backend
  template:
    metadata:
      labels:
        app: kubesphere
        component: $APP_NAME
        tier: backend
    spec:
      # Hard rule: two pods of the same component never share a node.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: component
                    operator: In
                    values:
                      - $APP_NAME
              topologyKey: "kubernetes.io/hostname"
      initContainers:
        # Init container: copies the agent directory into the shared volume
        - name: ur-paas-init
          image: $HARBOR_HOST/paas/init-container:1.0.2
          imagePullPolicy: IfNotPresent
          command: ["cp", "-r", "/agent/", "/share/"]
          volumeMounts:
            - name: ur-share
              mountPath: /share
          resources:
            requests:
              cpu: 200m
              memory: 200Mi
            limits:
              cpu: 200m
              memory: 200Mi
      containers:
        - name: $APP_NAME
          image: $HARBOR_HOST/$HARBOR_NAMESPACE/$APP_NAME:$BRANCH_NAME-$BUILD_NUMBER
          command: ["sh","/home/run.sh"]
          imagePullPolicy: Always
          lifecycle:
            preStop:
              exec:
                command: ["sh","/home/preStop.sh"]
          env:
            - name: JAVA_OPTS
              value: $JAVA_OPTS
            - name: CACHE_IGNORE
              value: js|html
            - name: CACHE_PUBLIC_EXPIRATION
              value: 3d
            # Time zone
            - name: TZ
              value: Asia/Shanghai
            # SkyWalking agent settings
            - name: SW_AGENT_NAME
              value: ${APP_NAME}
            - name: SW_JDBC_TRACE_SQL_PARAMETERS
              value: 'true'
            - name: SW_PLUGIN_JDBC_SQL_PARAMETERS_MAX_LENGTH
              value: '512'
            # SkyWalking OAP backend address
            - name: SW_AGENT_COLLECTOR_BACKEND_SERVICES
              value: ${SW_BACKEND}
            # Nacos registry / configuration centre
            - name: NACOS_HOST
              value: ${NACOS_HOST}
            - name: NACOS_NAMESPACE
              value: ${NACOS_NAMESPACE}
            - name: NACOS_GROUP
              value: ${NACOS_GROUP}
            # Pod name and IP from the downward API; run.sh and preStop.sh
            # read them at runtime.
            - name: POD_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          livenessProbe:
            httpGet:
              path: $HEALTH_PATH
              port: $CONTAINER_PORT
            initialDelaySeconds: 60
            successThreshold: 1
            timeoutSeconds: 10
            failureThreshold: 10
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: $HEALTH_PATH
              port: $CONTAINER_PORT
            initialDelaySeconds: 30
            timeoutSeconds: 10
            failureThreshold: 30
            periodSeconds: 5
          ports:
            - containerPort: $CONTAINER_PORT
              protocol: TCP
          volumeMounts:
            - name: ur-share
              mountPath: /share
            - name: gc
              mountPath: /home/gc/
            - name: dump
              mountPath: /home/dump/
            - name: scripts
              mountPath: /home/run.sh
              subPath: run.sh
            - name: scripts
              mountPath: /home/preStop.sh
              subPath: preStop.sh
            - name: vol-log
              mountPath: /var/log
              # NOTE(review): `policy` is not an upstream volumeMount field; it
              # looks like a vendor log-collection extension. `format` and
              # `pathPattern` are placed under `annotations` on that
              # assumption — confirm against the platform documentation.
              policy:
                logs:
                  rotate: Hourly
                  annotations:
                    format: '{"multi":{"mode":"regular","value":"([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))"}}'
                    pathPattern: /info.log
          resources:
            limits:
              cpu: 2
              memory: 4Gi
            requests:
              cpu: 2
              memory: 4Gi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
      volumes:
        # Shared folder between the init container and the application container
        - name: ur-share
          emptyDir: {}
        - name: vol-log
          emptyDir: {}
        - name: gc
          persistentVolumeClaim:
            claimName: $APP_NAME
        - name: dump
          persistentVolumeClaim:
            claimName: scp-dump
        - name: scripts
          configMap:
            defaultMode: 0777
            name: $APP_NAME
      dnsPolicy: ClusterFirst
      imagePullSecrets:
        - name: $HARBOR_CREDENTIAL_ID
      restartPolicy: Always
      # NOTE(review): preStop.sh ends with `sleep 40s`, and the grace period
      # also covers the preStop hook, so the hook uses up the whole 40 s and
      # the JVM gets almost no time between the stop signal and the kill.
      # Consider a value larger than the preStop sleep if a graceful JVM
      # shutdown is wanted.
      terminationGracePeriodSeconds: 40
---
# ConfigMap providing the container entry point (run.sh) and the preStop hook
# (preStop.sh); the Deployment mounts both under /home/ via subPath.
#
# Templating: plain dollar-brace tokens (NACOS_HOST, APP_NAME, NACOS_GROUP,
# NACOS_NAMESPACE, CONTAINER_PORT) are filled in when the manifest is rendered.
# Where a variable must survive until the script runs inside the container
# (POD_ID, POD_IP, JAVA_OPTS) the dollar sign is followed by the `p`
# placeholder, which presumably renders to nothing and leaves a literal dollar
# sign behind — TODO confirm against the pipeline's rendering step.
#
# Changes against the pasted original:
#   * run.sh no longer round-trips the JVM flags through
#     /home/gc/java_tool_opts.sh. /home/gc is backed by a ReadWriteMany claim
#     that every replica mounts, so pods starting at the same time could read
#     back each other's flags (wrong GC-log / heap-dump file names).
#   * The command substitution assigned to JAVA_TOOL_OPTIONS is quoted and
#     exported separately, so shells that word-split `export VAR=$(...)` do
#     not truncate the value to its first flag.
#   * preStop.sh quotes the URL instead of backslash-escaping each `&`, which
#     also keeps the `?` from being treated as a glob character.
apiVersion: v1
kind: ConfigMap
metadata:
  name: $APP_NAME
  namespace: $NAMESPACE
data:
  run.sh: |
    #!/bin/bash
    # JVM flags picked up by every JVM started below: SkyWalking agent,
    # container-aware heap sizing, CMS collector, GC logging, heap dump on OOM.
    # GC log and heap dump names carry the pod name and the start time.
    JAVA_TOOL_OPTIONS="$(echo -javaagent:/share/agent/skywalking/skywalking-agent.jar \
      -XX:+UseContainerSupport \
      -XX:MaxRAMPercentage=75.0 \
      -XX:MinRAMPercentage=75.0 \
      -XX:InitialRAMPercentage=75.0 \
      -XX:MetaspaceSize=256m \
      -XX:MaxMetaspaceSize=256m \
      -XX:+UseConcMarkSweepGC \
      -XX:+UseCMSCompactAtFullCollection \
      -XX:+CMSClassUnloadingEnabled \
      -XX:CMSInitiatingOccupancyFraction=80 \
      -XX:+UseCMSInitiatingOccupancyOnly \
      -XX:+ExplicitGCInvokesConcurrentAndUnloadsClasses \
      -Dsun.rmi.dgc.server.gcInterval=2592000000 \
      -Dsun.rmi.dgc.client.gcInterval=2592000000 \
      -XX:+UseParNewGC \
      -XX:ParallelGCThreads=4 \
      -XX:SurvivorRatio=8 \
      -XX:+PrintGCDetails \
      -XX:+PrintGCDateStamps \
      -XX:+PrintTenuringDistribution \
      -XX:+PrintHeapAtGC \
      -XX:+PrintReferenceGC \
      -XX:+PrintGCApplicationStoppedTime \
      -XX:+UseGCLogFileRotation \
      -XX:NumberOfGCLogFiles=10 \
      -XX:GCLogFileSize=1000m \
      -Xloggc:/home/gc/gc-$${p}POD_ID-$(date +"%Y-%m-%d-%H-%M-%S").log \
      -XX:+HeapDumpOnOutOfMemoryError \
      -XX:HeapDumpPath=/home/dump/dump-$${p}POD_ID-$(date +"%Y-%m-%d-%H-%M-%S").hprof)"
    export JAVA_TOOL_OPTIONS
    # JAVA_OPTS is left unquoted on purpose so it splits into separate flags.
    # NOTE(review): java runs as a child of this shell, so it may not receive
    # the container's stop signal directly — verify if graceful shutdown matters.
    java -jar ${${p}JAVA_OPTS} -Djava.awt.headless=true -Dsun.net.client.defaultConnectTimeout=10000 -Dsun.net.client.defaultReadTimeout=30000 -Dfile.encoding=utf-8 -Djava.security.egd=file:/dev/./urandom *.jar
  preStop.sh: |
    #!/bin/bash
    # Mark this instance as disabled in Nacos, then wait before the container
    # is stopped (presumably so in-flight traffic can drain).
    curl -XPUT "http://${NACOS_HOST}/nacos/v1/ns/instance?serviceName=${APP_NAME}&groupName=${NACOS_GROUP}&namespaceId=${NACOS_NAMESPACE}&ip=$${p}POD_IP&port=${CONTAINER_PORT}&enable=false"
    sleep 40s
---
# 2Gi ReadWriteMany claim named after the application; the Deployment mounts it
# at /home/gc/ (GC logs). Being ReadWriteMany, it is shared by all replicas.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: $APP_NAME
  namespace: $NAMESPACE
spec:
  accessModes:
    - ReadWriteMany
  volumeMode: Filesystem
  resources:
    requests:
      storage: 2Gi
  storageClassName: sfsturbo-ur-scp
---
# Autoscaler for the application Deployment: between 2 and 6 replicas with a
# 60% CPU utilisation target.
# NOTE(review): the `extendedhpa.*` annotations are not upstream Kubernetes
# fields; presumably a platform HPA extension reads them (55–65% CPU band,
# 60m downscale window, 0m upscale window) — confirm.
apiVersion: autoscaling/v1
kind: HorizontalPodAutoscaler
metadata:
  annotations:
    extendedhpa.metrics: '[{"type":"Resource","name":"cpu","targetType":"Utilization","targetRange":{"low":"55","high":"65"}}]'
    extendedhpa.option: '{"downscaleWindow":"60m","upscaleWindow":"0m"}'
  name: $APP_NAME
  namespace: $NAMESPACE
spec:
  minReplicas: 2
  maxReplicas: 6
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: $APP_NAME
  targetCPUUtilizationPercentage: 60
如何使用java探针注入器
https://skywalking.apache.org/zh/2022-04-19-how-to-use-the-java-agent-injector
https://github.com/apache/skywalking-swck
SWCK是部署在 Kubernetes 环境中,为 Skywalking 用户提供服务的平台,用户可以基于该平台使用、升级和维护 SkyWalking 相关组件。
实际上,SWCK 是基于 kubebuilder 开发的Operator,为用户提供自定义资源( CR )以及管理资源的控制器( Controller ),所有的自定义资源定义(CRD)如下所示:
- JavaAgent
- OAP
- UI
- Storage
- Satellite
- Fetcher
java 探针注入器具有以下特点:
- 透明性。用户应用一般运行在普通容器中,而 java 探针则运行在初始化容器中,且两者都属于同一个 pod。该 pod 中的每个容器都会挂载一个共享内存卷,为 java 探针提供存储路径。在 pod 启动时,初始化容器中的 java 探针会先于应用容器运行,由注入器将其中的探针文件存放在共享内存卷中。在应用容器启动时,注入器通过设置 JVM 参数将探针文件注入到应用程序中。用户可以通过这种方式实现 java 探针的注入,而无需重新构建包含 java 探针的容器镜像。
- 可配置性。注入器提供两种方式配置 java 探针:全局配置和自定义配置。默认的全局配置存放在 configmap 中,用户可以根据需求修改全局配置,比如修改 backend_service 的地址。此外,用户也能通过 annotation 为特定应用设置自定义的一些配置,比如不同服务的 service_name 名称。详情可见 java探针说明书。
- 可观察性。每个 java 探针在被注入时,用户可以查看名为 JavaAgent 的 CRD 资源,用于观测注入后的 java 探针配置。详情可见 JavaAgent说明。