SkyWalking Kubernetes
https://github.com/apache/skywalking-kubernetes
SkyWalking Cloud on Kubernetes
https://github.com/apache/skywalking-swck
Helm方式部署
# Install SkyWalking from the official Helm chart repository.
# Each statement sits on its own line: when they are fused, the second export
# ends up inside the first line's comment and the helm commands become
# arguments to `export`.
export SKYWALKING_RELEASE_NAME=skywalking  # change the release name according to your scenario
export SKYWALKING_RELEASE_NAMESPACE=default  # change the namespace to where you want to install SkyWalking
export REPO=skywalking

# Register the chart repository under the alias stored in REPO.
helm repo add "${REPO}" https://apache.jfrog.io/artifactory/skywalking-helm

# Install the chart, pinning the OAP/UI image tags and selecting Elasticsearch storage.
helm install "${SKYWALKING_RELEASE_NAME}" "${REPO}/skywalking" -n "${SKYWALKING_RELEASE_NAMESPACE}" \
  --set oap.image.tag=9.1.0 \
  --set oap.storageType=elasticsearch \
  --set ui.image.tag=9.1.0 \
  --set elasticsearch.imageTag=6.8.6

[root@UR-20210425NAMA ~]# kubectl get pod
NAME                              READY   STATUS      RESTARTS   AGE
elasticsearch-master-0            1/1     Running     0          7d19h
elasticsearch-master-1            1/1     Running     0          7d19h
elasticsearch-master-2            1/1     Running     0          7d19h
skywalking-es-init-ftvxn          0/1     Completed   0          7d19h
skywalking-oap-84464fc4cc-8hrb6   1/1     Running     6          7d19h
skywalking-oap-84464fc4cc-rxm4h   1/1     Running     6          7d19h
skywalking-ui-549dc5989f-4d6bd    1/1     Running     0          7d19h
[root@UR-20210425NAMA ~]# kubectl get svc
NAME                            TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)                           AGE
elasticsearch-master            ClusterIP   10.43.6.199    <none>        9200/TCP,9300/TCP                 7d19h
elasticsearch-master-headless   ClusterIP   None           <none>        9200/TCP,9300/TCP                 7d19h
kubernetes                      ClusterIP   10.43.0.1      <none>        443/TCP                           63d
skywalking-oap                  NodePort    10.43.218.36   <none>        11800:30196/TCP,12800:31981/TCP   7d19h
skywalking-ui                   NodePort    10.43.236.32   <none>        80:31286/TCP                      7d19h
[root@UR-20210425NAMA ~]#
yaml文件
# Service for the SkyWalking UI: NodePort 31286 -> service port 80 -> container port 8080.
kind: Service
apiVersion: v1
metadata:
  name: skywalking-ui
  namespace: default
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: ui
    heritage: Helm
    release: skywalking
  annotations:
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: default
spec:
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8080
      nodePort: 31286
  # Selects the UI pods of the "skywalking" release.
  selector:
    app: skywalking
    component: ui
    release: skywalking
  clusterIP: 10.43.236.32
  type: NodePort
  sessionAffinity: None
  externalTrafficPolicy: Cluster
# Deployment for the SkyWalking UI (single replica) in the infrastructure-prod namespace.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: skywalking-ui
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: ui
    heritage: Helm
    release: skywalking
  annotations:
    deployment.kubernetes.io/revision: '4'
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  replicas: 1
  selector:
    matchLabels:
      app: skywalking
      component: ui
      release: skywalking
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: skywalking
        component: ui
        release: skywalking
      annotations:
        kubesphere.io/restartedAt: '2022-07-01T12:55:58.718Z'
    spec:
      containers:
        - name: ui
          image: 'hw-harbor.ur.com.cn/paas/skywalking-ui:8.8.1'
          ports:
            - name: page
              containerPort: 8080
              protocol: TCP
          env:
            # The UI reaches the OAP backend through the skywalking-oap Service on port 12800.
            - name: SW_OAP_ADDRESS
              value: 'http://skywalking-oap:12800'
          resources:
            limits:
              cpu: '1'
              memory: 512Mi
            requests:
              cpu: '1'
              memory: 512Mi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: Always
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
# ClusterIP Service for the SkyWalking OAP backend: port 11800 (grpc) and port 12800 (rest).
kind: Service
apiVersion: v1
metadata:
  name: skywalking-oap
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: oap
    heritage: Helm
    release: skywalking
  annotations:
    # Snapshot written by `kubectl apply`; kept verbatim on a single line.
    kubectl.kubernetes.io/last-applied-configuration: >
      {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{"meta.helm.sh/release-name":"skywalking","meta.helm.sh/release-namespace":"infrastructure-prod"},"labels":{"app":"skywalking","app.kubernetes.io/managed-by":"Helm","chart":"skywalking-4.2.0","component":"oap","heritage":"Helm","release":"skywalking"},"name":"skywalking-oap","namespace":"infrastructure-prod"},"spec":{"clusterIP":"172.30.192.136","ports":[{"name":"grpc","port":11800,"protocol":"TCP","targetPort":11800},{"name":"rest","port":12800,"protocol":"TCP","targetPort":12800}],"selector":{"app":"skywalking","component":"oap","release":"skywalking"},"sessionAffinity":"None","type":"ClusterIP"}}
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  ports:
    - name: grpc
      protocol: TCP
      port: 11800
      targetPort: 11800
    - name: rest
      protocol: TCP
      port: 12800
      targetPort: 12800
  selector:
    app: skywalking
    component: oap
    release: skywalking
  clusterIP: 172.30.192.136
  type: ClusterIP
  sessionAffinity: None
# Deployment for the SkyWalking OAP backend: 3 replicas, Elasticsearch storage,
# alarm rules mounted from the alarm-settings ConfigMap.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: skywalking-oap
  namespace: infrastructure-prod
  labels:
    app: skywalking
    app.kubernetes.io/managed-by: Helm
    chart: skywalking-4.2.0
    component: oap
    heritage: Helm
    release: skywalking
  annotations:
    deployment.kubernetes.io/revision: '23'
    meta.helm.sh/release-name: skywalking
    meta.helm.sh/release-namespace: infrastructure-prod
spec:
  replicas: 3
  selector:
    matchLabels:
      app: skywalking
      component: oap
      release: skywalking
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: skywalking
        component: oap
        release: skywalking
      annotations:
        kubesphere.io/restartedAt: '2022-08-02T01:09:33.250Z'
    spec:
      volumes:
        # Exposes the alarm-settings.yml key of the alarm-settings ConfigMap as a file.
        - name: alarm-settings
          configMap:
            name: alarm-settings
            items:
              - key: alarm-settings.yml
                path: alarm-settings.yml
            defaultMode: 420
      initContainers:
        # Blocks pod start until 172.25.2.8:9200 accepts TCP connections
        # (up to 60 attempts, 5 s apart); exits 1 if it never does.
        - name: wait-for-elasticsearch
          image: 'busybox:1.30'
          command:
            - sh
            - '-c'
            - >-
              for i in $(seq 1 60); do nc -z -w3 172.25.2.8 9200 && exit 0 ||
              sleep 5; done; exit 1
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      containers:
        - name: oap
          image: 'hw-harbor.ur.com.cn/paas/skywalking-oap-server:8.8.1'
          ports:
            - name: grpc
              containerPort: 11800
              protocol: TCP
            - name: rest
              containerPort: 12800
              protocol: TCP
          env:
            - name: TZ
              value: Asia/Shanghai
            - name: JAVA_OPTS
              value: '-Dmode=no-init -Xmx2g -Xms2g'
            - name: SW_CLUSTER
              value: kubernetes
            - name: SW_CLUSTER_K8S_NAMESPACE
              value: infrastructure-prod
            - name: SW_NAMESPACE
              value: sw_prod
            - name: SW_CLUSTER_K8S_LABEL
              value: 'app=skywalking,release=skywalking,component=oap'
            - name: SKYWALKING_COLLECTOR_UID
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.uid
            - name: SW_STORAGE
              value: elasticsearch
            # NOTE(review): the trailing comma is preserved from the original; confirm it is intended.
            - name: SW_STORAGE_ES_CLUSTER_NODES
              value: '172.25.2.8:9200,172.25.2.6:9200,172.25.2.7:9200,'
            # NOTE(review): "xxx" looks like a redacted placeholder; presumably the real
            # credentials should come from a Secret rather than inline values.
            - name: SW_ES_USER
              value: xxx
            - name: SW_ES_PASSWORD
              value: xxx
            - name: SW_STORAGE_ES_RECORD_DATA_TTL
              value: '7'
            - name: SW_STORAGE_ES_BULK_ACTIONS
              value: '5000'
            - name: SW_SUPERDATASET_STORAGE_DAY_STEP
              value: '-1'
            - name: SW_STORAGE_ES_FLUSH_INTERVAL
              value: '60'
            - name: SW_STORAGE_ES_INDEX_SHARDS_NUMBER
              value: '3'
            - name: SW_STORAGE_ES_QUERY_MAX_SIZE
              value: '7000'
            # JSON string of extra index settings; kept verbatim including the escaped quotes.
            - name: SW_STORAGE_ES_ADVANCED
              value: >-
                "{\"index.refresh_interval\":\"30s\",\"index.translog.flush_threshold_size\":\"500mb\",\"index.translog.sync_interval\":\"60s\",\"index.translog.durability\":\"async\"}"
          resources:
            limits:
              cpu: '2'
              memory: 4Gi
            requests:
              cpu: '2'
              memory: 4Gi
          volumeMounts:
            # subPath mounts only the single file instead of shadowing the whole config directory.
            - name: alarm-settings
              readOnly: true
              mountPath: /skywalking/config/alarm-settings.yml
              subPath: alarm-settings.yml
          livenessProbe:
            tcpSocket:
              port: 12800
            initialDelaySeconds: 15
            timeoutSeconds: 1
            periodSeconds: 20
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            tcpSocket:
              port: 12800
            initialDelaySeconds: 15
            timeoutSeconds: 1
            periodSeconds: 20
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: Always
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      serviceAccountName: skywalking-oap
      serviceAccount: skywalking-oap
      securityContext: {}
      affinity:
        # Soft preference to spread OAP pods across different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: skywalking
                    component: oap
                    release: skywalking
                topologyKey: kubernetes.io/hostname
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
# Ingress routing host skywalking.ur.com.cn to the skywalking-ui Service on port 80.
# NOTE: extensions/v1beta1 Ingress is no longer served as of Kubernetes v1.22;
# newer clusters need networking.k8s.io/v1 with the restructured backend fields.
kind: Ingress
apiVersion: extensions/v1beta1
metadata:
  name: skywalking-ingress
  namespace: infrastructure-prod
  labels:
    isExternal: 'true'
    zone: data
  annotations:
    kubernetes.io/elb.id: d4b32321-1e87-478b-a821-b9e4c4a542ce
    kubernetes.io/elb.ip: 172.25.98.42
    kubernetes.io/elb.port: '80'
    kubernetes.io/ingress.class: cce
spec:
  rules:
    - host: skywalking.ur.com.cn
      http:
        paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              serviceName: skywalking-ui
              servicePort: 80
            # NOTE(review): `property` is not a standard Ingress field; presumably a
            # provider-specific extension of the "cce" ingress class — confirm.
            property:
              ingress.beta.kubernetes.io/url-match-mode: STARTS_WITH
webhooks告警配置
# ConfigMap holding alarm-settings.yml: the alarm rules and the WeChat webhook they notify.
kind: ConfigMap
apiVersion: v1
metadata:
  name: alarm-settings
  namespace: infrastructure-prod
  annotations:
    kubesphere.io/creator: linmingwang
# NOTE(review): the WeChat webhook URL below embeds its access key; treat it as a
# credential (rotate it if this file was shared) and consider injecting it from a Secret.
# Everything under `alarm-settings.yml: |-` is file content and is kept verbatim.
data:
  alarm-settings.yml: |-
    rules:
      # Rule unique name, must be ended with `_rule`.
      service_resp_time_rule:
        metrics-name: service_resp_time
        op: ">"
        threshold: 1000
        period: 15
        count: 5
        silence-period: 15
        message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
      service_sla_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_sla
        op: "<"
        threshold: 8000
        # The length of time to evaluate the metrics
        period: 10
        # How many times after the metrics match the condition, will trigger alarm
        count: 2
        # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
        silence-period: 3
        message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
      service_resp_time_percentile_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_percentile
        op: ">"
        threshold: 1000,1000,1000,1000,1000
        period: 10
        count: 3
        silence-period: 5
        message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
      service_instance_resp_time_rule:
        metrics-name: service_instance_resp_time
        op: ">"
        threshold: 1000
        period: 10
        count: 2
        silence-period: 5
        message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
      database_access_resp_time_rule:
        metrics-name: database_access_resp_time
        threshold: 1000
        op: ">"
        period: 10
        count: 2
        message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
      endpoint_relation_resp_time_rule:
        metrics-name: endpoint_relation_resp_time
        threshold: 1000
        op: ">"
        period: 10
        count: 2
        message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
    wechatHooks:
      textTemplate: |-
        {
          "msgtype": "text",
          "text": {
            "content": "Skywalking(生产环境): \n %s."
          }
        }
      webhooks:
        - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=07ab9f9d-c9b0-44bf-9110-8aeaa2166f58
    #webHooks:
    # - http//172.0.0.1



配置 agent
通过 initContainer 将其镜像中的 agent 目录拷贝到与目标容器共享的卷(/share)中
使用 JAVA_OPTS 方式运行 skywalking-agent.jar 服务,skywalking 配置通过 pipeline 流水线变量传递
https://skywalking.apache.org/downloads/



# Application manifest template: Deployment + ConfigMap (start/stop scripts) +
# PersistentVolumeClaim + HorizontalPodAutoscaler.
# $NAME / ${NAME} placeholders are filled in from pipeline variables before applying.
# NOTE(review): `${p}` is presumably an empty pipeline variable, so that `$${p}POD_ID`
# renders as `$POD_ID` and is expanded by the shell at runtime instead — confirm.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: kubesphere
    component: $APP_NAME
    tier: backend
  # Service name
  name: $APP_NAME
  # Project (namespace) name
  namespace: $NAMESPACE
spec:
  progressDeadlineSeconds: 600
  replicas: 2
  selector:
    matchLabels:
      app: kubesphere
      component: $APP_NAME
      tier: backend
  template:
    metadata:
      labels:
        app: kubesphere
        component: $APP_NAME
        tier: backend
    spec:
      affinity:
        # Hard requirement: no two pods with the same component label on one node.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: component
                    operator: In
                    values:
                      - $APP_NAME
              topologyKey: "kubernetes.io/hostname"
      initContainers:
        # Init container: copies /agent/ from its image into the shared /share volume.
        - name: ur-paas-init
          image: $HARBOR_HOST/paas/init-container:1.0.2
          imagePullPolicy: IfNotPresent
          command: ["cp", "-r", "/agent/", "/share/"]
          volumeMounts:
            - name: ur-share
              mountPath: /share
          resources:
            requests:
              cpu: 200m
              memory: 200Mi
            limits:
              cpu: 200m
              memory: 200Mi
      containers:
        - name: $APP_NAME
          image: $HARBOR_HOST/$HARBOR_NAMESPACE/$APP_NAME:$BRANCH_NAME-$BUILD_NUMBER
          command: ["sh","/home/run.sh"]
          imagePullPolicy: Always
          lifecycle:
            preStop:
              exec:
                command: ["sh","/home/preStop.sh"]
          env:
            - name: JAVA_OPTS
              value: $JAVA_OPTS
            - name: CACHE_IGNORE
              value: js|html
            - name: CACHE_PUBLIC_EXPIRATION
              value: 3d
            # Time zone
            - name: TZ
              value: Asia/Shanghai
            # SkyWalking agent settings
            - name: SW_AGENT_NAME
              value: ${APP_NAME}
            - name: SW_JDBC_TRACE_SQL_PARAMETERS
              value: 'true'
            - name: SW_PLUGIN_JDBC_SQL_PARAMETERS_MAX_LENGTH
              value: '512'
            # SkyWalking OAP backend address
            - name: SW_AGENT_COLLECTOR_BACKEND_SERVICES
              value: ${SW_BACKEND}
            # Nacos registry / configuration center
            - name: NACOS_HOST
              value: ${NACOS_HOST}
            - name: NACOS_NAMESPACE
              value: ${NACOS_NAMESPACE}
            - name: NACOS_GROUP
              value: ${NACOS_GROUP}
            # Pod name and IP, read by run.sh / preStop.sh.
            - name: POD_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          livenessProbe:
            httpGet:
              path: $HEALTH_PATH
              port: $CONTAINER_PORT
            initialDelaySeconds: 60
            successThreshold: 1
            timeoutSeconds: 10
            failureThreshold: 10
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: $HEALTH_PATH
              port: $CONTAINER_PORT
            initialDelaySeconds: 30
            timeoutSeconds: 10
            failureThreshold: 30
            periodSeconds: 5
          ports:
            - containerPort: $CONTAINER_PORT
              protocol: TCP
          volumeMounts:
            - name: ur-share
              mountPath: /share
            - name: gc
              mountPath: /home/gc/
            - name: dump
              mountPath: /home/dump/
            - name: scripts
              mountPath: /home/run.sh
              subPath: run.sh
            - name: scripts
              mountPath: /home/preStop.sh
              subPath: preStop.sh
            - name: vol-log
              mountPath: /var/log
              # NOTE(review): `policy` is not a standard volumeMount field; presumably a
              # provider-specific log-collection extension — confirm the nesting.
              policy:
                logs:
                  rotate: Hourly
                  annotations:
                    format: '{"multi":{"mode":"regular","value":"([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))"}}'
                    pathPattern: /info.log
          resources:
            limits:
              cpu: 2
              memory: 4Gi
            requests:
              cpu: 2
              memory: 4Gi
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
      volumes:
        # Shared folder between the init container and the application container
        - name: ur-share
          emptyDir: {}
        - name: vol-log
          emptyDir: {}
        - name: gc
          persistentVolumeClaim:
            claimName: $APP_NAME
        - name: dump
          persistentVolumeClaim:
            claimName: scp-dump
        - name: scripts
          configMap:
            defaultMode: 0777
            name: $APP_NAME
      dnsPolicy: ClusterFirst
      imagePullSecrets:
        - name: $HARBOR_CREDENTIAL_ID
      restartPolicy: Always
      # NOTE(review): preStop.sh sleeps 40s, the same as this grace period, and the
      # preStop hook runs inside the grace period — confirm the application still
      # gets time to shut down cleanly.
      terminationGracePeriodSeconds: 40
---
# Start/stop scripts mounted into the application container at /home/run.sh and
# /home/preStop.sh. The script bodies are file content and are kept verbatim.
# run.sh writes the JVM flags (including the SkyWalking -javaagent) to a file,
# exports them as JAVA_TOOL_OPTIONS, then starts the jar.
# preStop.sh sends a PUT with enable=false to the Nacos instance API, then sleeps 40s.
apiVersion: v1
kind: ConfigMap
metadata:
  name: $APP_NAME
  namespace: $NAMESPACE
data:
  run.sh: |
    #!/bin/bash
    echo -javaagent:/share/agent/skywalking/skywalking-agent.jar \
    -XX:+UseContainerSupport \
    -XX:MaxRAMPercentage=75.0 \
    -XX:MinRAMPercentage=75.0 \
    -XX:InitialRAMPercentage=75.0 \
    -XX:MetaspaceSize=256m \
    -XX:MaxMetaspaceSize=256m \
    -XX:+UseConcMarkSweepGC \
    -XX:+UseCMSCompactAtFullCollection \
    -XX:+CMSClassUnloadingEnabled \
    -XX:CMSInitiatingOccupancyFraction=80 \
    -XX:+UseCMSInitiatingOccupancyOnly \
    -XX:+ExplicitGCInvokesConcurrentAndUnloadsClasses \
    -Dsun.rmi.dgc.server.gcInterval=2592000000 \
    -Dsun.rmi.dgc.client.gcInterval=2592000000 \
    -XX:+UseParNewGC \
    -XX:ParallelGCThreads=4 \
    -XX:SurvivorRatio=8 \
    -XX:+PrintGCDetails \
    -XX:+PrintGCDateStamps \
    -XX:+PrintTenuringDistribution \
    -XX:+PrintHeapAtGC \
    -XX:+PrintReferenceGC \
    -XX:+PrintGCApplicationStoppedTime \
    -XX:+UseGCLogFileRotation \
    -XX:NumberOfGCLogFiles=10 \
    -XX:GCLogFileSize=1000m \
    -Xloggc:/home/gc/gc-$${p}POD_ID-$(date +"%Y-%m-%d-%H-%M-%S").log \
    -XX:+HeapDumpOnOutOfMemoryError \
    -XX:HeapDumpPath=/home/dump/dump-$${p}POD_ID-$(date +"%Y-%m-%d-%H-%M-%S").hprof > /home/gc/java_tool_opts.sh
    export JAVA_TOOL_OPTIONS=$(cat /home/gc/java_tool_opts.sh)
    java -jar ${${p}JAVA_OPTS} -Djava.awt.headless=true -Dsun.net.client.defaultConnectTimeout=10000 -Dsun.net.client.defaultReadTimeout=30000 -Dfile.encoding=utf-8 -Djava.security.egd=file:/dev/./urandom *.jar
  preStop.sh: |
    #!/bin/bash
    curl -XPUT http://${NACOS_HOST}/nacos/v1/ns/instance?serviceName=${APP_NAME}\&groupName=${NACOS_GROUP}\&namespaceId=${NACOS_NAMESPACE}\&ip=$${p}POD_IP\&port=${CONTAINER_PORT}\&enable=false
    sleep 40s
---
# Per-application claim backing the `gc` volume (mounted at /home/gc/).
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: $APP_NAME
  namespace: $NAMESPACE
spec:
  accessModes:
    - ReadWriteMany
  volumeMode: Filesystem
  resources:
    requests:
      storage: 2Gi
  storageClassName: sfsturbo-ur-scp
---
# Autoscaler for the Deployment: 2 to 6 replicas, targeting 60% CPU utilization.
apiVersion: autoscaling/v1
kind: HorizontalPodAutoscaler
metadata:
  annotations:
    extendedhpa.metrics: '[{"type":"Resource","name":"cpu","targetType":"Utilization","targetRange":{"low":"55","high":"65"}}]'
    extendedhpa.option: '{"downscaleWindow":"60m","upscaleWindow":"0m"}'
  name: $APP_NAME
  namespace: $NAMESPACE
spec:
  minReplicas: 2
  maxReplicas: 6
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: $APP_NAME
  targetCPUUtilizationPercentage: 60
如何使用java探针注入器
https://skywalking.apache.org/zh/2022-04-19-how-to-use-the-java-agent-injector
https://github.com/apache/skywalking-swck
SWCK是部署在 Kubernetes 环境中,为 Skywalking 用户提供服务的平台,用户可以基于该平台使用、升级和维护 SkyWalking 相关组件。
实际上,SWCK 是基于 kubebuilder 开发的Operator,为用户提供自定义资源( CR )以及管理资源的控制器( Controller ),所有的自定义资源定义(CRD)如下所示:
- JavaAgent
- OAP
- UI
- Storage
- Satellite
- Fetcher
Java 探针注入器具有以下特点:
- 透明性。用户应用一般运行在普通容器中而 java 探针则运行在初始化容器中,且两者都属于同一个 pod 。该 pod 中的每个容器都会挂载一个共享内存卷,为 java 探针提供存储路径。在 pod 启动时,初始化容器中的 java 探针会先于应用容器运行,由注入器将其中的探针文件存放在共享内存卷中。在应用容器启动时,注入器通过设置 JVM 参数将探针文件注入到应用程序中。用户可以通过这种方式实现 java 探针的注入,而无需重新构建包含 java 探针的容器镜像。
- 可配置性。注入器提供两种方式配置 java 探针:全局配置和自定义配置。默认的全局配置存放在 configmap 中,用户可以根据需求修改全局配置,比如修改 backend_service 的地址。此外,用户也能通过 annotation 为特定应用设置自定义的一些配置,比如不同服务的 service_name 名称。详情可见 java探针说明书。
- 可观察性。每个 java 探针在被注入时,用户可以查看名为 JavaAgent 的 CRD 资源,用于观测注入后的 java 探针配置。详情可见 JavaAgent说明。
