一、为什么需要自定义告警?¶
- 1、满足不同的监控需求;
- 2、结合链路追踪及告警规则,更高效解决问题;
二、场景定义¶
2.1 需求¶
场景描述:公司主营业务为在线购物网站,那么 HTTP 服务的可用性就非常重要。如果某个服务出现故障,则可能会导致用户无法访问网站,从而影响用户体验和业务收入。
因此,需要借助 SkyWalking 自定义告警功能来监控异常 HTTP 状态码,及时发现和解决问题,提高服务质量和系统可用性。
2.2 需求分析¶
当接口返回状态码为 404,500, 502, 503, 504 其中一个,就发送告警。
如果要添加自定义告警,首先需要在 oal 文件中添加一个指标。
Helm 包文件目录 skywalking/files/conf.d 下有一个 README.md 文件,介绍了如何用自定义配置文件覆盖镜像内的默认配置(注意:README 中写作 files/config.d,而本 chart 的实际目录为 files/conf.d):
[root@master01 8]# cat skywalking/files/conf.d/README.md
If you don't want to use the default configuration files packed into the Docker image,
put your own configuration files under this directory in the corresponding component subdirectory,
`oap`, `ui`, etc.
Files under `oap/*` will override the counterparts under the Docker image's `/skywalking/config/*`, with the directory structure retained, here are some examples:
| File under `files/config.d/oap` directory | Overrides the file under Docker image's `/skywalking/config/` |
| ---- | -------- |
| `files/config.d/oap/application.yml` | `/skywalking/config/application.yml` |
| `files/config.d/oap/log4j2.xml` | `/skywalking/config/log4j2.xml` |
| `files/config.d/oap/alarm-settings.yml` | `/skywalking/config/alarm-settings.yml` |
| `files/config.d/oap/endpoint-name-grouping.yml` | `/skywalking/config/endpoint-name-grouping.yml` |
| `files/config.d/oap/oal/core.oal` | `/skywalking/config/oal/core.oal` |
| `files/config.d/oap/oal/browser.oal` | `/skywalking/config/oal/browser.oal` |
| `files/config.d/oap/oc-rules/oap.yaml` | `/skywalking/config/oc-rules/oap.yaml` |
| `...` | `...` |
Files under `satellite/*` will override the counterparts under the Docker image's `/skywalking/configs/*`, with the directory structure retained, here are some examples:
| File under `files/config.d/satellite` directory | Overrides the file under Docker image's `/skywalking/configs/` |
| ---- | -------- |
| `files/config.d/satellite/satellite_config.yaml` | `/skywalking/configs/satellite_config.yaml` |
| `...` | `...` |
2.3 修改 core.oal¶
复制镜像中默认 core.oal 的全部内容,按照上述覆盖规则放到 files/conf.d/oap/oal/core.oal(即 README 中所写的 files/config.d/oap/oal/core.oal),并在文件末尾追加新的告警指标。
Ps:如有es-init报错可先忽略;
[root@master01 ~]# cd /root/8/skywalking/files/conf.d
[root@master01 conf.d]# mkdir -p oap/oal
[root@master01 conf.d]# cd oap/oal/
[root@master01 oal]# vim core.oal
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// Metric definitions for the OAP server. Each statement has the shape
//   <metric_name> = from(<Scope>.<field>)[.filter(<condition>)].<aggregation>();
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".
// All scope metrics
all_percentile = from(All.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
all_heatmap = from(All.latency).histogram(100, 20);
// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();
// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_avg = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Database access scope metrics
database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
// Custom metric: counts Endpoint-scope records whose responseCode is one of
// 404, 500, 502, 503 or 504. An alarm rule refers to this metric by its name
// (`metrics-name: endpoint_abnormal`).
// zhdya 20230625
endpoint_abnormal = from(Endpoint.*).filter(responseCode in [404, 500, 502, 503, 504]).count();
更新部署
# 更新
[root@master01 ~]# cd /root/8
[root@master01 8]# helm upgrade skywalking skywalking -n devops --values ./skywalking/values.yaml
2.4 验证¶
# 查看运行中的pod(kgp 是 kubectl get pods 的别名)
[root@master01 8]# kgp -n devops | grep oap
skywalking-oap-5554b6699f-h2j68 1/1 Running 0 45s
# 验证查询
[root@master01 8]# kubectl exec -it skywalking-oap-5554b6699f-h2j68 -ndevops -- cat /skywalking/config/oal/core.oal |grep -C 3 zhdya
Defaulted container "oap" out of: oap, wait-for-elasticsearch (init)
// zhdya 20230625
endpoint_abnormal = from(Endpoint.*).filter(responseCode in [404, 500, 502, 503, 504]).count();
三、增加告警rules¶
修改helm文件中的rules配置,新增如下rules
# 添加如下内容
[root@master01 templates]# cd /root/8/skywalking/templates/
[root@master01 templates]# vim oap-configmap.yaml
# Alarm rule for the custom `endpoint_abnormal` metric: fires when the metric
# value is >= 1 in at least 1 check within a period of 2 (minutes, per the message).
endpoint_abnormal_rule:
metrics-name: endpoint_abnormal
threshold: 1
op: ">="
period: 2
count: 1
# No trailing `\n`: the text is substituted into the webhook template, where a
# trailing line break would leave the closing punctuation on a line of its own.
message: 接口:{name}\n 指标:接口异常\n 详情:最近2分钟内至少1次
# 完整配置文件
[root@master01 templates]# cd /root/8/skywalking/templates/
[root@master01 templates]# vim oap-configmap.yaml
{{- if .Values.oap.dynamicConfigEnabled }}
# This ConfigMap is rendered only when `oap.dynamicConfigEnabled` is set in the chart values.
apiVersion: v1
kind: ConfigMap
metadata:
name: skywalking-dynamic-config
labels:
app: {{ template "skywalking.name" . }}
release: {{ .Release.Name }}
component: {{ .Values.oap.name }}
data:
# The key below holds the alarm settings as an embedded YAML document (literal block scalar `|-`).
alarm.default.alarm-settings: |-
rules:
# Rule unique name, must be ended with `_rule`.
service_resp_time_rule:
# Alarm when `service_resp_time` is > 2000 (2 seconds, per the message) in at
# least 3 checks within a period of 10; stays silent for 5 after firing.
metrics-name: service_resp_time
op: ">"
threshold: 2000
period: 10
count: 3
silence-period: 5
message: 服务:{name}\n 指标:响应时间\n 详情:至少3次超过2秒(最近10分钟内)
service_sla_rule:
# Alarm when the service success rate drops below 80% in at least 3 checks
# within a period of 10.
# Metrics value need to be long, double or int
metrics-name: service_sla
op: "<"
# Percent metrics are stored as percentage * 100 (10000 = 100%), so 80% is 8000.
# The previous value 2000 meant 20% and contradicted the message below.
threshold: 8000
# The length of time to evaluate the metrics
period: 10
# How many times after the metrics match the condition, will trigger alarm
count: 3
# How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
silence-period: 3
message: 服务:{name}\n 指标:成功率\n 详情:至少3次低于80%(最近10分钟内)
service_resp_time_percentile_rule:
# Alarm when any percentile series of `service_percentile` is > 1000 in at
# least 2 checks within a period of 10; stays silent for 5 after firing.
# Metrics value need to be long, double or int
metrics-name: service_percentile
op: ">"
# One threshold per percentile series (p50, p75, p90, p95, p99).
threshold: 1000,1000,1000,1000,1000
period: 10
count: 2
silence-period: 5
# The message states 2 occurrences so that it agrees with `count: 2` above.
message: 服务:{name}\n 指标:响应时间\n 详情:至少2次百分位超过1秒(最近10分钟内)
service_instance_resp_time_rule:
# Alarm when `service_instance_resp_time` is > 2000 (2 seconds, per the message)
# in at least 2 checks within a period of 10; stays silent for 5 after firing.
metrics-name: service_instance_resp_time
op: ">"
threshold: 2000
period: 10
count: 2
silence-period: 5
message: 实例:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
database_access_resp_time_rule:
# Alarm when `database_access_resp_time` is > 2000 (2 seconds, per the message)
# in at least 2 checks within a period of 10.
metrics-name: database_access_resp_time
threshold: 2000
op: ">"
period: 10
count: 2
# English equivalent of the message below:
# Response time of database access {name} is more than 2000ms in 2 minutes of last 10 minutes
message: 数据库访问:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
endpoint_relation_resp_time_rule:
# Alarm when `endpoint_relation_resp_time` is > 2000 (2 seconds, per the message)
# in at least 2 checks within a period of 10.
metrics-name: endpoint_relation_resp_time
threshold: 2000
op: ">"
period: 10
count: 2
message: 端点关系:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
instance_jvm_old_gc_count_rule:
# Alarm when `instance_jvm_old_gc_count` is > 1 in at least 1 check within a
# period of 3.
metrics-name: instance_jvm_old_gc_count
threshold: 1
op: ">"
period: 3
count: 1
# The evaluation window is `period: 3` (minutes), not one day, so the message says 3 minutes.
message: 实例:{name}\n 指标:OldGC次数\n 详情:最近3分钟内大于1次
instance_jvm_young_gc_count_rule:
# Alarm when `instance_jvm_young_gc_count` is > 100 in at least 1 check within
# a period of 5.
metrics-name: instance_jvm_young_gc_count
# The "100" from the message is the metric threshold. It used to be placed in
# `count`, but `count` is the number of matching checks inside the window and
# can never exceed `period`, so the rule could not fire.
threshold: 100
op: ">"
period: 5
count: 1
message: 实例:{name}\n 指标:YoungGC次数\n 详情:最近5分钟内大于100次
# ============== Newly added alarm rule for abnormal endpoint responses ==============
# Fires when the custom `endpoint_abnormal` metric is >= 1 in at least 1 check
# within a period of 2 (minutes, per the message).
endpoint_abnormal_rule:
metrics-name: endpoint_abnormal
threshold: 1
op: ">="
period: 2
count: 1
message: 接口:{name}\n 指标:接口异常\n 详情:最近2分钟内至少1次
# =============================================
# WeChat webhook notification: `textTemplate` is the JSON body sent to each URL
# listed under `webhooks`; `%s` is presumably replaced with the alarm message (confirm).
# NOTE(review): the webhook URL below embeds an access key in plain text. Consider
# supplying it through chart values or a Secret, and rotate the key if it is real.
wechatHooks:
textTemplate: |-
{
"msgtype": "text",
"text": {
"content": "SkyWalking 链路追踪告警: \n %s."
}
}
webhooks:
- https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=9d8866d6-ab55-48f3-8336-786325667640
{{- end }}