SkyWalking(五)实现钉钉告警

发布时间: 2023-11-16 15:13:05 作者: areke

1. alert告警参数简介

1.1 告警指标

cat /apps/apache-skywalking-apm-bin/config/oal/core.oal

  • service_resp_time

    服务的响应时间

  • service_sla

    服务的http请求成功率SLA,比如99%等

  • service_cpm

    表示每分钟的吞吐量

  • service_apdex

    应用性能指数(Apdex),取值为0到1之间的小数,例如0.8

  • service_percentile

    服务响应时间的百分位数统计,结果包含p50、p75、p90、p95、p99等多个值

  • endpoint_relation_cpm

    端点的每分钟的吞吐量

  • endpoint_relation_resp_time

    端点的响应时间

  • endpoint_relation_sla

    端点的http请求成功率SLA,比如99%等。

  • endpoint_relation_percentile

    端点的最近多少数据范围内的响应时间百分比,即p99、 p95、 p90、 p75、p50在内的数据统计结果

1.2 告警规则

cat /apps/apache-skywalking-apm-bin/config/alarm-settings.yml
rules:                              # 定义rule规则
  service_cpm_rule:                 # 唯一的规则名称,必须以_rule结尾
    metrics-name: service_cpm       # 指标名称,对应core.oal文件中的采集指标名称
    include-names:                  # skywalking服务名称,不填写默认为所有服务
      - dubbox-provider
      - dubbox-consumer
    op: ">"                         # 操作符,>, >=, <, <=, ==
    threshold: 1                    # 指标阈值
    period: 2                       # 评估指标的间隔周期
    count: 1                        # 匹配成功多少次就会触发告警
    silence-period: 2               # 触发告警后的静默时间,min
    message: dubbo-provider service_cpm 大于1了 # 告警信息

rules:
  # 告警规则 名称唯一 必须以_rule 结尾
  service_resp_time_rule:
    # 度量名称,只支持int long double
    metrics-name: service_resp_time
    # 操作符
    op: ">"
    # 阈值 ms
    threshold: 1000
    # 评估度量的时间长度,单位为分钟
    period: 10
    # 度量有多少次符合告警条件后,才会触发告警
    count: 2
    # 静默时间 默认情况下,它和周期一样,在同一个周期内只会触发一次。
    silence-period: 10
    message: 服务【{name}】的平均响应时间在最近10分钟内有2分钟超过1秒
  service_sla_rule:
    metrics-name: service_sla
    op: "<"
    threshold: 8000
    period: 10
    count: 2
    silence-period: 10
    message: 服务【{name}】的成功率在最近10分钟内有2分钟低于80%
composite-rules:
  # 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
  comp_rule:
    # 指定如何组成规则,支持&&, ||, ()操作符
    expression: service_resp_time_rule && service_sla_rule
    message: 服务【{name}】在最近10分钟内有2分钟平均响应时间超过1秒并且成功率低于80%
dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
      "content": "Apache SkyWalking Alarm: \n %s."
      }  
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=a374b5e60f6d408e46d17ba5340245b522314298422d3279d71ee019fc2c6b64
      # secret: SEC8a7e5fe2bb03d383963c144a9cf754c4f0b43b6b3e04a2e892		# 钉钉机器人加签,secret认证

详细指标、规则等见官方文档:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md#list-of-all-potential-metrics-name

2. 钉钉配置

创建机器人方法参考:[1]

2.1 自定义关键字

在机器人的安全设置中,将自定义关键词设置为 SkyWalking(告警消息内容必须包含该关键词才能发送成功)

安全设置文档:https://open.dingtalk.com/document/robots/customize-robot-security-settings

2.2 脚本验证

测试发送消息,包含关键字SkyWalking

[root@dubbo-server1 opt]#bash dingding-keywords.sh "SkyWalking-test"
{"errcode":0,"errmsg":"ok"}

钉钉收到消息

3. 告警指标配置

采用默认指标,也可以自定义

[root@skywalking-server config]#cat oal/core.oal
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".

// Service scope metrics
service_resp_time = from(Service.latency).longAvg();		// 服务的响应时间
service_sla = from(Service.*).percent(status == true);		// 服务的http请求成功率SLA,比如99%等
service_cpm = from(Service.*).cpm();						// 表示每分钟的吞吐量
service_percentile = from(Service.latency).percentile(10);  // 指定最近多少数据范围内的响应时间百分比,即p50, p75, p90, p95, p99在内的数据统计结果
service_apdex = from(Service.latency).apdex(name, status);	// 应用性能指数
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();	// 
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();

// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);

4. 自定义告警规则

4.1 定义告警配置

dingtalkHooks部分配置参考:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md#dingtalk

cat /apps/apache-skywalking-apm-bin/config/alarm-settings.yml

rules:
  service_cpm_rule:
    metrics-name: service_cpm
    # include-names:			# skywalking服务名称,不填写默认所有服务
      # - dubbox-provider
      # - dubbox-consumer
      # - aa579e648a7c4677b048659fe6aaf385@10.0.0.92
    op: ">"
    threshold: 1
    period: 1
    count: 1
    silence-period: 1
    message: dubbo-provider的当前指标service_cpm,请求值大于1000了!!!!
dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
      "content": "Apache SkyWalking Alarm: \n %s."
      }  
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=a374b5e60f6d408e46d17ba5340245b522314298422d3279d71ee019fc2c6b64
      # secret: SEC8a7e5fe2bb03d383963c144a9cf754c4f0b43b6b3e04a2e892		# 钉钉机器人加签,secret认证

即测试当服务每分钟请求量(service_cpm)大于1时,自动发送钉钉告警

4.2 重启服务

重启skywalking server服务

4.3 访问测试页面

不断刷新页面,使访问量大于1

4.4 验证钉钉

4.5 验证告警历史


  1. 创建群组机器人

    1. 添加机器人

    2. 创建加签或关键词

    安全设置文档:https://open.dingtalk.com/document/robots/customize-robot-security-settings

    3. 查看机器人

    复制Webhook地址

    ↩︎