SkyWalking 监控告警处理和外挂配置

发布时间: 2023-07-28 14:19:26　作者: kenneth-lin

1、添加告警配置

vim configs/alarm-settings.yml
# DingTalk robot hook settings (edited in configs/alarm-settings.yml).
dingtalkHooks:
  # JSON body posted to the DingTalk robot as a plain-text message.
  # NOTE(review): `%s` is presumably substituted with the alarm message by
  # SkyWalking - confirm against the SkyWalking alarm documentation.
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  # Robot endpoints to notify; replace the <...> placeholders with real values.
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=<access_token>
      # Signing ("加签") value of the robot.
      secret: <加签值>

2、添加apollo配置

vim application.yml
找到 configuration 下的 apollo 配置

在 Apollo 中添加配置

修改后的监控规则配置

# Alarm rules. Keys of every rule are listed in one fixed order:
# metrics-name, op, threshold, period, count, silence-period, message.
rules:
  # A rule name has to be unique and must end with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: '>'
    threshold: 10000
    period: 5
    count: 3
    silence-period: 3
    message: '服务 {name} 的响应时间在最近5分钟内有3分钟超过了10秒'

  service_sla_rule:
    # The metric value has to be a long, double or int.
    metrics-name: service_sla
    op: '<'
    threshold: 8000
    # Length of the time window over which the metric is evaluated.
    period: 5
    # Number of times the condition must match before the alarm triggers.
    count: 3
    # Number of checks the alarm stays silent after it has triggered;
    # defaults to the same value as `period`.
    silence-period: 3
    message: '服务 {name} 的成功率在最近5分钟内有3分钟低于80%'

  service_resp_time_percentile_rule:
    # The metric value has to be a long, double or int.
    metrics-name: service_percentile
    op: '>'
    # One threshold per percentile; the message below names them as
    # p50, p75, p90, p95, p99.
    threshold: '10000,10000,10000,10000,10000'
    period: 5
    count: 3
    silence-period: 5
    message: '服务 {name} 的百分位响应告警时间在最近5分钟内有3分钟出现告警, 由于超过了p50 > 10000, p75 > 10000, p90 > 10000, p95 > 10000, p99 > 10000之中的某个条件'

  service_instance_resp_time_rule:
    metrics-name: service_instance_resp_time
    op: '>'
    threshold: 10000
    period: 5
    count: 3
    silence-period: 5
    message: '服务实例 {name} 的响应时间在最近5分钟内有3分钟超过了10秒。'

  # No silence-period here, so the default (same as `period`) applies.
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    op: '>'
    threshold: 10000
    period: 5
    count: 3
    message: '数据库 {name} 访问的响应时间在最近5分钟内有3分钟超过了10秒'

  # No silence-period here, so the default (same as `period`) applies.
  endpoint_relation_resp_time_rule:
    metrics-name: endpoint_relation_resp_time
    op: '>'
    threshold: 10000
    period: 5
    count: 3
    message: '端点关系 {name} 的响应时间在最近5分钟内有3分钟超过了10秒'

# Endpoint-level alarms are left disabled: endpoints far outnumber services
# and service instances, so alarming on them costs more memory.
# Uncomment the rule below to enable it.
#
#  endpoint_avg_rule:
#    metrics-name: endpoint_avg
#    op: ">"
#    threshold: 1000
#    period: 10
#    count: 2
#    silence-period: 5
#    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

# DingTalk robot hook that delivers the alarms defined under `rules`.
# SECURITY: do not publish a real access token or signing secret. Keep the
# placeholders in shared documents and fill in the real values only in the
# private configuration. Any credential that has been published must be
# treated as leaked and regenerated in the DingTalk robot settings.
dingtalkHooks:
  # JSON body posted to the robot. Lines inside the block scalar are part of
  # the value, so keep the indentation regular and free of trailing spaces.
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=<access_token>
      # Signing ("加签") value of the robot.
      secret: <加签值>

#webhooks:
#  - http://127.0.0.1/notify/
#  - http://127.0.0.1/go-wechat/