day19 Alert Message Forwarding Platform Deployment and Management / Alert Rules Needed in the Enterprise (7.9-7.10)

Published: 2023-12-22 20:56:20  Author: ikubernetesi

I. Alert Message Forwarding Platform Deployment and Management

Alert-center message forwarding system: PrometheusAlert

Official site: https://github.com/feiyu563/PrometheusAlert

Manual: README - PrometheusAlert (gitbook.io)

1. Why use PrometheusAlert?

1.1 Background

  • Maintaining alert channels through ConfigMap files gives poor visibility and is hard to keep track of;
  • Some messaging alert channels require an additional webhook service to be deployed just for forwarding;
  • The Alertmanager web UI is fairly basic and cannot present much additional data.

1.2 Architecture

Exporters --> Prometheus --> Prometheus Rules --> Alertmanager --> webhook --> PrometheusAlert --> Routes --> WeCom / DingTalk / Feishu
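The hop from Alertmanager to PrometheusAlert is simply a webhook receiver pointing at the PrometheusAlert endpoint; PrometheusAlert then renders the message through a template and routes it to WeCom/DingTalk/Feishu. A minimal sketch (the Service name and template name follow the examples used later in this article):

receivers:
- name: 'webhook'
  webhook_configs:
  # forward every notification to the PrometheusAlert custom-template endpoint
  - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx'
    send_resolved: true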

1.3 Advantages of PrometheusAlert

  1. Flexible configuration: everything is configured through the web UI, which is more flexible than editing a ConfigMap.
  2. Feature-rich: supports online editing of alert templates, alert routing, alert records, test sending, and more.
    1. Adds alert levels for Prometheus and supports sending messages to different targets depending on the level.
    2. Simplifies Prometheus grouping configuration and supports sending a given message to one or multiple receivers.
    3. Adds a dashboard providing configuration testing, custom alert message templates, template testing, and more.
  3. Highly customizable: routing rules can trigger additional alert channels (WeCom, DingTalk, Feishu, public-cloud phone/SMS APIs).
    1. Adds phone-number configuration and automatic number rotation, so alerts can always go to a single person or be rotated across several people, including sending to different people on different dates.
  4. Open source and free: PrometheusAlert is fully open source, free to use and customize, and has broad community support.

2. Deploying and Configuring PrometheusAlert

2.1 Running in Kubernetes

# In Kubernetes you can run it directly with the command below (note: the default manifest does not mount the template database file db/PrometheusAlertDB.db; add a volume mount yourself to avoid losing template data)
$ kubectl apply -n monitor -f https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/example/kubernetes/PrometheusAlert-Deployment.yaml
# Once started, open the following address in a browser: http://[YOUR-PrometheusAlert-URL]:8080
# The default login user and password are configured in app.conf
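To confirm the deployment came up, a quick check (a sketch, assuming the default labels and names from the manifest above):

$ kubectl -n monitor get deploy,pod -l app=prometheus-alert-center
$ kubectl -n monitor logs deploy/prometheus-alert-center --tail=20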

2.2 Deploying with Helm

# Clone the project source code
$ git clone https://github.com/feiyu563/PrometheusAlert.git
$ cd PrometheusAlert/example/helm
# To change the configuration, edit app.conf under config
# The Helm chart supports configuring an Ingress host in values.yaml
# After adjusting the configuration, start it with the command below (note: the default chart does not mount the template database file db/PrometheusAlertDB.db; add a volume mount yourself to avoid losing template data)
$ helm upgrade --install monitor prometheusalert -n monitor
# Once started, open the following address in a browser: http://[Ingress_url]:[Ingress_port]
# The default login user and password are configured in app.conf
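To verify the Helm release (a sketch; the exact resource names depend on the chart defaults):

$ helm -n monitor status monitor
$ kubectl -n monitor get pods,ingress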

2.3 Deploying with a Controller Manifest

To avoid losing template data, add a PersistentVolumeClaim and mount it:

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prome-alert-data-pvc
  namespace: monitor
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: "nfs-storageclass"
  resources:
    requests:
      storage: 5Gi
[root@master-1-230 7.9]# kubectl  apply -f pvc.yaml 
persistentvolumeclaim/prome-alert-data-pvc created

Download the official manifest:

wget https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/example/kubernetes/PrometheusAlert-Deployment.yaml
cat PrometheusAlert-Deployment.yaml
# apiVersion: v1
# kind: Namespace
# metadata:
#   name: monitor
---  
apiVersion: v1
data:
  app.conf: |
    #---------------------↓全局配置-----------------------
    appname = PrometheusAlert
    #登录用户名
    login_user=prometheusalert
    #登录密码
    login_password=prometheusalert
    #监听地址
    httpaddr = "0.0.0.0"
    #监听端口
    httpport = 8080
    runmode = dev
    #设置代理 proxy = http://123.123.123.123:8080
    proxy =
    #开启JSON请求
    copyrequestbody = true
    #告警消息标题
    title=PrometheusAlert
    #链接到告警平台地址
    GraylogAlerturl=http://graylog.org
    #钉钉告警 告警logo图标地址
    logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
    #钉钉告警 恢复logo图标地址
    rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
    #短信告警级别(等于3就进行短信告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
    messagelevel=3
    #电话告警级别(等于4就进行语音告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
    phonecalllevel=4
    #默认拨打号码(页面测试短信和电话功能需要配置此项)
    defaultphone=xxxxxxxx
    #故障恢复是否启用电话通知0为关闭,1为开启
    phonecallresolved=0
    #是否前台输出file or console
    logtype=file
    #日志文件路径
    logpath=logs/prometheusalertcenter.log
    #转换Prometheus,graylog告警消息的时区为CST时区(如默认已经是CST时区,请勿开启)
    prometheus_cst_time=0
    #数据库驱动,支持sqlite3,mysql,postgres如使用mysql或postgres,请开启db_host,db_port,db_user,db_password,db_name的注释
    db_driver=sqlite3
    #db_host=127.0.0.1
    #db_port=3306
    #db_user=root
    #db_password=root
    #db_name=prometheusalert
    #是否开启告警记录 0为关闭,1为开启
    AlertRecord=0
    #是否开启告警记录定时删除 0为关闭,1为开启
    RecordLive=0
    #告警记录定时删除周期,单位天
    RecordLiveDay=7
    # 是否将告警记录写入es7,0为关闭,1为开启
    alert_to_es=0
    # es地址,是[]string
    # beego.Appconfig.Strings读取配置为[]string,使用";"而不是","
    to_es_url=http://localhost:9200
    # to_es_url=http://es1:9200;http://es2:9200;http://es3:9200
    # es用户和密码
    # to_es_user=username
    # to_es_pwd=password
    
    #---------------------↓webhook-----------------------
    #是否开启钉钉告警通道,可同时开始多个通道0为关闭,1为开启
    open-dingding=1
    #默认钉钉机器人地址
    ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx
    #是否开启 @所有人(0为关闭,1为开启)
    dd_isatall=1
    
    #是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
    open-weixin=1
    #默认企业微信机器人地址
    wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx
    
    #是否开启飞书告警通道,可同时开始多个通道0为关闭,1为开启
    open-feishu=0
    #默认飞书机器人地址
    fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx
    
    #---------------------↓腾讯云接口-----------------------
    #是否开启腾讯云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-txdx=0
    #腾讯云短信接口key
    TXY_DX_appkey=xxxxx
    #腾讯云短信模版ID 腾讯云短信模版配置可参考 prometheus告警:{1}
    TXY_DX_tpl_id=xxxxx
    #腾讯云短信sdk app id
    TXY_DX_sdkappid=xxxxx
    #腾讯云短信签名 根据自己审核通过的签名来填写
    TXY_DX_sign=腾讯云
    
    #是否开启腾讯云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-txdh=0
    #腾讯云电话接口key
    TXY_DH_phonecallappkey=xxxxx
    #腾讯云电话模版ID
    TXY_DH_phonecalltpl_id=xxxxx
    #腾讯云电话sdk app id
    TXY_DH_phonecallsdkappid=xxxxx
    
    #---------------------↓华为云接口-----------------------
    #是否开启华为云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-hwdx=0
    #华为云短信接口key
    HWY_DX_APP_Key=xxxxxxxxxxxxxxxxxxxxxx
    #华为云短信接口Secret
    HWY_DX_APP_Secret=xxxxxxxxxxxxxxxxxxxxxx
    #华为云APP接入地址(端口接口地址)
    HWY_DX_APP_Url=https://rtcsms.cn-north-1.myhuaweicloud.com:10743
    #华为云短信模板ID
    HWY_DX_Templateid=xxxxxxxxxxxxxxxxxxxxxx
    #华为云签名名称,必须是已审核通过的,与模板类型一致的签名名称,按照自己的实际签名填写
    HWY_DX_Signature=华为云
    #华为云签名通道号
    HWY_DX_Sender=xxxxxxxxxx
    
    #---------------------↓阿里云接口-----------------------
    #是否开启阿里云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-alydx=0
    #阿里云短信主账号AccessKey的ID
    ALY_DX_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云短信接口密钥
    ALY_DX_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云短信签名名称
    ALY_DX_SignName=阿里云
    #阿里云短信模板ID
    ALY_DX_Template=xxxxxxxxxxxxxxxxxxxxxx
    
    #是否开启阿里云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-alydh=0
    #阿里云电话主账号AccessKey的ID
    ALY_DH_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云电话接口密钥
    ALY_DH_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
    #阿里云电话被叫显号,必须是已购买的号码
    ALY_DX_CalledShowNumber=xxxxxxxxx
    #阿里云电话文本转语音(TTS)模板ID
    ALY_DH_TtsCode=xxxxxxxx
    
    #---------------------↓容联云接口-----------------------
    #是否开启容联云电话告警通道,可同时开始多个通道0为关闭,1为开启
    open-rlydh=0
    #容联云基础接口地址
    RLY_URL=https://app.cloopen.com:8883/2013-12-26/Accounts/
    #容联云后台SID
    RLY_ACCOUNT_SID=xxxxxxxxxxx
    #容联云api-token
    RLY_ACCOUNT_TOKEN=xxxxxxxxxx
    #容联云app_id
    RLY_APP_ID=xxxxxxxxxxxxx
    
    #---------------------↓邮件配置-----------------------
    #是否开启邮件
    open-email=0
    #邮件发件服务器地址
    Email_host=smtp.qq.com
    #邮件发件服务器端口
    Email_port=465
    #邮件帐号
    Email_user=xxxxxxx@qq.com
    #邮件密码
    Email_password=xxxxxx
    #邮件标题
    Email_title=运维告警
    #默认发送邮箱
    Default_emails=xxxxx@qq.com,xxxxx@qq.com
    
    #---------------------↓七陌云接口-----------------------
    #是否开启七陌短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-7moordx=0
    #七陌账户ID
    7MOOR_ACCOUNT_ID=Nxxx
    #七陌账户APISecret
    7MOOR_ACCOUNT_APISECRET=xxx
    #七陌账户短信模板编号
    7MOOR_DX_TEMPLATENUM=n
    #注意:七陌短信变量这里只用一个var1,在代码里写死了。
    #-----------
    #是否开启七陌webcall语音通知告警通道,可同时开始多个通道0为关闭,1为开启
    open-7moordh=0
    #请在七陌平台添加虚拟服务号、文本节点
    #七陌账户webcall的虚拟服务号
    7MOOR_WEBCALL_SERVICENO=xxx
    # 文本节点里被替换的变量,我配置的是text。如果被替换的变量不是text,请修改此配置
    7MOOR_WEBCALL_VOICE_VAR=text
    
    #---------------------↓telegram接口-----------------------
    #是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
    open-tg=0
    #tg机器人token
    TG_TOKEN=xxxxx
    #tg消息模式 个人消息或者频道消息 0为关闭(推送给个人),1为开启(推送给频道)
    TG_MODE_CHAN=0
    #tg用户ID
    TG_USERID=xxxxx
    #tg频道name或者id, 频道name需要以@开始
    TG_CHANNAME=xxxxx
    #tg api地址, 可以配置为代理地址
    #TG_API_PROXY="https://api.telegram.org/bot%s/%s"
    
    #---------------------↓workwechat接口-----------------------
    #是否开启workwechat告警通道,可同时开始多个通道0为关闭,1为开启
    open-workwechat=0
    # 企业ID
    WorkWechat_CropID=xxxxx
    # 应用ID
    WorkWechat_AgentID=xxxx
    # 应用secret
    WorkWechat_AgentSecret=xxxx
    # 接受用户
    WorkWechat_ToUser="zhangsan|lisi"
    # 接受部门
    WorkWechat_ToParty="ops|dev"
    # 接受标签
    WorkWechat_ToTag=""
    # 消息类型, 暂时只支持markdown
    # WorkWechat_Msgtype = "markdown"
    
    #---------------------↓百度云接口-----------------------
    #是否开启百度云短信告警通道,可同时开始多个通道0为关闭,1为开启
    open-baidudx=0
    #百度云短信接口AK(ACCESS_KEY_ID)
    BDY_DX_AK=xxxxx
    #百度云短信接口SK(SECRET_ACCESS_KEY)
    BDY_DX_SK=xxxxx
    #百度云短信ENDPOINT(ENDPOINT参数需要用指定区域的域名来进行定义,如服务所在区域为北京,则为)
    BDY_DX_ENDPOINT=http://smsv3.bj.baidubce.com
    #百度云短信模版ID,根据自己审核通过的模版来填写(模版支持一个参数code:如prometheus告警:{code})
    BDY_DX_TEMPLATE_ID=xxxxx
    #百度云短信签名ID,根据自己审核通过的签名来填写
    TXY_DX_SIGNATURE_ID=xxxxx
    
    #---------------------↓百度Hi(如流)-----------------------
    #是否开启百度Hi(如流)告警通道,可同时开始多个通道0为关闭,1为开启
    open-ruliu=0
    #默认百度Hi(如流)机器人地址
    BDRL_URL=https://api.im.baidu.com/api/msg/groupmsgsend?access_token=xxxxxxxxxxxxxx
    #百度Hi(如流)群ID
    BDRL_ID=123456
    #---------------------↓bark接口-----------------------
    #是否开启telegram告警通道,可同时开始多个通道0为关闭,1为开启
    open-bark=0
    #bark默认地址, 建议自行部署bark-server
    BARK_URL=https://api.day.app
    #bark key, 多个key使用分割
    BARK_KEYS=xxxxx
    # 复制, 推荐开启
    BARK_COPY=1
    # 历史记录保存,推荐开启
    BARK_ARCHIVE=1
    # 消息分组
    BARK_GROUP=PrometheusAlert
    
    #---------------------↓语音播报-----------------------
    #语音播报需要配合语音播报插件才能使用
    #是否开启语音播报通道,0为关闭,1为开启
    open-voice=1
    VOICE_IP=127.0.0.1
    VOICE_PORT=9999
    
    #---------------------↓飞书机器人应用-----------------------
    #是否开启feishuapp告警通道,可同时开始多个通道0为关闭,1为开启
    open-feishuapp=1
    # APPID
    FEISHU_APPID=cli_xxxxxxxxxxxxx
    # APPSECRET
    FEISHU_APPSECRET=xxxxxxxxxxxxxxxxxxxxxx
    # 可填飞书 用户open_id、user_id、union_ids、部门open_department_id
    AT_USER_ID="xxxxxxxx"
  user.csv: |
    2019年4月10日,15888888881,小张,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月11日,15888888882,小李,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月12日,15888888883,小王,15999999999,备用联系人小陈,15999999998,备用联系人小赵
    2019年4月13日,15888888884,小宋,15999999999,备用联系人小陈,15999999998,备用联系人小赵
kind: ConfigMap
metadata:
  name: prometheus-alert-center-conf
  namespace: monitor
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: prometheus-alert-center
    alertname: prometheus-alert-center
  name: prometheus-alert-center
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-alert-center
      alertname: prometheus-alert-center
  template:
    metadata:
      labels:
        app: prometheus-alert-center
        alertname: prometheus-alert-center
    spec:
      containers:
      - image: feiyu563/prometheus-alert
        name: prometheus-alert-center
        env:
        - name: TZ
          value: "Asia/Shanghai"
        ports:
        - containerPort: 8080
          name: http
        resources:
          limits:
            cpu: 200m
            memory: 200Mi
          requests:
            cpu: 100m
            memory: 100Mi
        volumeMounts:
        - name: prometheus-alert-center-conf-map
          mountPath: /app/conf/app.conf
          subPath: app.conf
        - name: prometheus-alert-center-conf-map
          mountPath: /app/user.csv
          subPath: user.csv
      volumes:
      - name: prometheus-alert-center-conf-map
        configMap:
          name: prometheus-alert-center-conf
          items:
          - key: app.conf
            path: app.conf
          - key: user.csv
            path: user.csv
---
apiVersion: v1
kind: Service
metadata:
  labels:
    alertname: prometheus-alert-center
  name: prometheus-alert-center
  namespace: monitor 
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '8080'  
spec:
  ports:
  - name: http
    port: 8080
    targetPort: http
  selector:
    app: prometheus-alert-center
---
# apiVersion: networking.k8s.io/v1beta1
# kind: Ingress
# metadata:
#   annotations:
#     kubernetes.io/ingress.class: nginx
#   name: prometheus-alert-center
#   namespace: monitoring
# spec:
#   rules:
#     - host: alert-center.local
#       http:
#         paths:
#           - backend:
#               serviceName: prometheus-alert-center
#               servicePort: 8080
#             path: /

Modify the configuration:

  app.conf: |
    #---------------------↓全局配置-----------------------
    appname = PrometheusAlert
    #登录用户名
    login_user=admin
    #登录密码
    login_password=admin123
    #监听地址
    httpaddr = "0.0.0.0"
    #监听端口
    httpport = 8080
...
    #数据库驱动,支持sqlite3,mysql,postgres如使用mysql或postgres,请开启db_host,db_port,db_user,db_password,db_name的注释
    db_driver=mysql
    db_host=mysql.default.svc.cluster.local
    db_port=3306
    db_user=root
    db_password=TQtH0tjCLt
    db_name=prometheusalert
...
    #是否开启告警记录定时删除 0为关闭,1为开启
    RecordLive=1
    #告警记录定时删除周期,单位天
    RecordLiveDay=7
...
    #是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
    open-weixin=1
    #默认企业微信机器人地址
    wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=b3738de5-9f00-45de-9e3d-45e6309bc54c
...
        - name: prometheus-alert-db
          mountPath: /app/db

      - name: prometheus-alert-db
        persistentVolumeClaim:
          claimName: prome-alert-data-pvc
...
[root@master-1-230 7.9]# kubectl  apply -f PrometheusAlert-Deployment.yaml 
configmap/prometheus-alert-center-conf created
deployment.apps/prometheus-alert-center created
service/prometheus-alert-center created
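Verify that the pod is running with the new configuration and that the /app/db volume is mounted (a sketch, assuming the names above):

kubectl -n monitor get pods -l app=prometheus-alert-center
kubectl -n monitor describe deploy prometheus-alert-center | grep -A4 Mounts: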

Configure the Ingress route:

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-alert-ing
  namespace: monitor
  annotations:
    prometheus.io/http_probe: "true"
spec:
  ingressClassName: nginx
  rules:
    - host: prometheusalert.ikubernetes.cloud
      http:
        paths:
          - pathType: Prefix
            backend:
              service:
                name: prometheus-alert-center
                port:
                  number: 8080
            path: /
[root@master-1-230 7.9]# kubectl  apply -f ingress.yaml 
ingress.networking.k8s.io/prometheus-alert-ing created
[root@master-1-230 7.9]# kubectl  get ingress -n monitor
NAME                   CLASS   HOSTS                               ADDRESS         PORTS   AGE
alertmanager-ingress   nginx   alertmanager.ikubernetes.cloud      192.168.1.204   80      17h
prometheus-alert-ing   nginx   prometheusalert.ikubernetes.cloud   192.168.1.204   80      37s
prometheus-ingress     nginx   prometheus.ikubernetes.cloud        192.168.1.204   80      31h
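If the host name is not yet in DNS, point it at the ingress address shown in the output above and test (a sketch):

echo "192.168.1.204 prometheusalert.ikubernetes.cloud" >> /etc/hosts
curl -I http://prometheusalert.ikubernetes.cloud/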

2.4 Using MySQL as the PrometheusAlert Backend Store

Deploy MySQL into the Kubernetes cluster:

## Create the secret (in the same namespace as the Deployment below)
kubectl create secret generic mysql-root-password -n monitor --from-literal=password=mysqlpwd123

## Create the PVC (it must be in the same namespace as the Deployment below)
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: mysql-pvc
  namespace: monitor
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
  storageClassName: nfs-storage
  
## Create the MySQL Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: mysql
  name: mysql
  namespace: monitor
spec:
  selector:
    matchLabels:
      app: mysql
  template:
    metadata:
      labels:
        app: mysql
    spec:
      containers:
      - image: mysql:5.7
        name: mysql
        env:
        - name: MYSQL_ROOT_PASSWORD
          valueFrom:
            secretKeyRef:
              name: mysql-root-password
              key: password
          # If you don't want to keep the MySQL password in a Secret, you can set it directly like this; crude but it works
          #value: "123456"
        ports:
        - containerPort: 3306
        volumeMounts:
        - name: mysqlvolume
          mountPath: /var/lib/mysql
      volumes:
      - name: mysqlvolume
        # use the PVC created above
        persistentVolumeClaim:
          claimName: mysql-pvc
---
# Define the Service for MySQL
apiVersion: v1
kind: Service
metadata:
  labels:
    app: svc-mysql
  name: svc-mysql
  namespace: monitor
spec:
  selector:
    app: mysql
  type: ClusterIP
  ports:
  - port: 3306
    protocol: TCP
    targetPort: 3306
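Apply the manifests and check that MySQL is up (a sketch; the file name is an assumption):

kubectl apply -n monitor -f mysql-deploy.yaml
kubectl -n monitor get pods -l app=mysql
kubectl -n monitor get svc svc-mysql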

Create the database:

[root@master-1-230 7.9]# kubectl  get pod -A|grep mysql-0
default          mysql-0                                    1/1     Running   8 (8h ago)       8d
[root@master-1-230 7.9]# kubectl  get svc|grep mysql
mysql                     NodePort       10.110.136.173   <none>          3306:32000/TCP               8d
# Get the mysql pod
[root@master-1-230 7.9]# kubectl get pod |grep mysql-0
mysql-0                                    1/1     Running   8 (8h ago)       8d

# Enter the mysql pod (use "--" to separate the command; the old form is deprecated)
kubectl exec -it mysql-0 -- /bin/bash
# Run the following inside the mysql container
# Log in to MySQL
mysql -uroot -p
# Create the prometheusalert database
CREATE DATABASE prometheusalert CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;

[root@master-1-230 7.9]# kubectl exec -it mysql-0 /bin/bash
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
I have no name!@mysql-0:/$ mysql -uroot -p
Enter password: 
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 7735
Server version: 8.0.35 Source distribution

Copyright (c) 2000, 2023, Oracle and/or its affiliates.

Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

mysql> CREATE DATABASE prometheusalert CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
Query OK, 1 row affected (0.02 sec)

mysql> show databases;
+--------------------+
| Database           |
+--------------------+
| go_gin_api         |
| information_schema |
| my_database        |
| mysql              |
| performance_schema |
| prometheusalert    |
| sys                |
+--------------------+
7 rows in set (0.01 sec)

mysql> 

 

Configure PrometheusAlert to use MySQL as its backend store


  • By default PrometheusAlert uses sqlite3 to store custom templates. This suits single-instance deployments and covers most production scenarios. For companies with higher availability requirements, and to make PrometheusAlert easier to scale horizontally, the default storage can be switched to MySQL (MySQL 5.7 or later is recommended).

  • 1. Create the database:

    CREATE DATABASE prometheusalert CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
  • 2. Enable the MySQL settings in the PrometheusAlert configuration file conf/app.conf (the database name must match the one created above), then start PrometheusAlert; it initializes the database tables automatically on startup.

    # Database driver; supports sqlite3 and mysql. For mysql, uncomment db_host, db_user, db_password and db_name
    db_driver=mysql
    db_host=127.0.0.1:3306
    db_user=root
    db_password=root
    db_name=prometheusalert
  • 3. Open the PrometheusAlert web page, go to Template Management --> Custom Templates --> Choose File, import prometheus-alert-template.json from the db directory, and refresh the page.

    [root@master-1-230 ~]# kubectl cp prometheus-alert-center-58f446c665-t9klh:/app/db/prometheus-alert-template.json  ./prometheus-alert-template.json
    [root@master-1-230 ~]# ls prometheus-alert-template.json 
    prometheus-alert-template.json
  • 4. Restart PrometheusAlert; it now uses MySQL as its default backend store.
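When PrometheusAlert runs inside the cluster, point db_host at the MySQL Service instead of 127.0.0.1. A sketch, assuming the svc-mysql Service and the root password created in section 2.4:

    db_driver=mysql
    db_host=svc-mysql.monitor.svc.cluster.local:3306
    db_user=root
    db_password=mysqlpwd123
    db_name=prometheusalert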

Reference: https://github.com/feiyu563/PrometheusAlert/blob/master/doc/readme/base-install.md

3. PrometheusAlert Configuration Management

3.1 Editing the Custom Template

Template content:

{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}{{if eq $v.status "resolved"}}[PROMETHEUS-恢复信息]({{$v.generatorURL}})
> **[{{$v.labels.alertname}}]({{$var}})**✅
> <font color="info">告警级别:</font> {{$v.labels.severity}}
> <font color="info">开始时间:</font> {{GetCSTtime $v.startsAt}}
> <font color="info">结束时间:</font> {{GetCSTtime $v.endsAt}}
> <font color="info">命名空间:</font> {{$v.labels.namespace}}
> <font color="info">实例名称:</font> {{$v.labels.pod}}
> <font color="info">实例地址:</font> {{$v.labels.instance}}
> <font color="info">**{{$v.annotations.description}}**</font>{{else}}[PROMETHEUS-告警信息]({{$v.generatorURL}})
> **[{{$v.labels.alertname}}]({{$var}})**?
> <font color="#FF0000">告警级别:</font> {{$v.labels.severity}}
> <font color="#FF0000">开始时间:</font> {{GetCSTtime $v.startsAt}}
> <font color="#FF0000">命名空间:</font> {{$v.labels.namespace}}
> <font color="#FF0000">实例名称:</font> {{$v.labels.pod}}
> <font color="#FF0000">实例地址:</font> {{$v.labels.instance}}
> <font color="#FF0000">**{{$v.annotations.description}}**</font>{{end}}{{ end }}
{{ $urimsg:=""}}{{ range $key,$value:=.commonLabels }}{{$urimsg =  print $urimsg $key "%3D%22" $value "%22%2C" }}{{end}}[✍点我屏蔽该告警](http://alertmanager.ikubernetes.cloud/#/silences/new?filter=%7B{{SplitString $urimsg 0 -3}}%7D)
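A template can be exercised without waiting for a real alert by posting an Alertmanager-style payload straight to the custom-template endpoint (a sketch; the template name and in-cluster URL are assumptions based on the examples in this article):

curl -XPOST -H 'Content-Type: application/json' \
  'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx' \
  -d '{"status":"firing","externalURL":"http://alertmanager.ikubernetes.cloud","commonLabels":{"alertname":"TestAlert"},"alerts":[{"status":"firing","labels":{"alertname":"TestAlert","severity":"warning","namespace":"monitor","pod":"demo-pod","instance":"10.0.0.1:9100"},"annotations":{"description":"This is a test alert"},"startsAt":"2023-12-22T12:00:00Z","generatorURL":"http://prometheus.ikubernetes.cloud"}]}'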

3.2 Adding a Custom Template

Template content:

{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}{{if eq $v.status "resolved"}}[PROMETHEUS-恢复信息]({{$v.generatorURL}})
> **[{{$v.labels.alertname}}]({{$var}})**✅
> <font color="info">告警级别:</font> {{$v.labels.severity}}
> <font color="info">当前状态:</font> **<font color="#67C23A">已恢复</font>**
> <font color="info">开始时间:</font> {{GetCSTtime $v.startsAt}}
> <font color="info">结束时间:</font> {{GetCSTtime $v.endsAt}}
> <font color="info">实例地址:</font> {{$v.labels.instance}}
> <font color="info">**{{$v.annotations.description}}**</font>{{else}}[PROMETHEUS-告警信息]({{$v.generatorURL}})
> **[{{$v.labels.alertname}}]({{$var}})**?
> <font color="#FF0000">告警级别:</font> {{$v.labels.severity}}
> <font color="#FF0000">当前状态:</font> **<font color="#E6A23C">需要处理</font>**
> <font color="#FF0000">开始时间:</font> {{GetCSTtime $v.startsAt}}
> <font color="#FF0000">实例地址:</font> {{$v.labels.instance}}
> <font color="#FF0000">**{{$v.annotations.description}}**</font>{{end}}{{ end }}
{{ $urimsg:=""}}{{ range $key,$value:=.commonLabels }}{{$urimsg =  print $urimsg $key "%3D%22" $value "%22%2C" }}{{end}}[✍点我屏蔽该告警](http://alertmanager.ikubernetes.cloud/#/silences/new?filter=%7B{{SplitString $urimsg 0 -3}}%7D)

3.3 Alert Testing

Alert Management --> Alert Test --> WeCom --> check the test message in WeCom;

3.4 Alert Routing

If one colleague is responsible for a large number of machines, how do you match them all with a regular expression?
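One way to handle this, independent of PrometheusAlert's own web-UI router, is to do the regex matching on the Alertmanager side and send the matched alerts to a dedicated webhook URL (a sketch; the instance pattern and the WeCom key are assumptions):

route:
  receiver: 'webhook'
  routes:
  # everything whose instance matches this colleague's machines goes to a dedicated robot
  - receiver: 'webhook-teamA'
    match_re:
      instance: '(10\.1\.2\..*|node-a-.*)'
receivers:
- name: 'webhook-teamA'
  webhook_configs:
  - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx&wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=TEAM-A-KEY'
    send_resolved: true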

4. Configuring Alertmanager to Send to PrometheusAlert

Update the Alertmanager configuration so that all alerts are routed to the alert forwarding platform:

apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitor
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 1m
      smtp_smarthost: 'smtp.exmail.qq.com:465'     # SMTP host of the mail server
      smtp_from: 'zhdya@zhdya.cn'    # sender address
      smtp_auth_username: 'zhdya@zhdya.cn'      # login user
      smtp_auth_password: 'yfXBXXXXX3DwYTjn'    # this is the mailbox's third-party authorization password, not the account password
      smtp_require_tls: false           # some providers require this; the WeCom mailbox here is only for testing and does not need it
    templates:
      - '/etc/alertmanager/*.tmpl'
    route:
      group_by: ['env','instance','type','group','job','alertname','cluster']
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 10m
      receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx&wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=71c0a6f0-43a0-4ecf-b8c9-52aff88f3b68&at=ZhangDaDan,ZHDYA'
        send_resolved: true
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']
  wechat.tmpl: |-
    {{ define "wechat.default.message" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 监控报警 =========
    告警状态:{{   .Status }}
    告警级别:{{ .Labels.severity }}
    告警类型:{{ $alert.Labels.alertname }}
    故障主机: {{ $alert.Labels.instance }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    触发阀值:{{ .Annotations.value }}
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 告警恢复 =========
    告警类型:{{ .Labels.alertname }}
    告警状态:{{   .Status }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    {{- if gt (len $alert.Labels.instance) 0 }}
    实例信息: {{ $alert.Labels.instance }}
    {{- end }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- end }}
  email.tmpl: |-
    {{ define "email.from" }}xxx.com{{ end }}
    {{ define "email.to" }}xxx.com{{ end }}
    {{ define "email.to.html" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{ range .Alerts }}
    ========= 监控报警 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }}  <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{ range .Alerts }}
    ========= 告警恢复 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}
    {{- end }}

Interface description

    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx&wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=71c0a6f0-43a0-4ecf-b8c9-52aff88f3b68&at=ZhangDaDan,ZHDYA'

## Interface description:
/prometheusalert   # custom-template endpoint; after defining a template in the Dashboard, any WebHook source can be plugged in
type=?: the target type the message is forwarded to, e.g. DingTalk, WeCom or Feishu; this parameter is required.
Currently supported values:
dd  DingTalk
wx  WeCom (Enterprise WeChat)
workwechat  WeCom application
fs  Feishu
webhook  WebHook
txdx  Tencent Cloud SMS
txdh  Tencent Cloud phone call
alydx  Alibaba Cloud SMS
alydh  Alibaba Cloud phone call
hwdx  Huawei Cloud SMS
bddx  Baidu Cloud SMS
rlydh  Ronglian Cloud phone call
7moordx  7moor SMS
7moordh  7moor voice call
email  Email
tg  Telegram
rl  Baidu Hi (Ruliu)

tpl=?: the template to render the message with, e.g. prometheus-dd (the Prometheus template for DingTalk); templates can be viewed or created on the PrometheusAlert page under Template Management --> Custom Templates; this parameter is required.
ddurl=?: the DingTalk robot address PrometheusAlert sends to; multiple addresses can be separated with ","; use with a type=dd template; optional, defaults to the value in app.conf.
wxurl=?: the WeCom robot address PrometheusAlert sends to; multiple addresses can be separated with ","; use with a type=wx template; optional, defaults to the value in app.conf.
fsurl=?: the Feishu robot address PrometheusAlert sends to; multiple addresses can be separated with ","; use with a type=fs template; optional, defaults to the value in app.conf.
phone=?: the phone number(s) PrometheusAlert sends to; multiple numbers can be separated with ","; use with a type=txdx | hwdx | bddx | alydx | txdh | alydh | rlydh | 7moordx | 7moordh template; optional, defaults to the value in app.conf.
email=?: the email address(es) PrometheusAlert sends to; multiple addresses can be separated with ","; use with a type=email template; optional, defaults to the value in app.conf.
groupid=?: the groupid PrometheusAlert sends to; use with a type=rl template; optional, defaults to the value in app.conf.
webhook=?: the webhook PrometheusAlert sends to; use with a type=webhook template; optional.
at=?: enables @-mentions for DingTalk and WeCom robots; multiple targets are separated with ",". Note that DingTalk @ uses phone numbers while the WeCom robot @ uses user accounts; optional.
rr=?: enables random round-robin; currently only effective for ddurl, fsurl and wxurl. By default, when several addresses are configured the message is sent to all of them; with this option one address is picked at random, mainly to avoid triggering robot rate limiting; optional.
split=?: only effective for Prometheus alert messages; splits a grouped Prometheus notification into individual messages. It is enabled by default, because a single notification carrying many alerts of the same group can produce an oversized message body. Add split=false to the URL to disable it; optional.
Note: with split=false, the routing and alert-record features of the PrometheusAlert web UI are automatically disabled; use with care.
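Putting the parameters together, a receiver URL that uses the DingTalk template, round-robins two robots and @-mentions two phone numbers might look like this (a sketch; the tokens, numbers and the rr value are placeholders/assumptions):

    receivers:
    - name: 'webhook-dd'
      webhook_configs:
      - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=https://oapi.dingtalk.com/robot/send?access_token=TOKEN1,https://oapi.dingtalk.com/robot/send?access_token=TOKEN2&rr=true&at=13800000001,13800000002'
        send_resolved: true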

Hot-reload the configuration (Alertmanager exposes the same /-/reload endpoint as Prometheus):

curl -XPOST http://alertmanager.ikubernetes.cloud/-/reload

5. Testing and Verification

Verify that all alerts are delivered to the designated group:

## Default labels follow the default route
$ curl -XPOST -H 'Content-Type: application/json' http://alertmanager.ikubernetes.cloud/api/v1/alerts -d '[{"labels":{"hostname":"test"},"annotations":{"summary":"This is a test alert"}}]'

Verify that alerts carrying a specific label reach the designated group:

## The alert is dispatched to the designated alert group and should also appear in the DEVOPS group
$ curl -XPOST -H 'Content-Type: application/json' http://alertmanager.ikubernetes.cloud/api/v1/alerts -d '[{"labels":{"hostname":"hello"},"annotations":{"summary":"This is a test alert"}}]'

II. Which Alert Rules Does an Enterprise Need?

1. Introduction

Alerting rules in Prometheus let you define trigger conditions as PromQL expressions. Prometheus evaluates these rules periodically and sends a notification once a condition is met.
In an enterprise, well-chosen alerting rules are essential for keeping the business stable and reliable. Several dimensions are worth considering:

  1. Business dimension: different businesses have different metrics and alerting rules. A consumer-facing platform, for example, needs to monitor order volume, inventory, payment success rate and similar indicators to ensure normal operation.
  2. Environment dimension: companies usually run multiple environments such as development, testing, staging and production. Since each has different characteristics, each needs its own alerting rules.
  3. Application dimension: different applications have different metrics and alerting rules. When monitoring a web application you need to watch the HTTP error rate, response time, memory usage and so on.
  4. Infrastructure dimension: infrastructure includes servers, network devices and storage. Here you monitor CPU usage, disk space, network bandwidth and similar metrics.

2. Defining Alerting Rules

A typical alerting rule looks like this:

    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: |
          up{job=~"other-ECS|k8s-nodes|prometheus"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 已经停止1分钟以上."

In a rule file, a set of related rules is defined under one group.
Each group can contain multiple alerting rules. A rule consists mainly of the following parts:

  • alert: the name of the alerting rule.
  • expr: the trigger condition as a PromQL expression, evaluated to check whether any time series satisfies it.
  • for: optional evaluation wait time; the alert only fires after the condition has held for this duration. While waiting, newly triggered alerts are in the pending state.
  • labels: custom labels, letting the user attach an additional set of labels to the alert.
  • annotations: a set of additional information, such as text describing the alert in detail; annotations are sent along to Alertmanager when the alert fires.
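Prometheus only evaluates rule groups that are referenced via rule_files in its main configuration. A minimal sketch, assuming the rule files are mounted under /etc/prometheus/rules (the path depends on how Prometheus is deployed):

    # prometheus.yml
    rule_files:
      - /etc/prometheus/rules/*.yml
    # reload Prometheus after the rules change:
    # curl -XPOST http://prometheus.ikubernetes.cloud/-/reload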

3. Alerting Rules Used in the Enterprise

Adapt these to your own business scenarios; for reference see: Awesome Prometheus alerts | Collection of alerting rules (samber.github.io)

3.1 Node.rules

    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: |
          100 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} : {{ $labels.mountpoint }} 分区使用大于85% (当前值: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: |
          100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} 内存使用率过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 内存使用大于85% (当前值: {{ $value }})"
      - alert: NodeCPUUsage
        expr: |
          100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 10m
        labels:
          hostname: '{{$labels.hostname}}'
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU使用率过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} CPU使用大于85% (当前值: {{ $value }})"
      - alert: TCP_Estab
        expr: |
          node_netstat_Tcp_CurrEstab > 5500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} TCP_Estab链接过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} TCP_Estab链接过高!(当前值: {{ $value }})"
      - alert: TCP_TIME_WAIT
        expr: |
          node_sockstat_TCP_tw > 3000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} TCP_TIME_WAIT过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} TCP_TIME_WAIT过高!(当前值: {{ $value }})"
      - alert: TCP_Sockets
        expr: |
          node_sockstat_sockets_used > 10000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} TCP_Sockets链接过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} TCP_Sockets链接过高!(当前值: {{ $value }})"
      - alert: KubeNodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.node }} NotReady已经1分钟.'
      - alert: KubernetesMemoryPressure
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes memory pressure (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has MemoryPressure condition VALUE = {{ $value }}"
      - alert: KubernetesDiskPressure
        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes disk pressure (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has DiskPressure condition."
      - alert: KubernetesContainerOomKiller
        expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes container oom killer (instance {{ $labels.instance }})
          description: "{{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
      - alert: KubernetesJobFailed
        expr: kube_job_status_failed > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Job failed (instance {{ $labels.instance }})
          description: "Job {{$labels.namespace}}/{{$labels.job_name}} failed to complete."
      - alert: UnusualDiskReadRate
        expr: |
          sum by (job,instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 140
        for: 5m
        labels:
          severity: critical
          hostname: '{{ $labels.hostname }}'
        annotations:
          description: '{{ $labels.instance }} 主机名:{{ $labels.hostname }} 持续5分钟磁盘读取数据(> 140 MB/s) (当前值: {{ $value }}) 阿里云ESSD PL0最大吞吐量180MB/s, PL1最大350MB/s'
      - alert: UnusualDiskWriteRate
        expr: |
          sum by (job,instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 140
        for: 5m
        labels:
          severity: critical
          hostname: '{{ $labels.hostname }}'
        annotations:
          description: '{{ $labels.instance }} 主机名:{{ $labels.hostname }} 持续5分钟磁盘写入数据(> 140 MB/s) (当前值: {{ $value }}) 阿里云ESSD PL0最大吞吐量180MB/s, PL1最大350MB/s'
      - alert: UnusualNetworkThroughputIn
        expr: |
          sum by (job,instance) (irate(node_network_receive_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} 主机名:{{ $labels.hostname }} 持续5分钟网络带宽接收数据(> 80 MB/s) (当前值: {{ $value }})'
      - alert: UnusualNetworkThroughputOut
        expr: |
          sum by (job,instance) (irate(node_network_transmit_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} 主机名:{{ $labels.hostname }} 持续5分钟网络带宽发送数据(> 80 MB/s) (当前值: {{ $value }})'
      - alert: SystemdServiceCrashed
        expr: |
          node_systemd_unit_state{state="failed"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
         description: '{{ $labels.instance }} 主机名:{{ $labels.hostname }} 上的{{$labels.name}}服务有问题已经5分钟,请及时处理'
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 以当前写入速率,预计文件系统将在未来24小时内耗尽空间!"
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 磁盘iNode空间剩余小于10%!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 当前主机检查到有OOM现象!"

3.2 prometheus.rules

    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusErrorSendingAlertsToAnyAlertmanagers
        expr: |
           (rate(prometheus_notifications_errors_total{instance="localhost:9090", job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{instance="localhost:9090", job="prometheus"}[5m])) * 100 > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: |
           max_over_time(prometheus_notifications_alertmanagers_discovered{instance="localhost:9090", job="prometheus"}[5m]) != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} 链接alertmanager异常!"
      - alert: PrometheusRuleFailures
        expr: |
           increase(prometheus_rule_evaluation_failures_total{instance="localhost:9090", job="prometheus"}[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          description: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} 在5分钟执行失败的规则次数 {{ printf "%.0f" $value }}'
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus 遇到规则 {{ $value }} 载入失败, 请及时检查."
      - alert: PrometheusTsdbReloadFailures
        expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus {{ $value }} TSDB 重载失败!"
      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus {{ $value }} TSDB WAL 模块出现问题!"

3.3 website.rules

    groups:
    - name: website.rules
      rules:
      - alert: "ssl证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
        for: 1h
        labels:
          severity: warning
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "ssl证书过期警告"
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
          pod: '{{$labels.instance}}'
          namespace: '{{$labels.kubernetes_namespace}}'
        annotations:
          summary: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问"
          description: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问,请尽快检测!"
      - alert: curlHttpStatus
        expr:  probe_http_status_code{job="blackbox-http"} >= 422 and probe_success{job="blackbox-http"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '业务报警: 网站不可访问'
          description: '{{$labels.instance}} 不可访问,请及时查看,当前状态码为{{$value}}'

3.4 pod.rules

    groups:
    - name: pod.rules
      rules:
      - alert: PodCPUUsage
        expr: |
           sum(rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) by (pod, namespace) > 90
        for: 5m
        labels:
          severity: warning
          pod: '{{$labels.pod}}'
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于90% (当前值: {{ $value }})"
      - alert: PodMemoryUsage
        expr: |
           sum(container_memory_rss{image!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 85
        for: 5m
        labels:
          severity: critical
          pod: '{{$labels.pod}}'
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于85% (当前值: {{ $value }})"
      - alert: KubeDeploymentError
        expr: |
           kube_deployment_spec_replicas{job="kubernetes-service-endpoints"} != kube_deployment_status_replicas_available{job="kubernetes-service-endpoints"}
        for: 3m
        labels:
          severity: warning
          pod: '{{$labels.deployment}}'
        annotations:
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }}控制器与实际数量不相符 (当前值: {{ $value }})"
      - alert: coreDnsError
        expr: |
           kube_pod_container_status_running{container="coredns"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} coreDns服务异常 (当前值: {{ $value }})"
      - alert: kubeProxyError
        expr: |
           kube_pod_container_status_running{container="kube-proxy"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} kube-proxy服务异常 (当前值: {{ $value }})"
      - alert: filebeatError
        expr: |
           kube_pod_container_status_running{container="filebeat"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} filebeat服务异常 (当前值: {{ $value }})"
      - alert: PodNetworkReceive
        expr: |
           sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 60000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 入口流量大于60MB/s (当前值: {{ $value }}K/s)"
      - alert: PodNetworkTransmit
        expr: |
           sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 60000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 出口流量大于60MB/s (当前值: {{ $value }}/K/s)"
      - alert: PodRestart
        expr: |
           sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod重启 (当前值: {{ $value }})"
      - alert: PodFailed
        expr: |
           sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
        for: 5s
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"
      - alert: PodPending
        expr: |
           sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
        for: 30s
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"
      - alert: PodErrImagePull
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ErrImagePull (当前值: {{ $value }})"
      - alert: PodImagePullBackOff
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ImagePullBackOff (当前值: {{ $value }})"
      - alert: PodCrashLoopBackOff
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态CrashLoopBackOff (当前值: {{ $value }})"
      - alert: PodInvalidImageName
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态InvalidImageName (当前值: {{ $value }})"
      - alert: PodCreateContainerConfigError
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态CreateContainerConfigError (当前值: {{ $value }})"
      - alert: KubernetesContainerOomKiller
        expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes container oom killer (instance {{ $labels.instance }})
          description: "{{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes!"
      - alert: KubernetesPersistentvolumeError
        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} Persistent volume is in bad state!"
      - alert: KubernetesStatefulsetDown
        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
          description: "{{ $labels.statefulset }} A StatefulSet went down!"
      - alert: KubernetesStatefulsetReplicasMismatch
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
          description: "{{ $labels.statefulset }} A StatefulSet does not match the expected number of replicas."

3.5 volume.rules

    groups:
    - name: volume.rules
      rules:
      - alert: PersistentVolumeClaimLost
        expr: |
           sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost!"
      - alert: PersistentVolumeClaimPending
        expr: |
           sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pending"}) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending!"
      - alert: PersistentVolumeFailed
        expr: |
           sum(kube_persistentvolume_status_phase{phase="Failed",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "Persistent volume is failed state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: PersistentVolumePending
        expr: |
           sum(kube_persistentvolume_status_phase{phase="Pending",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "Persistent volume is pending state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

3.6 process.rules

    groups:
    - name: process.rules
      rules:
      - alert: SparkxtaskProcessDown
        expr: |
           (namedprocess_namegroup_num_procs{groupname="map[:sparkxtask]"}) < 4
        for: 1m
        labels:
          severity: warning
          pod: sparkxads-process
        annotations:
          description: "任务名称: sparktask | 正常进程数量: 4个 | 当前值: {{ $value }},请Robot及时处理!"