blackbox_rules.yml
---
# Alerting rules evaluated against blackbox_exporter probe metrics.
groups:
  - name: blackbox_alert
    rules:
      # Fires once probe_success has been 0 for 5 minutes.
      - alert: blackbox_alert
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "接口/主机/端口 {{ $labels.instance }} 无法联通"
          description: "请尽快检测"

      # Fires when probe_ssl_earliest_cert_expiry is less than 30 days
      # (86400 seconds per day) ahead of the current time, sustained for 1 hour.
      - alert: "ssl证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warn
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "ssl证书过期警告"
k8s_rules.yml
---
# Alerting rules for scrape-target health and Kubernetes container state.
groups:
  - name: node.rules
    rules:
      # Job health: if a target's metrics endpoint cannot be scraped for
      # 5 minutes, the alert is sent on to Alertmanager.
      - alert: JobDown
        expr: up == 0  # 0 = unhealthy, 1 = healthy
        for: 5m  # the condition must hold for 5 minutes before firing
        labels:
          severity: error
          cluster: k8s
        annotations:
          summary: "Job: {{ $labels.job }} down"
          description: "Instance:{{ $labels.instance }}, Job {{ $labels.job }} stop "

      # A container is reported as not running.
      # NOTE(review): `for: 2s` is likely shorter than the rule evaluation
      # interval, so this fires on the second consecutive evaluation - confirm
      # that this sensitivity is intended.
      - alert: PodDown
        expr: kube_pod_container_status_running != 1
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} down'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'

      # A container has been reporting not-ready for 5 minutes, which suggests
      # a startup problem. The alert name is kept as-is; the annotation text
      # now states the condition that the expression actually checks.
      - alert: PodReady
        expr: kube_pod_container_status_ready != 1
        for: 5m
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} not ready'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} has not been ready for 5 minutes'

      # The container restart counter changed within the last 30 minutes.
      - alert: PodRestart
        expr: changes(kube_pod_container_status_restarts_total[30m]) > 0
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} restart'
          description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'
node_rules.yml
---
# Host-level alerting rules built on node_exporter metrics.
groups:
  - name: 主机状态-监控告警  # group name
    rules:
      # Target has been unreachable (up == 0) for 60 seconds.
      - alert: 主机存活告警
        expr: up == 0
        for: 60s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} 宕机超过1分钟!"

      # Per-instance CPU usage (100 minus idle percentage) above 80% for 15 minutes.
      - alert: 主机CPU使用率告警
        expr: >-
          100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "CPU近15分钟使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Memory usage (1 - MemAvailable / MemTotal) above 85% for 15 minutes.
      - alert: 主机内存使用率告警
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "内存利用率大于85%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Disk usage above 80% on xfs/ext4 filesystems for 15 minutes.
      - alert: 主机磁盘使用率告警
        expr: >-
          100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"}
          / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Less than 2 GiB (1073741824 bytes each) available on the /data mount.
      - alert: 数据目录可用量
        expr: node_filesystem_avail_bytes{mountpoint="/data",fstype=~"ext4|xfs"} / 1073741824 < 2
        for: 1m
        labels:
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在data挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"

      # Less than 2 GiB available on the root mount.
      - alert: 根目录可用量
        expr: node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} / 1073741824 < 2
        for: 1m
        labels:
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在root挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"

      # More than 10000 established TCP connections for 2 minutes.
      # The value is a connection count, so the description reports it as a
      # count rather than a percentage.
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 2m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."

      # 5000 or more sockets in TIME_WAIT for 1 minute (a count, not a percentage).
      - alert: 主机Tcp TimeWait数量过多告警
        expr: node_sockstat_TCP_tw >= 5000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Tcp TimeWait数量大于5000, 实例: {{ $labels.instance }},当前值:{{ $value }}"

      # Share of CPU time spent in iowait over 5 minutes is at least 10%.
      - alert: 主机iowait较高
        expr: >-
          sum by (instance) (increase(node_cpu_seconds_total{mode="iowait"}[5m]))
          / sum by (instance) (increase(node_cpu_seconds_total[5m])) * 100 >= 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU ioWait近5分钟占比大于等于10%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Average disk I/O time ratio across devices above 90% for 5 minutes.
      - alert: 磁盘IO性能
        expr: avg by (instance, job) (irate(node_disk_io_time_seconds_total[1m])) * 100 > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."

      # Disk read throughput above 50 MiB/s
      - alert: 主机磁盘读过大
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"

      # Disk write throughput above 50 MiB/s
      - alert: 主机磁盘写过大
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"

      # Inbound traffic on non-virtual interfaces.
      # Label-matcher regexes are fully anchored, so `virbr.*` (not `virbr*`)
      # is needed to exclude names such as virbr0, and `lo` excludes loopback.
      # NOTE(review): (bytes/s) / 100 > 102400 is a threshold of 10,240,000
      # bytes/s, while the description says "100M" - confirm the intended unit.
      - alert: 网络流入
        expr: >-
          ((sum by (instance, job)
          (rate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])))
          / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

      # Outbound traffic on non-virtual interfaces; same device filter and
      # threshold as the inbound rule. The description now uses a well-formed
      # {{ $value }} template and says TX, since this is transmit traffic.
      - alert: 网络流出
        expr: >-
          ((sum by (instance, job)
          (rate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])))
          / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

      # 15-minute load average above 5.6 for 1 minute.
      # Uses node_load15 so the metric matches the alert name, and identifies
      # the host through the instance label.
      # NOTE(review): 5.6 presumably corresponds to 70% of an 8-core host -
      # confirm against the actual fleet.
      - alert: 系统15分钟负载告警
        expr: node_load15 > 5.6
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "服务器: {{$labels.instance}} 系统负载报警"
          description: "服务器:{{$labels.instance}},15分钟系统负载超过5.6!当前值: {{ $value }}"
          value: "{{ $value }}"