Python 抓取 Prometheus 容器数据,并实现监控报警

发布时间: 2023-06-15 11:33:15 作者: 力王7314
import json
import math
import pytz
import requests
from datetime import datetime

class Monitoring(object):
    """Poll Prometheus for per-pod CPU/memory usage and push threshold alerts.

    For each namespace, an instant query computes usage as a percentage of the
    configured resource limit. Pods whose usage exceeds the threshold are
    reported to a WeCom (企业微信) group-robot webhook as a markdown message.

    Network failures never propagate: they are printed and the affected call
    returns an empty result, so one bad namespace cannot abort a whole scan.
    """

    def __init__(self):
        """Set the namespaces to scan, endpoints, thresholds and HTTP timeout."""
        self.namespace_list = [
            "apollo", "bhpc-admin-nginx", "bluehelix", "broker",
            "cert-manager", "chainnode", "clear", "elastic-system",
            "exchange", "gateway", "kube-node-lease", "kube-public",
            "kube-system", "log", "wallet", "rc",
        ]
        self.api_url = 'https://prometheus.doex.io/api/v1/query'
        # Alert thresholds, in percent of the container's configured limit.
        self.cpu_threshold = 80
        self.mem_threshold = 80
        # NOTE(security): the webhook key is a credential committed in source.
        # Kept as-is to preserve behaviour; it should be loaded from the
        # environment or a secret store and the exposed key rotated.
        self.webhook_url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=3e74a7b2-5c2d-4f17-89c5-64689af216f7"
        # Seconds to wait on any HTTP call; without it requests blocks forever.
        self.timeout = 10

    def get_cpu(self, namespace):
        """Return pods in *namespace* whose CPU usage exceeds ``cpu_threshold``.

        Args:
            namespace: Value interpolated into the ``namespace=~"..."`` matcher.

        Returns:
            A list of ``{"container": <pod name>, "cpu_usage": <percent>}``
            dicts; empty when nothing is over the threshold or the query fails.
        """
        query = f'sum(irate(container_cpu_usage_seconds_total{{container !="",container!="POD",namespace=~"{namespace}"}}[2m])) by (container, pod) / (sum(container_spec_cpu_quota{{container !="",container!="POD",namespace=~"{namespace}"}}/100000) by (container, pod)) * 100'
        return self._query_usage(query, self.cpu_threshold, "cpu_usage")

    def get_mem(self, namespace):
        """Return pods in *namespace* whose memory usage exceeds ``mem_threshold``.

        Args:
            namespace: Value interpolated into the ``namespace=~"..."`` matcher.

        Returns:
            A list of ``{"container": <pod name>, "mem_usage": <percent>}``
            dicts; empty when nothing is over the threshold or the query fails.
        """
        query = f'sum (container_memory_working_set_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod)/ sum(container_spec_memory_limit_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod) * 100'
        return self._query_usage(query, self.mem_threshold, "mem_usage")

    def _query_usage(self, query, threshold, usage_key):
        """Run one instant query and return the samples above *threshold*."""
        try:
            response = requests.get(
                url=self.api_url, params={'query': query}, timeout=self.timeout
            )
        except requests.RequestException as exc:
            print('请求失败:', exc)
            return []
        if response.status_code != 200:
            print('请求失败:', response.status_code)
            return []
        try:
            payload = response.json()
        except ValueError as exc:  # body was not valid JSON
            print('请求失败:', exc)
            return []

        data = payload.get('data') if isinstance(payload, dict) else None
        result = data.get('result') if isinstance(data, dict) else None
        if not result:
            print('找不到指定的容器或者没有配置资源limit')
            return []
        return self._filter_over_threshold(result, threshold, usage_key)

    @staticmethod
    def _filter_over_threshold(result, threshold, usage_key):
        """Keep the samples of a query result whose value exceeds *threshold*.

        Samples that lack a ``pod`` label or a parseable value are skipped.
        Non-finite values (``+Inf`` from a zero limit, ``NaN``) are skipped
        too, because they do not represent a real usage percentage.
        """
        exceeded = []
        for sample in result:
            try:
                pod = sample['metric']['pod']
                usage = float(sample['value'][1])
            except (KeyError, IndexError, TypeError, ValueError):
                continue
            if math.isfinite(usage) and usage > threshold:
                exceeded.append({"container": pod, usage_key: usage})
        return exceeded

    def _render_alert(self, container, timestamp):
        """Build the markdown body for *container*, or ``None`` if it has no metric.

        A dict carrying ``cpu_usage`` renders the CPU template; otherwise one
        carrying ``mem_usage`` renders the memory template.
        """
        name = container.get("container")
        cpu_usage = container.get("cpu_usage")
        mem_usage = container.get("mem_usage")
        if cpu_usage is not None:
            label, resource = "当前cpu", "CPU"
            usage, threshold = cpu_usage, self.cpu_threshold
        elif mem_usage is not None:
            label, resource = "当前内存", "内存"
            usage, threshold = mem_usage, self.mem_threshold
        else:
            return None
        return (
            f'\n# io环境报警通知:\n'
            f'**容器名称:** {name}\n'
            f'**{label}:** {usage}%\n'
            f'**报警级别:** 警告\n'
            f'**报警阈值:** {threshold}%\n'
            f'**报警时间:** {timestamp}\n'
            f'**问题描述:**\n'
            f'{name} {resource} 使用率超过阈值。\n'
        )

    def send_alert(self, container):
        """Post a markdown alert for *container* to the WeCom webhook.

        Args:
            container: A dict as produced by ``get_cpu`` or ``get_mem``.

        Prints the outcome. Returns ``None`` and never raises on HTTP failure.
        """
        # Timestamps are always rendered in Beijing time, whatever the host TZ.
        current_time = datetime.now(pytz.timezone('Asia/Shanghai'))
        markdown_text = self._render_alert(
            container, current_time.strftime('%Y-%m-%d %H:%M:%S')
        )
        if not markdown_text:
            print("发送内容为空")
            return

        headers = {
            'Content-Type': 'application/json'
        }
        data = {
            'msgtype': 'markdown',
            'markdown': {
                'content': markdown_text
            }
        }
        try:
            res = requests.post(
                url=self.webhook_url, headers=headers, json=data,
                timeout=self.timeout,
            )
            errcode = res.json().get("errcode")
        except (requests.RequestException, ValueError) as exc:
            print('请求失败:', exc)
            return
        if errcode == 0:
            print("发功成功")
        else:
            print(res.text)


if __name__ == '__main__':
    # Scan every configured namespace once and push an alert for each pod
    # that is over its CPU or memory threshold.
    monitor = Monitoring()
    for ns in monitor.namespace_list:
        cpu_alerts = monitor.get_cpu(ns)
        mem_alerts = monitor.get_mem(ns)
        # CPU alerts are emitted first, then memory alerts, per namespace.
        for alert in cpu_alerts + mem_alerts:
            print(alert)
            monitor.send_alert(alert)


报警示例图