k8s 监控之Prometheus部署安装

发布时间 2023-10-18 03:42:57作者: 烟雨楼台,行云流水

部署监控pod   https://github.com/ruidongchenxi/k8s-ack/blob/main/node-export.yaml

[root@k8s-master cka]# cat node-export.yaml 
# node-exporter DaemonSet: runs one prom/node-exporter pod on every node
# (including tainted master/control-plane nodes) so that Prometheus can
# scrape host-level metrics on port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitor-sa
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      # Run in the host's PID, IPC and network namespaces so the exporter
      # reports on the node itself and listens on the node's own IP.
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
      - name: node-exporter
        image: prom/node-exporter:v0.16.0
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9100
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        args:
        # Read proc/sys from the host mounts below instead of the
        # container's own /proc and /sys.
        - --path.procfs
        - /host/proc
        - --path.sysfs
        - /host/sys
        - --collector.filesystem.ignored-mount-points
        # The value must be the bare regex. Wrapping it in an extra pair of
        # double quotes makes the quotes part of the pattern, so it never
        # matches any mount point and nothing is ignored.
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      tolerations:
      # Legacy taint key used by older clusters.
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      # Taint key used by newer clusters; tolerating a taint that is not
      # present is harmless.
      - key: "node-role.kubernetes.io/control-plane"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /

[root@k8s-master cka]# kubectl create ns monitor-sa
namespace/monitor-sa created
[root@k8s-master cka]# kubectl apply -f node-export.yaml 
daemonset.apps/node-exporter created

  

  查看cpu监控指标

[root@k8s-master cka]# curl http://192.168.10.50:9100/metrics | grep cpu
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 63228  100 63228    0     0  5437k      0 --:--:-- --:--:-- --:--:-- 5613k
# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started.
# TYPE go_memstats_gc_cpu_fraction gauge
go_memstats_gc_cpu_fraction 4.159327529117454e-06
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="0",mode="user"} 0
node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="1",mode="user"} 0
node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="2",mode="user"} 0
node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="3",mode="user"} 0
# HELP node_cpu_seconds_total Seconds the cpus spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 82923.84
node_cpu_seconds_total{cpu="0",mode="iowait"} 46.33
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 0.04
node_cpu_seconds_total{cpu="0",mode="softirq"} 7.79
node_cpu_seconds_total{cpu="0",mode="steal"} 0
node_cpu_seconds_total{cpu="0",mode="system"} 51.35
node_cpu_seconds_total{cpu="0",mode="user"} 58.6
node_cpu_seconds_total{cpu="1",mode="idle"} 82673.51
node_cpu_seconds_total{cpu="1",mode="iowait"} 1.07
node_cpu_seconds_total{cpu="1",mode="irq"} 0
node_cpu_seconds_total{cpu="1",mode="nice"} 0.04
node_cpu_seconds_total{cpu="1",mode="softirq"} 7.89
node_cpu_seconds_total{cpu="1",mode="steal"} 0
node_cpu_seconds_total{cpu="1",mode="system"} 70.05
node_cpu_seconds_total{cpu="1",mode="user"} 80.02
node_cpu_seconds_total{cpu="2",mode="idle"} 82655.69
node_cpu_seconds_total{cpu="2",mode="iowait"} 1.05
node_cpu_seconds_total{cpu="2",mode="irq"} 0
node_cpu_seconds_total{cpu="2",mode="nice"} 0.11
node_cpu_seconds_total{cpu="2",mode="softirq"} 7.9
node_cpu_seconds_total{cpu="2",mode="steal"} 0
node_cpu_seconds_total{cpu="2",mode="system"} 72.63
node_cpu_seconds_total{cpu="2",mode="user"} 87.56
node_cpu_seconds_total{cpu="3",mode="idle"} 82495.3
node_cpu_seconds_total{cpu="3",mode="iowait"} 0.94
node_cpu_seconds_total{cpu="3",mode="irq"} 0
node_cpu_seconds_total{cpu="3",mode="nice"} 0.04
node_cpu_seconds_total{cpu="3",mode="softirq"} 11.34
node_cpu_seconds_total{cpu="3",mode="steal"} 0
node_cpu_seconds_total{cpu="3",mode="system"} 77.3
node_cpu_seconds_total{cpu="3",mode="user"} 80.99
# HELP node_memory_Percpu_bytes Memory information field Percpu_bytes.
# TYPE node_memory_Percpu_bytes gauge
node_memory_Percpu_bytes 3.407872e+07
node_scrape_collector_duration_seconds{collector="cpu"} 0.000650834
node_scrape_collector_success{collector="cpu"} 1
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.22

  创建运行pod 的sa

[root@k8s-master cka]# kubectl create serviceaccount monitor -n monitor-sa
serviceaccount/monitor created
[root@k8s-master cka]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin  --serviceaccount=monitor-sa:monitor
clusterrolebinding.rbac.authorization.k8s.io/monitor-clusterrolebinding created
[root@k8s-master cka]# kubectl create clusterrolebinding monitor-clusterrolebinding-1 --clusterrole=cluster-admin --user=system:serviceaccount:monitor-sa:monitor -n monitor-sa
clusterrolebinding.rbac.authorization.k8s.io/monitor-clusterrolebinding-1 created
[root@k8s-master cka]# kubectl get clusterrolebinding | grep clusterrolebinding-1
monitor-clusterrolebinding-1                           ClusterRole/cluster-admin                                                          17s

  创建数据目录

[root@k8s-node1 ~]# mkdir /data
您在 /var/spool/mail/root 中有新邮件
[root@k8s-node1 ~]# chmod 777 /data
[root@k8s-node2 ~]# mkdir /data && chmod 777 /data
您在 /var/spool/mail/root 中有新邮件

 创建prometheus配置文件(ConfigMap,保存为 prometheus-cfg.yaml)

 

---
# ConfigMap holding the Prometheus server configuration (key: prometheus.yml).
# It defines four scrape jobs, all using Kubernetes service discovery:
#   kubernetes-node              - node targets, with the discovered :10250
#                                  address rewritten to :9100
#   kubernetes-node-cadvisor     - per-node cAdvisor metrics fetched through the
#                                  API server proxy path
#   kubernetes-apiserver         - keeps only the default/kubernetes/https endpoint
#   kubernetes-service-endpoints - keeps endpoints whose Service carries the
#                                  prometheus.io/scrape=true annotation; scheme,
#                                  path and port can be overridden by annotations
# NOTE: everything below the "|" line is a literal block scalar. '#' text inside
# it is part of the ConfigMap data (read as comments by Prometheus) and is kept
# exactly as written.
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: monitor-sa
data:
  prometheus.yml: | # the Prometheus configuration file content
    global: # 全局
      scrape_interval: 15s #采集时间间隔
      scrape_timeout: 10s # 采集数据超时时间
      evaluation_interval: 1m # 告警检测时间间隔
    scrape_configs:
    - job_name: 'kubernetes-node' # 采集数据原
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-node-cadvisor'
      kubernetes_sd_configs:
      - role:  node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiserver'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name 
[root@k8s-master cka]# kubectl apply -f prometheus-cfg.yaml
configmap/prometheus-config created

 

  

通过deployment部署prometheus

[root@k8s-master cka]# cat prometheus-deploy.yaml 
---
# Single-replica Prometheus server in the monitor-sa namespace.
# Configuration comes from the prometheus-config ConfigMap; time-series data
# is written to the hostPath directory /data on the pinned node.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: monitor-sa
  name: prometheus-server
  labels: {app: prometheus}
spec:
  replicas: 1
  selector:
    # Equivalent set-based form, kept for reference:
    #   matchExpressions:
    #   - {key: app, operator: In, values: [prometheus]}
    #   - {key: component, operator: In, values: [server]}
    matchLabels: {app: prometheus, component: server}
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "false"
      labels: {app: prometheus, component: server}
    spec:
      # Pin the pod to one node: the hostPath data directory only exists
      # where it was created by hand.
      nodeName: k8s-node1
      serviceAccountName: monitor
      containers:
      - name: prometheus
        image: prom/prometheus:v2.2.1
        imagePullPolicy: IfNotPresent
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        # Enables the HTTP lifecycle endpoints such as POST /-/reload.
        - --web.enable-lifecycle
        ports:
        - {containerPort: 9090, protocol: TCP}
        volumeMounts:
        - {name: prometheus-config, mountPath: /etc/prometheus}
        - {name: prometheus-storage-volume, mountPath: /prometheus/}
      volumes:
      - name: prometheus-config
        configMap: {name: prometheus-config}
      - name: prometheus-storage-volume
        # type: Directory requires /data to already exist on the node.
        hostPath: {path: /data, type: Directory}
[root@k8s-master cka]# kubectl apply -f prometheus-deploy.yaml 
deployment.apps/prometheus-server created

  部署service

[root@k8s-master cka]# cat prometheus-svc.yaml 
# NodePort Service that exposes the Prometheus server pods (port 9090)
# on every node of the cluster.
apiVersion: v1
kind: Service
metadata:
  namespace: monitor-sa
  name: prometheus
  labels: {app: prometheus}
spec:
  type: NodePort
  # Must match the pod labels set by the prometheus-server Deployment.
  selector: {app: prometheus, component: server}
  ports:
  # No explicit nodePort is given, so the cluster assigns one.
  - protocol: TCP
    port: 9090
    targetPort: 9090
[root@k8s-master cka]# kubectl apply -f prometheus-svc.yaml 
service/prometheus created

  查看pod 与service

[root@k8s-master cka]# kubectl get svc -n monitor-sa
NAME         TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
prometheus   NodePort   10.110.21.221   <none>        9090:30194/TCP   94s
[root@k8s-master cka]# kubectl get pod -n monitor-sa
NAME                                 READY   STATUS    RESTARTS   AGE
node-exporter-8q9vb                  1/1     Running   0          86m
node-exporter-fv8n8                  1/1     Running   0          86m
node-exporter-xjzdc                  1/1     Running   0          86m
prometheus-server-5b5bb44bb5-7xcn7   1/1     Running   0          7m54s

  浏览器访问http://192.168.10.50:30194/targets

 

给 Service 添加如下注解后,Prometheus 的 kubernetes-service-endpoints 任务会自动发现并采集该 Service 的指标

# Excerpt of the kube-dns Service metadata (spec omitted), showing the
# annotations to add so the service is scraped on port 9153.
apiVersion: v1
kind: Service
metadata:
  annotations: # add the two annotations below
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  creationTimestamp: "2023-10-16T19:47:18Z"
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    kubernetes.io/name: CoreDNS
  name: kube-dns
  namespace: kube-system
  resourceVersion: "236"
  uid: 7162ceef-a1a2-4da8-a4da-d387e619170d

   

 热加载

kube-dns   ClusterIP   10.96.0.10   <none>        53/UDP,53/TCP,9153/TCP   23h
[root@k8s-master cka]# kubectl edit svc -n kube-system kube-dns

# Please edit the object below. Lines beginning with a '#' will be ignored,
# and an empty file will abort the edit. If an error occurs while saving this file will be
# reopened with the relevant failures.
#
apiVersion: v1
kind: Service
metadata:
#  annotations:
#    prometheus.io/port: "9153"
#    prometheus.io/scrape: "true"
  creationTimestamp: "2023-10-16T19:47:18Z"
  labels:
service/kube-dns edited

  加载配置

[root@k8s-master cka]# kubectl get pod -n monitor-sa  -owide
NAME                                 READY   STATUS    RESTARTS   AGE   IP              NODE         NOMINATED NODE   READINESS GATES
node-exporter-8q9vb                  1/1     Running   0          22h   192.168.10.51   k8s-node1    <none>           <none>
node-exporter-fv8n8                  1/1     Running   0          22h   192.168.10.50   k8s-master   <none>           <none>
node-exporter-xjzdc                  1/1     Running   0          22h   192.168.10.52   k8s-node2    <none>           <none>
prometheus-server-5b5bb44bb5-7xcn7   1/1     Running   0          21h   10.244.36.65    k8s-node1    <none>           <none>
[root@k8s-master cka]# curl -X POST http://10.244.36.65:9090/-/reload
[root@k8s-master cka]# kubectl edit svc -n kube-system kube-dns
service/kube-dns edited
[root@k8s-master cka]# curl -X POST http://10.244.36.65:9090/-/reload