Installing Prometheus on Kubernetes with Persistent Data

 
I. Prerequisites
1. Create a prometheus namespace in the k8s cluster so the monitoring stack is isolated and cannot interfere with other services.
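For example (a minimal sketch, assuming kubectl is already configured against the cluster):
kubectl create namespace prometheus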
 
 
II. Setting up the persistence backend
1. Install NFS. First check whether NFS and rpcbind are already installed:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep rpcbind
 
Install NFS:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# yum -y install nfs-utils rpcbind
 
Create the NFS data directory (directories need the execute bit to be traversed, so use 777 rather than 666):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /nfs/kubernetes/prometheus
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# chmod 777 /nfs/kubernetes/prometheus
 
Configure the export (exporting the whole /nfs/kubernetes tree also covers the prometheus subdirectory):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# echo "/nfs/kubernetes *(rw,no_root_squash,sync)" >>/etc/exports
 
[root@iZwz95iaf9ikzcszlcw8qpZ mnt]# cat /etc/exports
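The file should now contain the line added above:
/nfs/kubernetes *(rw,no_root_squash,sync)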
 
Reload the export table so the change takes effect:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# exportfs -r
 
Start and enable the NFS services:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable nfs
 
Check the RPC service registration:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpcinfo -p localhost
 
 
Verify the export and test-mount it:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# showmount -e <NFS-server-IP>

[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mount -t nfs <NFS-server-IP>:/nfs/kubernetes/ /nfs/kubernetes/prometheus/
 
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# df -h
 
 
 
 
III. Deploying Prometheus on k8s
Create a directory to hold the YAML manifests:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /opt/prometheus/prometheus
 
1. Create the Prometheus ConfigMap
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s

    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["localhost:9093"]

    rule_files:
    - /etc/prometheus/*.rules

    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
      - targets: ['localhost:9090']

    - job_name: 'kube-state-metrics'
      static_configs:
      - targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']

    - job_name: 'kubernetes-node'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9003'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

  # Pod alerting rules
  pod.rules: |
    groups:
    - name: pod.rules
      rules:
      - alert: K8sClusterNodeNotready
        expr: |
          kube_node_status_condition{condition="Ready",status!="true"} == 1
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Node: {{ $labels.node }} status: NotReady"

      - alert: PodCPUUsage
        expr: |
          sum(irate(container_cpu_usage_seconds_total{image!="",container!="POD",container!=""}[1m])) by (pod,namespace) / (sum(container_spec_cpu_quota{image!="",container!="POD",container!=""}/100000) by (pod,namespace)) * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} CPU usage above 90% (current value: {{ $value }})"

      - alert: PodMemoryUsage
        expr: |
          sum(container_memory_rss{container!="POD",container!="alertmanager",image!="",pod!=""}) by (pod,namespace) / sum(container_spec_memory_limit_bytes{container!="",container!="POD"}) by (pod,namespace) * 100 != +inf > 90
        for: 2m
        labels:
          severity: error
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} memory usage above 90% (current value: {{ $value }})"

      - alert: PodRestart
        expr: |
          sum(increase(kube_pod_container_status_restarts_total{}[1m])) by (namespace,pod) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} restarted (current value: {{ $value }})"

      - alert: PodFailed
        expr: |
          sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in Failed state (current value: {{ $value }})"

      - alert: PodPending
        expr: |
          sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in Pending state (current value: {{ $value }})"

      - alert: PodErrImagePull
        expr: |
          sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in ErrImagePull state (current value: {{ $value }})"

      - alert: PodImagePullBackOff
        expr: |
          sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in ImagePullBackOff state (current value: {{ $value }})"

      - alert: PodCrashLoopBackOff
        expr: |
          sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in CrashLoopBackOff state (current value: {{ $value }})"

      - alert: PodInvalidImageName
        expr: |
          sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in InvalidImageName state (current value: {{ $value }})"

      - alert: PodCreateContainerConfigError
        expr: |
          sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} in CreateContainerConfigError state (current value: {{ $value }})"
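Optionally, the embedded configuration can be validated before applying: copy the prometheus.yml and pod.rules blocks into standalone files (the file names here are illustrative) and check them with promtool, which ships with Prometheus:
promtool check config prometheus.yml
promtool check rules pod.rules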
 
 
 
Apply the YAML:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.configmap.yaml
configmap/prometheus-config created
 
Check the status:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get configmaps -n prometheus |grep prometheus
 
 
 
2. Create the Prometheus Deployment
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.deploy.yaml
 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - image: prom/prometheus:v2.24.1
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention.time=40d"  # the bare --storage.tsdb.retention spelling is deprecated
        - "--web.enable-admin-api"   # enables the admin HTTP API, including features such as deleting time series
        - "--web.enable-lifecycle"   # enables hot reload: a POST to localhost:9090/-/reload takes effect immediately
        ports:
        - containerPort: 9090
          protocol: TCP
          name: http
        volumeMounts:
        - mountPath: "/prometheus"
          subPath: prometheus
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        resources:
          requests:
            cpu: 4000m
            memory: 8192Mi
          limits:
            cpu: 8000m
            memory: 16384Mi

      - name: alertmanager
        image: prom/alertmanager:v0.22.0
        imagePullPolicy: IfNotPresent
        args:
        - "--config.file=/etc/alertmanager/config.yml"
        - "--storage.path=/alertmanager/data"
        ports:
        - containerPort: 9093
          name: alertmanager   # port names must be unique within the pod, so "http" cannot be reused here
        volumeMounts:
        - mountPath: "/etc/alertmanager"
          name: alertcfg
        resources:
          requests:
            cpu: 2000m
            memory: 4096Mi
          limits:
            cpu: 4000m
            memory: 8192Mi

      securityContext:
        runAsUser: 0
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus
      - configMap:
          name: prometheus-config
        name: config-volume
      - name: alertcfg
        configMap:
          name: alertmanager
 
Note: the NodePort Service for Prometheus and Alertmanager is created separately in step 5 below, so it is not repeated in this manifest (defining the same Service in both files would make the second kubectl create fail with AlreadyExists). The alertmanager ConfigMap referenced by the alertcfg volume, which must provide /etc/alertmanager/config.yml, is assumed to exist already; creating it is not covered in this section.
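Because --web.enable-lifecycle is set, configuration changes can later be hot-reloaded without restarting the pod; a sketch, assuming kubectl access to the cluster:
kubectl -n prometheus port-forward deploy/prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload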
 
 
 
 
3. Create the PersistentVolume and PersistentVolumeClaim
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-volume.yaml
 
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus
spec:
  capacity:
    storage: 100Gi
  accessModes:
  - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.11.33.57               # NFS server address
    path: /nfs/kubernetes/prometheus  # NFS export directory

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus
  namespace: prometheus
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
 
 
 
Apply the YAML:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-volume.yaml
persistentvolume/prometheus created
persistentvolumeclaim/prometheus created
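Verify that the claim has bound to the volume (both should report STATUS Bound):
kubectl get pv prometheus
kubectl get pvc prometheus -n prometheus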
 
 
4. Create the RBAC objects (ServiceAccount, ClusterRole, ClusterRoleBinding)
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prometheus
 
 
 
Apply the YAML:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
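An optional sanity check that the binding took effect, using kubectl's built-in authorization check (expected answer: yes):
kubectl auth can-i list nodes --as=system:serviceaccount:prometheus:prometheus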
 
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.deploy.yaml
 
Check pod status:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus |grep prometheus
 
 
 
5. Create the Service
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-svc.yaml
 
apiVersion: v1
kind: Service
metadata:
  namespace: prometheus
  name: prometheus
  labels:
    app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus
  ports:
  - name: http
    port: 9090
  - name: alertmanager
    port: 9093
    targetPort: 9093
 
 
Apply the YAML (the Service can also be created manually in the k8s console):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-svc.yaml
service/prometheus created
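Look up the auto-assigned NodePorts so the Prometheus (9090) and Alertmanager (9093) web UIs can be reached from outside the cluster:
kubectl get svc prometheus -n prometheus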
 
 
 
 
IV. Monitoring Kubernetes cluster nodes and applications with Prometheus

A DaemonSet runs one node-exporter Pod on every node; when nodes are added to or removed from the cluster, the DaemonSet scales automatically.
 
1. Create the prometheus-node-exporter YAML
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prometheus
  labels:
    name: prometheus-node-exporter
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
      name: prometheus-node-exporter
  template:
    metadata:
      labels:
        name: prometheus-node-exporter
        app: node-exporter
    spec:
      # affinity:
      #   nodeAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:
      #       nodeSelectorTerms:
      #       - matchExpressions:
      #         - key: type
      #           operator: NotIn
      #           values:
      #           - virtual-kubelet
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
      - name: prometheus-node-exporter
        image: prom/node-exporter:v1.1.0
        ports:
        - containerPort: 9003
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        args:
        - --web.listen-address
        - ":9003"
        - --path.procfs
        - /host/proc
        - --path.sysfs
        - /host/sys
        - --collector.filesystem.ignored-mount-points
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      # If your nodes are tainted, add matching tolerations such as the following:
      - key: "dedicated"        # taint key
        operator: "Exists"
        effect: "NoExecute"     # taint effect
      - key: "eci"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
 
 
 
Deploy the node-exporter DaemonSet:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-node-exporter.yaml
daemonset.apps/prometheus-node-exporter created
 
Check the node-exporter pods (one per node):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus -o wide|grep node
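Because the DaemonSet uses hostNetwork, every node should now serve host metrics directly on port 9003; a quick spot check against any node (the IP is a placeholder):
curl -s http://<node-ip>:9003/metrics | head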
 
 
Because the metrics to collect are host-level, while node-exporter itself runs in a container, the Pod spec needs host-level access: hostPID, hostIPC, and hostNetwork, plus a privileged securityContext, together with the /proc, /sys, and root filesystem hostPath mounts shown above.
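Finally, you can confirm that Prometheus has discovered the new node targets through its HTTP API; a sketch, assuming the NodePort (or a port-forward) from step 5:
curl -s http://<node-ip>:<nodeport>/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c
All targets should report "health":"up" once the first scrape completes.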