前言
告警发送流程
Prometheus → Alertmanager → prometheus-webhook-dingtalk (带签名) → 钉钉
准备:钉钉群 → 创建自定义机器人 → 选择 Webhook 方式 → 安全设置选择"加签"(保存好密钥)→ 完成 → 保存机器人调用的 Webhook 地址
Prometheus配置告警规则
# Create the host-side config directory together with the rules/ sub-directory
# that prometheus.yml's rule_files ("rules/*.yml") expects. -p makes the step
# safe to re-run when the directories already exist.
mkdir -p /usr/local/prometheus/rules
# Extract the image's default prometheus.yml as a starting point. A created
# (never started) container is enough for `docker cp`, so no process is
# launched and no host ports are published.
docker create --name test prom/prometheus
docker cp test:/etc/prometheus/prometheus.yml /usr/local/prometheus/
# Remove the throwaway container again.
docker rm test
添加告警规则
cat rules/node.yml
# Host-level alerting rules. This file is picked up through the
# rule_files glob "rules/*.yml" configured in prometheus.yml.
# Every rule attaches a `severity` label (严重警告 or 一般警告) that downstream
# routing / inhibition can match on.
groups:
  - name: 主机状态-监控告警
    rules:
      # Target of job "node" has reported up == 0 for 1 minute.
      - alert: 主机状态
        expr: up{job="node"} == 0
        for: 1m
        labels:
          severity: 严重警告
        annotations:
          summary: "{{$labels.instance}} 服务器停止工作"
          description: "{{$labels.instance}} 已经停止1分钟以上"
      # Non-idle CPU percentage, averaged per instance, above 80%.
      - alert: CPU使用情况
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 1m
        labels:
          severity: 一般警告
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于80%(目前使用:{{ printf \"%.2f\" $value }}%)"
      # Used memory (100 - available/total) above 85%.
      - alert: 内存使用情况
        expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) > 85
        for: 1m
        labels:
          severity: 严重警告
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于85%(目前使用:{{ printf \"%.2f\" $value }}%)"
      # Used swap as a percentage of total swap above 80%.
      - alert: Swap分区使用情况
        expr: (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: 严重警告
        annotations:
          summary: "{{$labels.instance}} Swap内存使用率过高!"
          description: "{{$labels.instance}} Swap内存使用大于80%(目前使用:{{ printf \"%.2f\" $value }}%)"
      # Per-device I/O time rate (seconds busy per second, as a percentage) above 90%.
      - alert: IO性能
        expr: max by (instance, device) (rate(node_disk_io_time_seconds_total[2m])) * 100 > 90
        for: 1m
        labels:
          severity: 一般警告
        annotations:
          summary: "设备 {{$labels.device}} 磁盘IO使用率过高!"
          description: "{{$labels.instance}} 的 {{$labels.device}} 当前磁盘IO大于90%(目前使用:{{ printf \"%.2f\" $value }}%)"
      # Receive rate converted from bytes/s to Mbit/s (* 8 / 1000000) above 200 for 2 minutes.
      - alert: 网络下载
        expr: max by (instance) (rate(node_network_receive_bytes_total[2m]) * 8 / 1000000) > 200
        for: 2m
        labels:
          severity: 一般警告
        annotations:
          summary: "{{$labels.instance}} 下载网络带宽过高!"
          description: "{{$labels.instance}} 下载网络带宽持续2分钟高于200Mbps,当前使用:{{ printf \"%.2f\" $value }}Mbps"
      # Transmit rate converted from bytes/s to Mbit/s (* 8 / 1000000) above 200 for 2 minutes.
      - alert: 网络上传
        expr: max by (instance) (rate(node_network_transmit_bytes_total[2m]) * 8 / 1000000) > 200
        for: 2m
        labels:
          severity: 一般警告
        annotations:
          summary: "{{$labels.instance}} 上传网络带宽过高!"
          description: "{{$labels.instance}} 上传网络带宽持续2分钟高于200Mbps,当前使用:{{ printf \"%.2f\" $value }}Mbps"
      # More than 5000 established TCP connections.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 5000
        for: 1m
        labels:
          severity: 一般警告
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED连接数为{{ printf \"%.0f\" $value }},超过5000"
      # ext*/xfs filesystem usage (100 - free/size) above 90%.
      - alert: 磁盘容量
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext.?|xfs"} / node_filesystem_size_bytes{fstype=~"ext.?|xfs"} * 100) > 90
        for: 1m
        labels:
          severity: 严重警告
        annotations:
          summary: "挂载点 {{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.instance}} 的 {{$labels.mountpoint}} 分区使用率为 {{ printf \"%.2f\" $value }}%,已超过90%"
配置 Alertmanager 地址、告警规则文件路径,并采集 node 信息
vim /usr/local/prometheus/prometheus.yml
[root@localhost _data]# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.60.65.66:7003 # host:port where Alertmanager is deployed

# Alerting rule files.
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# A relative path is resolved against the directory that contains this config file.
rule_files:
  - "rules/*.yml"

# Scrape configuration: a single job that collects host metrics.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  # It must stay "node": the host-down rule selects on up{job="node"}.
  - job_name: "node"
    static_configs:
      # Replace each "xx" placeholder with the address of a host to scrape
      # (presumably node_exporter on port 9100 - confirm for your hosts).
      - targets: ["10.60.65.xx:9100","10.60.65.xx:9100"]
Alertmanager配置
部署 Alertmanager
# Host directory that is bind-mounted to /etc/alertmanager/ in the container.
# -p keeps the step re-runnable.
mkdir -p /usr/local/alertmanager
# Fetch the image's default alertmanager.yml for reference. A created (never
# started) container is sufficient for `docker cp`, so host port 7003 is not
# bound by a temporary instance.
docker create --name test prom/alertmanager
docker cp test:/etc/alertmanager/alertmanager.yml /usr/local/alertmanager/
docker rm test
# Write the real configuration INTO the mounted directory. Writing to a bare
# "alertmanager.yml" would land in the current working directory and the
# container would keep running with the default config copied above.
# The delimiter is quoted so the shell performs no expansion on the YAML.
#
# inhibit_rules: the severity values must be the label values actually set by
# the alerting rules (严重警告 / 一般警告), otherwise the rule never matches.
# NOTE(review): 'alertname' is still listed in `equal`, and in the rules shown
# in this article each alert name carries a single severity, so this rule only
# takes effect once one alert name is emitted at both severities - confirm the
# intended `equal` label set.
cat > /usr/local/alertmanager/alertmanager.yml <<'EOF'
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 1m
  repeat_interval: 5h
  receiver: 'dingtalk-receiver'
receivers:
  - name: 'dingtalk-receiver'
    webhook_configs:
      - url: 'http://10.60.65.66:8060/dingtalk/default/send' # host:port where prometheus-webhook-dingtalk is deployed
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: '严重警告'
    target_match:
      severity: '一般警告'
    equal: ['alertname', 'dev', 'instance']
EOF
# Start Alertmanager: container port 9093 is published on host port 7003 and
# the directory prepared above supplies /etc/alertmanager/alertmanager.yml.
docker run -d --name alertmanager -p 7003:9093 -v /usr/local/alertmanager/:/etc/alertmanager/ prom/alertmanager
Prometheus-webhook-dingtalk配置
# Directory holding the prometheus-webhook-dingtalk configuration.
# -p keeps the step re-runnable.
mkdir -p /usr/local/dingtalk
# Write the webhook configuration.
#  - The delimiter is quoted ('EOF') so the shell copies the Go-template text
#    verbatim and never tries to expand anything inside it.
#  - Nesting matters: url/secret/message belong to the "default" target, and
#    the lines of the message text must be indented under `text: |`.
#  - Replace the access_token value and the secret placeholder with the values
#    saved when the DingTalk robot was created.
#  - `.Add 28800e9` shifts a timestamp by 28800 s (8 hours) before formatting
#    (presumably UTC -> UTC+8; adjust for other time zones).
cat > /usr/local/dingtalk/config.yml <<'EOF'
targets:
  default:
    url: https://oapi.dingtalk.com/robot/send?access_token=钉钉机器人key
    secret: 加签的秘钥
    message:
      title: 'Prometheus 告警通知'
      text: |
        {{ range .Alerts }}
        > **[Prometheus 告警通知]**
        >
        > **🔥 告警级别**: {{ .Labels.severity | toUpper }}
        > **🖥️ 实例地址**: {{ .Labels.instance }}
        > **📌 告警名称**: {{ .Labels.alertname }}
        > **📝 告警摘要**: {{ .Annotations.summary }}
        > **📄 详细描述**: {{ .Annotations.description }}
        > **⏰ 触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
        > **📊 当前状态**: {{ if eq .Status "resolved" }}✅ 已恢复(恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}){{ else }}🚨 正在告警{{ end }}
        >
        > ---
        {{ end }}
EOF
# Run the webhook bridge on port 8060 with the config file mounted at /config.yml.
docker run -d --name dingtalk-webhook \
  -p 8060:8060 \
  -v /usr/local/dingtalk/config.yml:/config.yml \
  timonwong/prometheus-webhook-dingtalk:latest \
  --config.file=/config.yml
在部署 prometheus-webhook-dingtalk 的机器上执行以下命令,测试消息是否发送成功
# Smoke test: POST a hand-written Alertmanager-style webhook payload (one
# firing alert named TestAlert) to the local "default" dingtalk target.
curl --request POST \
  --header "Content-Type: application/json" \
  --data '{
"version": "4",
"groupKey": "{}",
"status": "firing",
"receiver": "dingtalk",
"groupLabels": {},
"commonLabels": {
"alertname": "TestAlert",
"instance": "localhost:9090",
"severity": "critical"
},
"commonAnnotations": {
"summary": "测试告警",
"description": "这是一个测试"
},
"externalURL": "http://localhost:9093",
"alerts": [
{
"status": "firing",
"labels": {
"alertname": "TestAlert",
"instance": "localhost:9090",
"severity": "critical"
},
"annotations": {
"summary": "测试告警",
"description": "这是一个测试"
},
"startsAt": "2025-04-05T08:00:00Z",
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": ""
}
]
}' \
  http://localhost:8060/dingtalk/default/send

设备异常告警信息

声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。

评论(0)