
- 1.什么是Alertmanager
Alertmanager是一款开源的告警工具包,可以和Prometheus集成。
- 2.下载Alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.28.1/alertmanager-0.28.1.linux-amd64.tar.gz
- 3.解压安装包
[root@node-exporter43 ~]# tar xf alertmanager-0.28.1.linux-amd64.tar.gz -C /usr/local/
[root@node-exporter43 ~]#
- 4.修改Alertmanager的配置文件
[root@node-exporter43 ~]# cd /usr/local/alertmanager-0.28.1.linux-amd64/
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1272606829@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1272606829@qq.com'
smtp_auth_password: 'fxjamdpfhussbagi'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: violet_dba_exporter
# 建议将continue的值设置为true,表示无论当前的条件是否匹配,都将继续向下匹配规则
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: violet_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '1954271303@qq.com'
send_resolved: true
- to: '1786026481@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '3199355854@qq.com'
send_resolved: true
- to: '914680354@qq.com'
send_resolved: true
- name: 'sre_system'
email_configs:
- to: '1786026481@qq.com'
send_resolved: true
- to: '1293039483@qq.com'
send_resolved: true
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
- 5.检查配置文件语法
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 3 receivers
- 0 templates
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
- 6.启动Alertmanager服务
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager
- 7.访问Alertmanager的WebUI
http://10.0.0.43:9093/#/status
- 8.Prometheus server集成Alertmanager实现告警功能
1. 修改配置文件
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# egrep -v "^#|^$" prometheus.yml
global:
scrape_interval: 3s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 10.0.0.43:9093
rule_files:
- "violet-linux-rules.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "violet_dba_exporter"
static_configs:
- targets: ["10.0.0.41:9100"]
- job_name: "violet_k8s_exporter"
static_configs:
- targets: ["10.0.0.42:9100"]
- job_name: "violet_bigdata_exporter"
static_configs:
- targets: ["10.0.0.43:9100"]
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
2 修改告警规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat violet-linux-rules.yml
groups:
- name: violet-linux-rules-alert
rules:
- alert: violet-dba_exporter-alert
expr: up{job="violet_dba_exporter"} == 0
for: 3s
labels:
school: violet
class: linux
apps: dba
annotations:
summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
- alert: violet-k8s_exporter-alert
expr: up{job="violet_k8s_exporter"} == 0
for: 3s
labels:
school: violet
class: linux
apps: k8s
annotations:
summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
- alert: violet-bigdata_exporter-alert
expr: up{job="violet_bigdata_exporter"} == 0
for: 5s
labels:
school: violet
class: linux
apps: bigdata
annotations:
summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
3.检查配置文件语法
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 1 rule files found
SUCCESS: prometheus.yml is valid prometheus config file syntax
Checking violet-linux-rules.yml
SUCCESS: 3 rules found
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
4.重新加载prometheus的配置
curl -X POST http://10.0.0.31:9090/-/reload
5.查看prometheus server的WebUI验证是否生效
http://10.0.0.31:9090/config
http://10.0.0.31:9090/targets?search=
http://10.0.0.31:9090/alerts?search=
6.触发告警功能
[root@node-exporter41 ~]# systemctl stop node-exporter.service
[root@node-exporter41 ~]# ss -ntl | grep 9100
[root@node-exporter41 ~]#
[root@node-exporter42 ~]# systemctl stop node-exporter.service
[root@node-exporter42 ~]#
[root@node-exporter42 ~]# ss -ntl | grep 9100
[root@node-exporter42 ~]#
[root@node-exporter43 ~]# systemctl stop node-exporter.service
[root@node-exporter43 ~]#
[root@node-exporter43 ~]# ss -ntl | grep 9100
[root@node-exporter43 ~]#
7.查看Alertmanager的WebUI及邮箱接收者
alertmanager自定义告警模板
- 1 告警模板介绍
默认的告警信息界面有些简单,可以借助告警的模板信息,对告警信息进行丰富,需要借助于Alertmanager的模板功能来实现。
告警模板的使用流程如下:
- 分析关键信息
- 定制模板内容
- Alertmanager加载模板文件
- 告警信息使用模板内容属性
模板文件使用标准Go模板语法,并暴露一些包含时间序列标签和值的变量。
- 标签引用: {{ $labels.<label_name> }}
- 指标样本值引用: {{ $value }}
为了显示效果,需要了解一些html相关技术,参考链接:
https://www.w3school.com.cn/html/index.asp
- 2 Alertmanager节点自定义告警模板参考案例
2.1 创建邮件模板文件工作目录
[root@node-exporter43 ~]# mkdir -pv /violet/softwares/alertmanager/tmpl
2.2 创建模板实例,工作中可以考虑嵌入公司的logo
[root@node-exporter43 ~]# cat /violet/softwares/alertmanager/tmpl/email.tmpl
{{ define "violet.html" }}
<h1>你好: https://www.violet.com/</h1>
<table border="1">
<tr>
<th>报警项</th>
<th>实例</th>
<th>报警阀值</th>
<th>开始时间</th>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
<img src="https://www.violet.com/static/images/header/logo.png">
{{ end }}
[root@node-exporter43 ~]#
2.3 alertmanager引用自定义模板文件
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1272606829@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1272606829@qq.com'
smtp_auth_password: 'fxjamdpfhussbagi'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: violet_dba_exporter
# 建议将continue的值设置为true,表示无论当前的条件是否匹配,都将继续向下匹配规则
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: violet_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '1954271303@qq.com'
send_resolved: true
# 添加此行,定制邮件的标题。此处的"{}"是YAML的内联映射写法,不能用单引号括住,否则服务启动不成功。
headers: { Subject: "[WARN] LINUX96报警邮件" }
# 添加此行,调用模板显示邮件正文。对于"{{}}"形式的模板引用,需要使用单引号括住。
html: '{{ template "violet.html" . }}'
- to: '1786026481@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '3199355854@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX96报警邮件" }
html: '{{ template "violet.html" . }}'
- to: '914680354@qq.com'
send_resolved: true
- name: 'sre_system'
email_configs:
- to: '1786026481@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX报警邮件" }
html: '{{ template "violet.html" . }}'
- to: '1293039483@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX报警邮件" }
html: '{{ template "violet.html" . }}'
# 加载模板
templates:
- '/violet/softwares/alertmanager/tmpl/*.tmpl'
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
2.4 alertmanager语法检查
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./amtool check-config ./alertmanager.yml
Checking './alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 3 receivers
- 1 templates
SUCCESS
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
2.5 重启Alertmanager程序
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager
2.6 查看WebUi观察配置是否生效
http://10.0.0.43:9093/#/status
2.7 再次触发告警配置
Alertmanager集成钉钉插件实现告警
参考链接:
https://github.com/timonwong/prometheus-webhook-dingtalk/
- 1.部署钉钉插件
1.1 下载钉钉插件
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
1.2 解压文件
[root@node-exporter43 ~]# tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /usr/local/
[root@node-exporter43 ~]#
[root@node-exporter43 ~]# cd /usr/local/prometheus-webhook-dingtalk-2.1.0.linux-amd64/
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18752
drwxr-xr-x 3 3434 3434 4096 Apr 21 2022 ./
drwxr-xr-x 12 root root 4096 Mar 30 17:47 ../
-rw-r--r-- 1 3434 3434 1299 Apr 21 2022 config.example.yml
drwxr-xr-x 4 3434 3434 4096 Apr 21 2022 contrib/
-rw-r--r-- 1 3434 3434 11358 Apr 21 2022 LICENSE
-rwxr-xr-x 1 3434 3434 19172733 Apr 21 2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
1.3 修改配置文件
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# cp config{.example,}.yml
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18756
drwxr-xr-x 3 3434 3434 4096 Mar 30 17:47 ./
drwxr-xr-x 12 root root 4096 Mar 30 17:47 ../
-rw-r--r-- 1 3434 3434 1299 Apr 21 2022 config.example.yml
-rw-r--r-- 1 root root 1299 Mar 30 17:47 config.yml
drwxr-xr-x 4 3434 3434 4096 Apr 21 2022 contrib/
-rw-r--r-- 1 3434 3434 11358 Apr 21 2022 LICENSE
-rwxr-xr-x 1 3434 3434 19172733 Apr 21 2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# cat config.yml # 也可以直接新建该文件
targets:
linux:
# 对应的是dingding的webhook
url: https://oapi.dingtalk.com/robot/send?access_token=8acb32ed5ac1e1c346582461443e8430dc79512c905dd6f7b7cf0c751b02d294
# 对应的是"加签"的值,复制过来即可
secret: "SEC13b0108bfcd46c306a12346ddb4524c86573b6dd7da910cb35c6d29829a8118b"
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
1.4 启动钉钉插件
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ./prometheus-webhook-dingtalk --web.listen-address="10.0.0.43:8060"
- 2.Alertmanager集成钉钉插件
2.1 修改Alertmanager的配置文件
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1272606829@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1272606829@qq.com'
smtp_auth_password: 'fxjamdpfhussbagi'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: violet_dba_exporter
# 建议将continue的值设置为true,表示无论当前的条件是否匹配,都将继续向下匹配规则
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: violet_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '1954271303@qq.com'
send_resolved: true
# 添加此行,定制邮件的标题。此处的"{}"是YAML的内联映射写法,不能用单引号括住,否则服务启动不成功。
headers: { Subject: "[WARN] LINUX报警邮件" }
# 添加此行,调用模板显示邮件正文。对于"{{}}"形式的模板引用,需要使用单引号括住。
html: '{{ template "violet.html" . }}'
- to: '1786026481@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '3199355854@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX96报警邮件" }
html: '{{ template "violet.html" . }}'
- to: '914680354@qq.com'
send_resolved: true
- name: 'sre_system'
webhook_configs:
# 指向的是钉钉插件(prometheus-webhook-dingtalk)的地址,URL路径格式为/dingtalk/<target名称>/send
- url: 'http://10.0.0.43:8060/dingtalk/linux/send'
http_config: {}
max_alerts: 0
send_resolved: true
#email_configs:
#- to: '1786026481@qq.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX报警邮件" }
# html: '{{ template "violet.html" . }}'
#- to: '1293039483@qq.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX报警邮件" }
# html: '{{ template "violet.html" . }}'
# 加载模板
templates:
- '/violet/softwares/alertmanager/tmpl/*.tmpl'
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
2.2 启动Alertmanager
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager
2.3.测试告警验证
Alertmanager的告警抑制(inhibit)
- 1.什么是告警抑制
说白了,就是抑制告警,和静默不同的是,抑制的应用场景一般用于抑制符合条件的告警。
举个例子:
一个数据中心有800台服务器,每台服务器有50个监控项,这就意味着一共有4w个监控项。
如果数据中心断电,理论上来说就会有4w条告警发送到你的手机,你是处理不过来的,所以我们只需要将数据中心断电的告警发出来即可。
- 2.Prometheus编写规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat violet-linux-rules.yml
groups:
- name: violet-linux-rules-alert
rules:
- alert: violet-dba_exporter-alert
expr: up{job="violet_dba_exporter"} == 0
for: 3s
labels:
school: violet
class: linux
apps: dba
severity: critical
dc: beijing
annotations:
summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
# 这里的annotations部分增加了一个value属性,"{{ $value }}"会取告警表达式当前的样本值
value: "{{ $value }}"
- alert: violet-k8s_exporter-alert
expr: up{job="violet_k8s_exporter"} == 0
for: 3s
labels:
school: violet
class: linux
apps: k8s
severity: warning
dc: beijing
annotations:
summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
value: "{{ $value }}"
- alert: violet-bigdata_exporter-alert
expr: up{job="violet_bigdata_exporter"} == 0
for: 5s
labels:
school: violet
class: linux
apps: bigdata
severity: warning
dc: shenzhen
annotations:
summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
value: "{{ $value }}"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
- 3.Alertmanager配置告警抑制规则
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1272606829@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1272606829@qq.com'
smtp_auth_password: 'fxjamdpfhussbagi'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: violet_dba_exporter
# 建议将continue的值设置为true,表示无论当前的条件是否匹配,都将继续向下匹配规则
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: violet_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '1954271303@qq.com'
send_resolved: true
# 添加此行,定制邮件的标题。此处的"{}"是YAML的内联映射写法,不能用单引号括住,否则服务启动不成功。
headers: { Subject: "[WARN] LINUX报警邮件" }
# 添加此行,调用模板显示邮件正文。对于"{{}}"形式的模板引用,需要使用单引号括住。
html: '{{ template "violet.html" . }}'
- to: '1786026481@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '3199355854@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX报警邮件" }
html: '{{ template "violet.html" . }}'
- to: '914680354@qq.com'
send_resolved: true
- name: 'sre_system'
webhook_configs:
# 指向的是钉钉插件(prometheus-webhook-dingtalk)的地址,URL路径格式为/dingtalk/<target名称>/send
- url: 'http://10.0.0.43:8060/dingtalk/linux/send'
http_config: {}
max_alerts: 0
send_resolved: true
#email_configs:
#- to: '1786026481@qq.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX报警邮件" }
# html: '{{ template "violet.html" . }}'
#- to: '1293039483@qq.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX报警邮件" }
# html: '{{ template "violet.html" . }}'
# 加载模板
templates:
- '/violet/softwares/alertmanager/tmpl/*.tmpl'
## 配置告警抑制规则
inhibit_rules:
# 如果"dc"的值相同的前提条件下。
# 则当触发了"severity: critical"告警,就会抑制"severity: warning"的告警信息。
- source_match:
severity: critical
target_match:
severity: warning
equal:
- dc
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
- 4.启动Alertmanager
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ./prometheus-webhook-dingtalk --web.listen-address="10.0.0.43:8060"
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager
5.停止服务在钉钉验证