Merge branch 'main' of github.com:didi/nightingale

master
UlricQin 3 years ago
commit 40e7ede5e3

@ -2,7 +2,7 @@
# Build timestamp in UTC, formatted YYYYmmddHHMMSS.
# %H (24-hour clock) keeps the stamp unambiguous; a 12-hour field without an
# AM/PM marker would give morning and afternoon builds the same value.
NOW = $(shell date -u '+%Y%m%d%H%M%S')
RELEASE_VERSION = 5.0.0-ga-05
RELEASE_VERSION = 5.2.0
APP = n9e
SERVER_BIN = $(APP)

@ -1,25 +1,28 @@
## 简介
## Introduction
Nightingale, Prometheus enterprise edition
💡 A Distributed and High-Performance Monitoring System. Prometheus enterprise edition.
## Architecture
## 文档
![n9e-architecture](doc/img/arch.png)
- 国外:[https://n9e.github.io/](https://n9e.github.io/)
- 国内:[https://n9e.gitee.io/](https://n9e.gitee.io/)
- [v4老文档](https://gitee.com/n9e/book/tree/master/content/v4/docs)
## Docs
- github: [https://n9e.github.io/](https://n9e.github.io/)
- gitee: [https://n9e.gitee.io/](https://n9e.gitee.io/)
- v4(old version): [https://n9e.didiyun.com/](https://n9e.didiyun.com/)
## TODO
- [x] deploy nightingale in docker
- [x] export /metrics endpoint
- [ ] notify.py support feishu
- [x] notify.py support feishu
- [ ] notify.py support sms
- [ ] notify.py support voice
- [ ] support remote write api
- [x] support remote write api
- [ ] support pushgateway api
## 大本营
## Any questions?
微信公众号:`__n9e__`(夜莺监控),回复“加群”可以加入交流群,回复“星球”可加入知识星球提问题
[Click me](https://s3-gz01.didistatic.com/n9e-pub/image/n9e-wx.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 198 KiB

@ -79,7 +79,7 @@ services:
- "server"
nwebapi:
image: ulric2019/nightingale:5.0.0-ga-05
image: ulric2019/nightingale:5.1.0
container_name: nwebapi
hostname: nwebapi
restart: always
@ -107,7 +107,7 @@ services:
- "webapi"
nserver:
image: ulric2019/nightingale:5.0.0-ga-05
image: ulric2019/nightingale:5.1.0
container_name: nserver
hostname: nserver
restart: always

@ -155,7 +155,7 @@ CREATE TABLE `dashboard` (
`group_id` bigint not null default 0 comment 'busi group id',
`name` varchar(191) not null,
`tags` varchar(255) not null comment 'split by space',
`configs` varchar(4096) comment 'dashboard variables',
`configs` varchar(8192) comment 'dashboard variables',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
@ -186,7 +186,7 @@ CREATE TABLE `chart` (
CREATE TABLE `chart_share` (
`id` bigint unsigned not null auto_increment,
`cluster` varchar(128) not null,
`configs` varchar(8192),
`configs` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
primary key (`id`),
@ -202,7 +202,7 @@ CREATE TABLE `alert_rule` (
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`disabled` tinyint(1) not null comment '0:enabled 1:disabled',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`enable_stime` char(5) not null default '00:00',
`enable_etime` char(5) not null default '23:59',
@ -227,7 +227,7 @@ CREATE TABLE `alert_mute` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null,
`tags` varchar(2048) not null default '' comment 'json,map,tagkey->regexp|value',
`tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
`cause` varchar(255) not null default '',
`btime` bigint not null default 0 comment 'begin time',
`etime` bigint not null default 0 comment 'end time',
@ -243,7 +243,7 @@ CREATE TABLE `alert_subscribe` (
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`tags` varchar(2048) not null default '' comment 'json,map,tagkey->regexp|value',
`tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
`redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
`new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`redefine_channels` tinyint(1) default 0 comment 'is redefine channels?',
@ -302,7 +302,7 @@ CREATE TABLE `alert_cur_event` (
`rule_note` varchar(512) not null default 'alert rule note',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
@ -333,7 +333,7 @@ CREATE TABLE `alert_his_event` (
`rule_note` varchar(512) not null default 'alert rule note',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),

@ -24,6 +24,10 @@ mail_from = "ulricqin@163.com"
class Sender(object):
@classmethod
def send_email(cls, payload):
if mail_user == "ulricqin" and mail_pass == "password":
print("invalid smtp configuration")
return
users = payload.get('event').get("notify_users_obj")
emails = {}

@ -54,6 +54,7 @@ Interval = 1000
[Alerting]
NotifyScriptPath = "./etc/script/notify.py"
NotifyConcurrency = 100
TemplatesDir = "./etc/template"
[Alerting.RedisPub]
Enable = false

@ -7,6 +7,9 @@ RunMode = "release"
# do not change
AdminRole = "Admin"
# metrics descriptions
MetricsYamlFile = "./etc/metrics.yaml"
# Linkage with notify.py script
NotifyChannels = [ "email", "dingtalk", "wecom", "feishu" ]

@ -27,3 +27,8 @@ scrape_configs:
static_configs:
- targets: ['localhost:9090']
- job_name: 'n9e'
file_sd_configs:
- files:
- targets.json

@ -0,0 +1,7 @@
[
{
"targets": [
"nwebapi:18000","nserver:19000"
]
}
]

@ -6,6 +6,9 @@ import urllib2
import smtplib
from email.mime.text import MIMEText
reload(sys)
sys.setdefaultencoding('utf8')
notify_channel_funcs = {
"email":"email",
"sms":"sms",
@ -24,6 +27,10 @@ mail_from = "ulricqin@163.com"
class Sender(object):
@classmethod
def send_email(cls, payload):
if mail_user == "ulricqin" and mail_pass == "password":
print("invalid smtp configuration")
return
users = payload.get('event').get("notify_users_obj")
emails = {}
@ -82,7 +89,13 @@ class Sender(object):
@classmethod
def send_dingtalk(cls, payload):
users = payload.get('event').get("notify_users_obj")
event = payload.get('event')
users = event.get("notify_users_obj")
rule_name = event.get("rule_name")
event_state = "Triggered"
if event.get("is_recovered"):
event_state = "Recovered"
tokens = {}
phones = {}
@ -101,9 +114,10 @@ class Sender(object):
for t in tokens:
url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
body = {
"msgtype": "text",
"text": {
"content": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found")
"msgtype": "markdown",
"markdown": {
"title": "{} - {}".format(event_state, rule_name),
"text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in phones.keys()])
},
"at": {
"atMobiles": phones.keys(),

@ -57,6 +57,7 @@ Interval = 1000
[Alerting]
NotifyScriptPath = "./etc/script/notify.py"
NotifyConcurrency = 100
TemplatesDir = "./etc/template"
[Alerting.RedisPub]
Enable = false

@ -1,6 +1,11 @@
级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
#### {{if .IsRecovered}}<font color="#008800">S{{.Severity}} - Recovered - {{.RuleName}}</font>{{else}}<font color="#FF0000">S{{.Severity}} - Triggered - {{.RuleName}}</font>{{end}}
---
- **规则标题**: {{.RuleName}}{{if .RuleNote}}
- **规则备注**: {{.RuleNote}}{{end}}
- **监控指标**: {{.TagsJSON}}
- {{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**触发时间**: {{timeformat .TriggerTime}}
- **触发时值**: {{.TriggerValue}}{{end}}
- **发送时间**: {{timestamp}}

@ -3,4 +3,5 @@
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}

@ -181,6 +181,13 @@
</tr>
{{end}}
<tr>
<th>发送时间:</th>
<td>
{{timestamp}}
</td>
</tr>
<tr>
<th>PromQL</th>
<td>

@ -3,4 +3,5 @@
**规则备注**: {{.RuleNote}}{{end}}
**监控指标**: {{.TagsJSON}}
{{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**触发时间**: {{timeformat .TriggerTime}}
**触发时值**: {{.TriggerValue}}{{end}}
**触发时值**: {{.TriggerValue}}{{end}}
**发送时间**: {{timestamp}}

@ -7,6 +7,9 @@ RunMode = "release"
# do not change
AdminRole = "Admin"
# metrics descriptions
MetricsYamlFile = "./etc/metrics.yaml"
# Linkage with notify.py script
NotifyChannels = [ "email", "dingtalk", "wecom", "feishu" ]

@ -10,6 +10,7 @@ require (
github.com/gin-gonic/gin v1.7.4
github.com/go-ldap/ldap/v3 v3.4.1
github.com/go-redis/redis/v8 v8.11.3
github.com/gogo/protobuf v1.1.1
github.com/golang-jwt/jwt v3.2.2+incompatible
github.com/golang/protobuf v1.5.2
github.com/golang/snappy v0.0.4

@ -104,9 +104,8 @@ func AlertMuteDel(ids []int64) error {
return DB().Where("id in ?", ids).Delete(new(AlertMute)).Error
}
func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) {
session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated").Where("btime <= ?", btime)
func AlertMuteStatistics(cluster string) (*Statistics, error) {
session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated")
if cluster != "" {
session = session.Where("cluster = ?", cluster)
}
@ -120,7 +119,7 @@ func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) {
return stats[0], nil
}
func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) {
func AlertMuteGetsByCluster(cluster string) ([]*AlertMute, error) {
// clean expired first
buf := int64(30)
err := DB().Where("etime < ?", time.Now().Unix()+buf).Delete(new(AlertMute)).Error
@ -129,7 +128,7 @@ func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) {
}
// get my cluster's mutes
session := DB().Model(&AlertMute{}).Where("btime <= ?", btime)
session := DB().Model(&AlertMute{})
if cluster != "" {
session = session.Where("cluster = ?", cluster)
}

@ -12,7 +12,6 @@ import (
"github.com/didi/nightingale/v5/src/pkg/httpx"
"github.com/didi/nightingale/v5/src/pkg/logx"
"github.com/didi/nightingale/v5/src/server/naming"
"github.com/didi/nightingale/v5/src/server/reader"
"github.com/didi/nightingale/v5/src/server/writer"
"github.com/didi/nightingale/v5/src/storage"
@ -77,7 +76,6 @@ func MustLoad(fpaths ...string) {
}
C.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", C.Heartbeat.IP, C.HTTP.Port)
C.Heartbeat.Cluster = C.ClusterName
C.Alerting.RedisPub.ChannelKey = C.Alerting.RedisPub.ChannelPrefix + C.ClusterName
@ -93,7 +91,7 @@ type Config struct {
Log logx.Config
HTTP httpx.Config
BasicAuth gin.Accounts
Heartbeat naming.HeartbeatConfig
Heartbeat HeartbeatConfig
Alerting Alerting
NoData NoData
Redis storage.RedisConfig
@ -106,9 +104,16 @@ type Config struct {
Ibex Ibex
}
// HeartbeatConfig is the type of Config.Heartbeat and carries the heartbeat
// settings of this server instance.
type HeartbeatConfig struct {
// IP is joined with the HTTP port to build Endpoint when the config is loaded.
IP string
// Interval is presumably the heartbeat period; the unit is not visible here
// (the sample config uses 1000, so likely milliseconds — TODO confirm).
Interval int64
// Endpoint is the "IP:port" address of this instance. It is not meant to be
// set by hand: MustLoad overwrites it from IP and C.HTTP.Port.
Endpoint string
}
// Alerting is the type of Config.Alerting and maps the [Alerting] section of
// the server configuration file.
type Alerting struct {
// NotifyScriptPath is the path of the notification script; the sample config
// points it at ./etc/script/notify.py.
NotifyScriptPath string
// NotifyConcurrency presumably bounds how many notifications run at once —
// confirm against the code that consumes it.
NotifyConcurrency int
// TemplatesDir is the directory the notification templates are loaded from.
// When left empty, initTpls falls back to <cwd>/etc/template.
TemplatesDir string
// RedisPub maps the [Alerting.RedisPub] sub-section.
RedisPub RedisPub
}

@ -58,36 +58,7 @@ func persist(event *models.AlertCurEvent) {
his := event.ToHis()
if has {
// 数据库里有这个事件,说明之前触发过了
if event.IsRecovered {
// 本次恢复了,把未恢复的事件删除,在全量告警里添加记录
err := models.AlertCurEventDelByHash(event.Hash)
if err != nil {
logger.Errorf("event_del_cur_fail: %v hash=%s", err, event.Hash)
}
if err := his.Add(); err != nil {
logger.Errorf(
"event_persist_his_fail: %v rule_id=%d hash=%s tags=%v timestamp=%d value=%s",
err,
event.RuleId,
event.Hash,
event.TagsJSON,
event.TriggerTime,
event.TriggerValue,
)
}
}
return
}
if event.IsRecovered {
// alert_cur_event表里没有数据表示之前没告警结果现在报了恢复神奇....理论上不应该出现的
return
}
// 本次是告警alert_cur_event表里也没有数据
// 不管是告警还是恢复,全量告警里都要记录
if err := his.Add(); err != nil {
logger.Errorf(
"event_persist_his_fail: %v rule_id=%d hash=%s tags=%v timestamp=%d value=%s",
@ -100,6 +71,41 @@ func persist(event *models.AlertCurEvent) {
)
}
if has {
// 活跃告警表中有记录,删之
err = models.AlertCurEventDelByHash(event.Hash)
if err != nil {
logger.Errorf("event_del_cur_fail: %v hash=%s", err, event.Hash)
return
}
if !event.IsRecovered {
// A recovery event is simply removed from the active-alert list; a firing event must be re-added as a new event.
// use his id as cur id
event.Id = his.Id
if event.Id > 0 {
if err := event.Add(); err != nil {
logger.Errorf(
"event_persist_cur_fail: %v rule_id=%d hash=%s tags=%v timestamp=%d value=%s",
err,
event.RuleId,
event.Hash,
event.TagsJSON,
event.TriggerTime,
event.TriggerValue,
)
}
}
}
return
}
if event.IsRecovered {
// No row in alert_cur_event means nothing was firing before, yet a recovery was reported. In theory this should never happen.
return
}
// use his id as cur id
event.Id = his.Id
if event.Id > 0 {

@ -20,9 +20,6 @@ func Start(ctx context.Context) error {
// filter my rules and start worker
go loopFilterRules(ctx)
// repeat notifier
go loopRepeat(ctx)
go reportQueueSize()
return nil

@ -5,14 +5,15 @@ import (
"github.com/didi/nightingale/v5/src/server/memsto"
)
func isMuted(event *models.AlertCurEvent) bool {
// If the optional clock argument is passed, it is used as the time checked against the mute window; otherwise the event's TriggerTime field is used.
func isMuted(event *models.AlertCurEvent, clock ...int64) bool {
mutes, has := memsto.AlertMuteCache.Gets(event.GroupId)
if !has || len(mutes) == 0 {
return false
}
for i := 0; i < len(mutes); i++ {
if matchMute(event, mutes[i]) {
if matchMute(event, mutes[i], clock...) {
return true
}
}
@ -20,8 +21,13 @@ func isMuted(event *models.AlertCurEvent) bool {
return false
}
func matchMute(event *models.AlertCurEvent, mute *models.AlertMute) bool {
if event.TriggerTime < mute.Btime || event.TriggerTime > mute.Etime {
func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int64) bool {
ts := event.TriggerTime
if len(clock) > 0 {
ts = clock[0]
}
if ts < mute.Btime || ts > mute.Etime {
return false
}

@ -44,15 +44,17 @@ var fns = template.FuncMap{
}
func initTpls() error {
tplDir := path.Join(runner.Cwd, "etc", "template")
if config.C.Alerting.TemplatesDir == "" {
config.C.Alerting.TemplatesDir = path.Join(runner.Cwd, "etc", "template")
}
filenames, err := file.FilesUnder(tplDir)
filenames, err := file.FilesUnder(config.C.Alerting.TemplatesDir)
if err != nil {
return errors.WithMessage(err, "failed to exec FilesUnder")
}
if len(filenames) == 0 {
return errors.New("no tpl files under " + tplDir)
return errors.New("no tpl files under " + config.C.Alerting.TemplatesDir)
}
tplFiles := make([]string, 0, len(filenames))
@ -63,11 +65,11 @@ func initTpls() error {
}
if len(tplFiles) == 0 {
return errors.New("no tpl files under " + tplDir)
return errors.New("no tpl files under " + config.C.Alerting.TemplatesDir)
}
for i := 0; i < len(tplFiles); i++ {
tplpath := path.Join(tplDir, tplFiles[i])
tplpath := path.Join(config.C.Alerting.TemplatesDir, tplFiles[i])
tpl, err := template.New(tplFiles[i]).Funcs(fns).ParseFiles(tplpath)
if err != nil {

@ -1,66 +0,0 @@
package engine
import (
"context"
"time"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/didi/nightingale/v5/src/server/memsto"
"github.com/toolkits/pkg/logger"
)
// loopRepeat runs repeat in a loop until ctx is cancelled.
//
// Each round waits 9 seconds, counted from the moment the previous repeat call
// returned, so a slow repeat pushes the next round back rather than overlapping it.
func loopRepeat(ctx context.Context) {
	const interval = 9 * time.Second

	for {
		// A fresh timer channel is created per round so the wait always
		// starts after the previous repeat has finished.
		wait := time.After(interval)

		select {
		case <-wait:
			repeat()
		case <-ctx.Done():
			return
		}
	}
}
// repeat fetches the unrecovered alert events of this cluster that need a
// repeated notification and re-sends them.
//
// An event is skipped when its rule is no longer in the rule cache, when the
// rule's NotifyRepeatStep is 0, when isNoneffective reports the rule as not in
// effect at the event's trigger time, or when the event is muted. For every
// event that is notified, its repeat step is advanced by NotifyRepeatStep*60
// (presumably minutes converted to seconds — confirm against the model).
func repeat() {
	pending, err := models.AlertCurEventNeedRepeat(config.C.ClusterName)
	if err != nil {
		logger.Errorf("repeat: AlertCurEventNeedRepeat: %v", err)
		return
	}

	for _, ev := range pending {
		rule := memsto.AlertRuleCache.Get(ev.RuleId)

		// A zero NotifyRepeatStep means the user changed the rule afterwards
		// and no longer wants repeated notifications.
		if rule == nil || rule.NotifyRepeatStep == 0 {
			continue
		}

		ev.DB2Mem()

		if isNoneffective(ev.TriggerTime, rule) || isMuted(ev) {
			continue
		}

		fillUsers(ev)
		notify(ev)

		step := int64(rule.NotifyRepeatStep * 60)
		if incErr := ev.IncRepeatStep(step); incErr != nil {
			logger.Errorf("repeat: IncRepeatStep: %v", incErr)
		}
	}
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save