version 5.1

master
UlricQin 3 years ago
parent 7a2b07eebd
commit 6e3ad3dd6b

@ -0,0 +1,38 @@
.PHONY: start build

# Build timestamp (UTC). NOTE(review): '%I' is the 12-hour clock, '%H' was
# probably intended — harmless while NOW is unused below.
NOW = $(shell date -u '+%Y%m%d%I%M%S')
# Version stamped into the binary via -ldflags (see build target).
RELEASE_VERSION = 5.1.0

APP = n9e
SERVER_BIN = ${APP}
# Legacy packaging variables, kept commented for reference:
# RELEASE_ROOT = release
# RELEASE_SERVER = release/${APP}
# GIT_COUNT = $(shell git rev-list --all --count)
# GIT_HASH = $(shell git rev-parse --short HEAD)
# RELEASE_TAG = $(RELEASE_VERSION).$(GIT_COUNT).$(GIT_HASH)

# Default target: build the server binary.
all: build

# Build ./src into $(SERVER_BIN), stripping debug info (-w -s) and
# stamping main.VERSION with the release version.
build:
	@go build -ldflags "-w -s -X main.VERSION=$(RELEASE_VERSION)" -o $(SERVER_BIN) ./src
# start:
# @go run -ldflags "-X main.VERSION=$(RELEASE_TAG)" ./cmd/${APP}/main.go web -c ./configs/config.toml -m ./configs/model.conf --menu ./configs/menu.yaml
# swagger:
# @swag init --parseDependency --generalInfo ./cmd/${APP}/main.go --output ./internal/app/swagger
# wire:
# @wire gen ./internal/app
# test:
# cd ./internal/app/test && go test -v
# clean:
# rm -rf data release $(SERVER_BIN) internal/app/test/data cmd/${APP}/data
# pack: build
# rm -rf $(RELEASE_ROOT) && mkdir -p $(RELEASE_SERVER)
# cp -r $(SERVER_BIN) configs $(RELEASE_SERVER)
# cd $(RELEASE_ROOT) && tar -cvf $(APP).tar ${APP} && rm -rf ${APP}

@ -1,10 +1,18 @@
## 基本信息
- 官网:[n9e.didiyun.com](https://n9e.didiyun.com/) 右上角切换版本
- 招聘前后端都要base北京薪资open可将简历发至邮箱 `echo cWlueWVuaW5nQGRpZGlnbG9iYWwuY29t | base64 -d` 一起来做开源
## 大本营
微信公号:`__n9e__`(夜莺监控)
知识星球:夜莺开源社区
钉钉交流群:
# todo
- [x] deploy nightingale in docker
- [x] export /metrics endpoint
- [ ] notify.py support feishu
- [ ] notify.py support sms
- [ ] notify.py support voice

@ -1,9 +0,0 @@
package alert
import (
"context"
)
// Start launches the alert consumer: popEvent drains queued alert events
// in its own goroutine.
// NOTE(review): ctx is currently unused — presumably reserved for future
// cancellation; confirm before relying on it.
func Start(ctx context.Context) {
	go popEvent()
}

File diff suppressed because it is too large Load Diff

@ -1,89 +0,0 @@
package alert
import (
"strings"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
"github.com/toolkits/pkg/logger"
)
// isEventMute reports whether the alert event should be suppressed, either
// by a configured mute rule or by the resource's own mute time window.
func isEventMute(event *models.AlertEvent) bool {
	historyPoints, err := event.GetHistoryPoints()
	if err != nil {
		logger.Errorf("get event HistoryPoints:%+v failed, err: %v", event.HistoryPoints, err)
		// On parse failure, err on the side of delivering the alert.
		return false
	}

	// First try mute rules whose metric is empty (metric-agnostic mutes).
	if matchMute("", event.ResIdent, event.TagMap, event.ResClasspaths) {
		return true
	}

	// AND-style rules carry several metrics; if any one of them matches a
	// mute rule, the whole event counts as muted.
	for i := 0; i < len(historyPoints); i++ {
		if matchMute(historyPoints[i].Metric, event.ResIdent, event.TagMap, event.ResClasspaths) {
			return true
		}
	}

	// Finally honor the per-resource mute window, when the resource is cached.
	resAndTags, exists := cache.ResTags.Get(event.ResIdent)
	if exists {
		if event.TriggerTime > resAndTags.Resource.MuteBtime && event.TriggerTime < resAndTags.Resource.MuteEtime {
			return true
		}
	}

	return false
}
// matchMute reports whether any mute filter registered for metric
// suppresses an event with the given ident, tags and classpaths.
func matchMute(metric, ident string, tags map[string]string, classpaths string) bool {
	filters, exists := cache.AlertMute.GetByKey(metric)
	if !exists {
		// No mute rules are associated with this metric.
		return false
	}

	// A single matching rule is enough to mute the event.
	for i := range filters {
		if matchMuteOnce(filters[i], ident, tags, classpaths) {
			return true
		}
	}
	return false
}
// matchMuteOnce reports whether one mute filter matches the event's
// ident, tags and space-separated classpath list.
func matchMuteOnce(filter cache.Filter, ident string, tags map[string]string, classpaths string) bool {
	if cp := filter.ClasspathPrefix; len(cp) > 0 {
		// classpaths is space-separated, e.g. "n9e.mon n9e.rdb ccp.web".
		// A filter prefix like "n9e.rdb" matches when some entry starts with
		// it, which is true iff the whole string starts with the prefix or
		// contains " "+prefix. Otherwise this rule cannot apply.
		if !strings.HasPrefix(classpaths, cp) && !strings.Contains(classpaths, " "+cp) {
			return false
		}
	}

	if re := filter.ResReg; re != nil && !re.MatchString(ident) {
		// Resource ident does not match the filter's regexp (e.g. rule
		// "c3-ceph.*" vs event ident "c4-ceph01.bj") — rule cannot apply.
		return false
	}

	// Every tag in the filter must also appear in the event's tags.
	return mapContains(tags, filter.TagsMap)
}
// mapContains reports whether every key/value pair in small is present
// with an identical value in big.
func mapContains(big, small map[string]string) bool {
	for k, want := range small {
		if got, ok := big[k]; !ok || got != want {
			return false
		}
	}
	return true
}

@ -1,89 +0,0 @@
package backend
import (
"fmt"
"github.com/prometheus/prometheus/promql"
"github.com/didi/nightingale/v5/vos"
"github.com/toolkits/pkg/container/list"
pp "github.com/didi/nightingale/v5/backend/prome"
)
// BackendSection is the "backend" part of the server configuration.
type BackendSection struct {
	DataSource string `yaml:"datasource"` // name of the default datasource

	Prometheus pp.PromeSection `yaml:"prometheus"`
}

// DataSource is a queryable storage backend. Every DataSource can also
// ingest points (it embeds PushEndpoint).
type DataSource interface {
	PushEndpoint

	QueryData(inputs vos.DataQueryParam) []*vos.DataQueryResp           // query a time range
	QueryDataInstant(ql string) []*vos.DataQueryInstanceResp            // query one instant, like prometheus instant_query
	QueryTagKeys(recv vos.CommonTagQueryParam) *vos.TagKeyQueryResp     // list label names
	QueryTagValues(recv vos.CommonTagQueryParam) *vos.TagValueQueryResp // list values for one label name
	QueryTagPairs(recv vos.CommonTagQueryParam) *vos.TagPairQueryResp   // list all matching series; shares the param struct with the two above
	QueryMetrics(recv vos.MetricQueryParam) *vos.MetricQueryResp        // query metric names by labels
	QueryVector(ql string) promql.Vector                                // used by prometheus pull alerts; other sources may return nil
	CleanUp()                                                           // cleanup work when the datasource shuts down
}

// PushEndpoint ingests metric points into a backend's write queue.
type PushEndpoint interface {
	Push2Queue(items []*vos.MetricPoint)
}

// Registries are written only during Init/RegisterDataSource at startup.
var (
	defaultDataSource     string
	registryDataSources   = make(map[string]DataSource)
	registryPushEndpoints = make(map[string]PushEndpoint)
)
// Init wires up the configured datasources. Currently only the Prometheus
// backend is supported; when enabled it is registered under its configured
// name, and cfg.DataSource records which registered source is the default.
func Init(cfg BackendSection) {
	defaultDataSource = cfg.DataSource

	// init prometheus
	if cfg.Prometheus.Enable {
		promeDs := &pp.PromeDataSource{
			Section: cfg.Prometheus,
			// Bounded push queue; points beyond the cap are presumably
			// dropped by the producer — confirm in the push path.
			PushQueue: list.NewSafeListLimited(10240000),
		}
		promeDs.Init()
		RegisterDataSource(cfg.Prometheus.Name, promeDs)
	}
}
// GetDataSourceFor returns the datasource registered under pluginId.
// An empty pluginId selects the configured default datasource.
func GetDataSourceFor(pluginId string) (DataSource, error) {
	if pluginId == "" {
		pluginId = defaultDataSource
	}

	source, exists := registryDataSources[pluginId]
	if !exists {
		return nil, fmt.Errorf("could not find datasource for plugin: %s", pluginId)
	}
	return source, nil
}
// DatasourceCleanUp gives every registered datasource a chance to flush
// and release resources on shutdown.
func DatasourceCleanUp() {
	for _, ds := range registryDataSources {
		ds.CleanUp()
	}
}
// GetPushEndpoints returns every registered push endpoint, or an error
// when none have been registered.
func GetPushEndpoints() ([]PushEndpoint, error) {
	if len(registryPushEndpoints) == 0 {
		return nil, fmt.Errorf("could not find any pushendpoint")
	}

	items := make([]PushEndpoint, 0, len(registryPushEndpoints))
	for _, ep := range registryPushEndpoints {
		items = append(items, ep)
	}
	return items, nil
}
// RegisterDataSource records a datasource under pluginId. Every datasource
// is also a PushEndpoint, so it is entered into both registries.
func RegisterDataSource(pluginId string, datasource DataSource) {
	registryDataSources[pluginId] = datasource
	registryPushEndpoints[pluginId] = datasource
}

@ -1,183 +0,0 @@
package backend
import (
"bufio"
"bytes"
"context"
"io"
"io/ioutil"
"net/http"
"regexp"
"time"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/opentracing-contrib/go-stdlib/nethttp"
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/vos"
)
// MetricNameRE validates metric names against Prometheus naming rules.
var MetricNameRE = regexp.MustCompile(`^[a-zA-Z_:][a-zA-Z0-9_:]*$`)

// sample is an intermediate holder for one point: its label set,
// timestamp (unix seconds, see convertOne) and value.
type sample struct {
	labels labels.Labels
	t      int64
	v      float64
}
// labelsToLabelsProto converts a label set into prompb form, reusing
// buf's backing array when it has enough capacity.
func labelsToLabelsProto(lbls labels.Labels, buf []prompb.Label) []prompb.Label {
	out := buf[:0]
	if cap(buf) < len(lbls) {
		out = make([]prompb.Label, 0, len(lbls))
	}
	for i := range lbls {
		out = append(out, prompb.Label{
			Name:  lbls[i].Name,
			Value: lbls[i].Value,
		})
	}
	return out
}
// convertOne turns a single MetricPoint into a prompb.TimeSeries holding
// exactly one sample. It fails when the metric name is not a valid
// Prometheus metric name; tag keys that are not valid label names are
// silently dropped.
func (pd *PromeDataSource) convertOne(item *vos.MetricPoint) (prompb.TimeSeries, error) {
	pt := prompb.TimeSeries{}
	pt.Samples = []prompb.Sample{{}}
	s := sample{}
	s.t = item.Time
	s.v = item.Value

	// name
	if !MetricNameRE.MatchString(item.Metric) {
		return pt, errors.New("invalid metrics name")
	}
	nameLs := labels.Label{
		Name:  LABEL_NAME,
		Value: item.Metric,
	}
	s.labels = append(s.labels, nameLs)

	// The resource ident becomes an extra label, when present.
	if item.Ident != "" {
		identLs := labels.Label{
			Name:  LABEL_IDENT,
			Value: item.Ident,
		}
		s.labels = append(s.labels, identLs)
	}

	// Only tags whose keys are valid Prometheus label names survive.
	for k, v := range item.TagsMap {
		if model.LabelNameRE.MatchString(k) {
			ls := labels.Label{
				Name:  k,
				Value: v,
			}
			s.labels = append(s.labels, ls)
		}
	}
	pt.Labels = labelsToLabelsProto(s.labels, pt.Labels)
	// Remote write expects millisecond timestamps; item.Time is unix seconds.
	tsMs := time.Unix(s.t, 0).UnixNano() / 1e6
	pt.Samples[0].Timestamp = tsMs
	pt.Samples[0].Value = s.v
	return pt, nil
}
// RecoverableError marks an error the caller may retry (network failures
// and 5xx responses), as opposed to permanent client-side errors.
type RecoverableError struct {
	error
}

// remoteWritePost sends one snappy-compressed remote-write payload to the
// endpoint. 400 responses are only debug-logged (client bug, not worth
// retrying); other non-2xx become errors, and 5xx/network errors are
// wrapped in RecoverableError.
func remoteWritePost(c *HttpClient, req []byte) error {
	httpReq, err := http.NewRequest("POST", c.url.String(), bytes.NewReader(req))
	if err != nil {
		// Errors from NewRequest are from unparsable URLs, so are not
		// recoverable.
		return err
	}
	httpReq.Header.Add("Content-Encoding", "snappy")
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("User-Agent", "n9e-v5")
	httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
	defer cancel()

	httpReq = httpReq.WithContext(ctx)

	// NOTE(review): ctx is freshly created from Background above, so
	// SpanFromContext presumably always returns nil here and the tracing
	// branch never fires — confirm before removing.
	if parentSpan := opentracing.SpanFromContext(ctx); parentSpan != nil {
		var ht *nethttp.Tracer
		httpReq, ht = nethttp.TraceRequest(
			parentSpan.Tracer(),
			httpReq,
			nethttp.OperationName("Remote Store"),
			nethttp.ClientTrace(false),
		)
		defer ht.Finish()
	}

	httpResp, err := c.Client.Do(httpReq)
	if err != nil {
		// Errors from Client.Do are from (for example) network errors, so are
		// recoverable.
		return RecoverableError{err}
	}
	// Drain and close the body so the transport can reuse the connection.
	defer func() {
		io.Copy(ioutil.Discard, httpResp.Body)
		httpResp.Body.Close()
	}()

	if httpResp.StatusCode/100 != 2 {
		// Read at most the first line (capped at 512 bytes) of the error body.
		scanner := bufio.NewScanner(io.LimitReader(httpResp.Body, 512))
		line := ""
		if scanner.Scan() {
			line = scanner.Text()
		}
		if httpResp.StatusCode == 400 {
			// 400 means a client-side problem: don't surface it to the caller,
			// just record it (with the decoded payload) in the debug log.
			logger.Debugf("server returned HTTP status %s: %s req:%v", httpResp.Status, line, getSamples(req))
		} else {
			err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
		}
	}
	if httpResp.StatusCode/100 == 5 {
		// Server-side failure: mark as retryable.
		return RecoverableError{err}
	}
	return err
}
// buildWriteRequest marshals the series into a Prometheus remote-write
// request and snappy-compresses the result.
func (pd *PromeDataSource) buildWriteRequest(samples []prompb.TimeSeries) ([]byte, error) {
	req := &prompb.WriteRequest{Timeseries: samples, Metadata: nil}

	data, err := proto.Marshal(req)
	if err != nil {
		return nil, err
	}
	return snappy.Encode(nil, data), nil
}
// getSamples decodes a snappy-compressed remote-write payload back into
// its time series, for debug logging. The original silently ignored both
// the snappy.Decode and proto.Unmarshal errors and could return data from
// a half-unmarshaled request; corrupt input now yields nil instead.
func getSamples(compressed []byte) []prompb.TimeSeries {
	d, err := snappy.Decode(nil, compressed)
	if err != nil {
		return nil
	}
	req := &prompb.WriteRequest{}
	if err := proto.Unmarshal(d, req); err != nil {
		return nil
	}
	return req.Timeseries
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,9 +0,0 @@
#!/bin/bash

# Build the n9e-server binary, stamping config.Version via -ldflags.

# release version
version=5.0.0-rc7-1

# Uncomment for module-proxy builds behind restricted networks:
#export GO111MODULE=on
#export GOPROXY=https://goproxy.cn

go build -ldflags "-X github.com/didi/nightingale/v5/config.Version=${version}" -o n9e-server main.go

@ -1,33 +0,0 @@
package cache
import (
"regexp"
"sync"
)
// AlertMuteMap caches mute rules indexed by metric name.
type AlertMuteMap struct {
	sync.RWMutex
	Data map[string][]Filter
}

// Filter is one compiled mute rule: an optional classpath prefix, an
// optional resource-ident regexp, and a set of required tags.
type Filter struct {
	ClasspathPrefix string
	ResReg          *regexp.Regexp
	TagsMap         map[string]string
}

// AlertMute is the global mute-rule cache.
var AlertMute = &AlertMuteMap{Data: make(map[string][]Filter)}

// SetAll atomically swaps in a freshly built rule map.
func (a *AlertMuteMap) SetAll(m map[string][]Filter) {
	a.Lock()
	defer a.Unlock()
	a.Data = m
}

// GetByKey returns the mute filters for one metric and whether any exist.
func (a *AlertMuteMap) GetByKey(key string) ([]Filter, bool) {
	a.RLock()
	defer a.RUnlock()
	filters, ok := a.Data[key]
	return filters, ok
}

@ -1,75 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// AlertRulesByMetricCache indexes alert rules by metric name for fast
// retrieval, with bookkeeping fields used for incremental DB sync.
type AlertRulesByMetricCache struct {
	sync.RWMutex
	Data        map[string][]*models.AlertRule // keyed by metric for quick retrieval
	MaxUpdateTs int64                          // largest update_at seen in the database
	RuleNum     int64                          // row count reported by the database
	LastSync    int64                          // time of the last full sync
}

var (
	AlertRulesByMetric = &AlertRulesByMetricCache{Data: make(map[string][]*models.AlertRule)}
)

// GetBy returns the rules cached under one key.
// NOTE(review): the parameter is named "instance" but Data is documented
// as metric-keyed — confirm the intended name with callers.
func (a *AlertRulesByMetricCache) GetBy(instance string) []*models.AlertRule {
	a.RLock()
	defer a.RUnlock()
	return a.Data[instance]
}

// SetAll atomically replaces the cache and its sync bookkeeping.
func (a *AlertRulesByMetricCache) SetAll(alertRulesMap map[string][]*models.AlertRule, lastUpdateTs, ruleNum, lastSync int64) {
	a.Lock()
	defer a.Unlock()
	a.Data = alertRulesMap
	a.MaxUpdateTs = lastUpdateTs
	a.RuleNum = ruleNum
	a.LastSync = lastSync
}
// AlertRulesTotalCache caches every alert rule by id.
type AlertRulesTotalCache struct {
	sync.RWMutex
	Data map[int64]*models.AlertRule
}

// AlertRules is the global by-id rule cache.
var AlertRules = &AlertRulesTotalCache{Data: make(map[int64]*models.AlertRule)}

// Get returns the rule with the given id and whether it is cached.
func (a *AlertRulesTotalCache) Get(id int64) (*models.AlertRule, bool) {
	a.RLock()
	defer a.RUnlock()
	alertRule, exists := a.Data[id]
	return alertRule, exists
}

// SetAll atomically replaces the whole cache.
func (a *AlertRulesTotalCache) SetAll(alertRulesMap map[int64]*models.AlertRule) {
	a.Lock()
	defer a.Unlock()
	a.Data = alertRulesMap
}

// Pulls returns the list of all PULL-type rules.
func (a *AlertRulesTotalCache) Pulls() []*models.AlertRule {
	a.RLock()
	defer a.RUnlock()

	cnt := len(a.Data)
	ret := make([]*models.AlertRule, 0, cnt)
	for _, rule := range a.Data {
		if rule.Type == models.PULL {
			ret = append(ret, rule)
		}
	}
	return ret
}

7
cache/cache.go vendored

@ -1,7 +0,0 @@
package cache
import (
cmap "github.com/orcaman/concurrent-map"
)
// MetricDescMapper is a concurrent map; by its name it presumably maps
// metric name -> description — confirm against its writers.
var MetricDescMapper = cmap.New()

@ -1,27 +0,0 @@
package cache
import (
"sync"
)
// ClasspathPrefixMap caches an id -> ids relation between classpaths;
// presumably each key maps to the classpaths under that prefix — confirm
// with the cache loader.
type ClasspathPrefixMap struct {
	sync.RWMutex
	Data map[int64][]int64
}

// ClasspathPrefix is the global prefix cache.
var ClasspathPrefix = &ClasspathPrefixMap{Data: make(map[int64][]int64)}

// Get returns the ids stored under id and whether id is cached.
func (c *ClasspathPrefixMap) Get(id int64) ([]int64, bool) {
	c.RLock()
	defer c.RUnlock()
	ids, exists := c.Data[id]
	return ids, exists
}

// SetAll atomically replaces the whole mapping.
// (Fixed: removed the redundant trailing bare return — staticcheck S1023.)
func (c *ClasspathPrefixMap) SetAll(data map[int64][]int64) {
	c.Lock()
	defer c.Unlock()
	c.Data = data
}

@ -1,33 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// ClasspathResMap caches, per classpath id, the classpath row plus the
// idents of the resources mounted under it.
type ClasspathResMap struct {
	sync.RWMutex
	Data map[int64]*ClasspathAndRes
}

// ClasspathAndRes bundles a classpath with its resource idents.
type ClasspathAndRes struct {
	Res       []string
	Classpath *models.Classpath
}

// classpath_id -> classpath & res_idents
var ClasspathRes = &ClasspathResMap{Data: make(map[int64]*ClasspathAndRes)}

// Get returns the entry for one classpath id and whether it is cached.
func (c *ClasspathResMap) Get(id int64) (*ClasspathAndRes, bool) {
	c.RLock()
	defer c.RUnlock()
	resources, exists := c.Data[id]
	return resources, exists
}

// SetAll atomically replaces the whole cache.
func (c *ClasspathResMap) SetAll(collectRulesMap map[int64]*ClasspathAndRes) {
	c.Lock()
	defer c.Unlock()
	c.Data = collectRulesMap
}

@ -1,32 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// CollectRuleOfIdentMap caches collect rules keyed by resource ident.
type CollectRuleOfIdentMap struct {
	sync.RWMutex
	Data map[string][]*models.CollectRule
}

// CollectRulesOfIdent is the global collect-rule cache.
var CollectRulesOfIdent = &CollectRuleOfIdentMap{Data: make(map[string][]*models.CollectRule)}

// GetBy returns the collect rules for one resource ident.
func (c *CollectRuleOfIdentMap) GetBy(ident string) []*models.CollectRule {
	c.RLock()
	defer c.RUnlock()
	return c.Data[ident]
}

// Set replaces the rules for a single key.
// NOTE(review): the parameter is named "node" while GetBy uses "ident" —
// confirm they are the same keyspace.
func (c *CollectRuleOfIdentMap) Set(node string, collectRules []*models.CollectRule) {
	c.Lock()
	defer c.Unlock()
	c.Data[node] = collectRules
}

// SetAll atomically replaces the whole cache.
func (c *CollectRuleOfIdentMap) SetAll(collectRulesMap map[string][]*models.CollectRule) {
	c.Lock()
	defer c.Unlock()
	c.Data = collectRulesMap
}

@ -1,76 +0,0 @@
package cache
import (
"sync"
)
// SafeDoubleMap is a concurrency-safe two-level string set:
// res_ident -> classpath_path -> struct{}{}.
type SafeDoubleMap struct {
	sync.RWMutex
	M map[string]map[string]struct{}
}

// ResClasspath is the global resource-ident -> classpath-path index.
var ResClasspath = &SafeDoubleMap{M: make(map[string]map[string]struct{})}

// GetKeys returns all outer keys (resource idents).
func (s *SafeDoubleMap) GetKeys() []string {
	s.RLock()
	defer s.RUnlock()

	keys := make([]string, 0, len(s.M))
	for key := range s.M {
		keys = append(keys, key)
	}
	return keys
}

// GetValues returns all inner keys stored under key. A missing key yields
// an empty, non-nil slice (callers may range or json-encode it safely).
func (s *SafeDoubleMap) GetValues(key string) []string {
	s.RLock()
	defer s.RUnlock()

	valueMap, exists := s.M[key]
	if !exists {
		return []string{}
	}
	values := make([]string, 0, len(valueMap))
	for value := range valueMap {
		values = append(values, value)
	}
	return values
}

// Exists reports whether value is present under key.
// (Fixed: one lookup of the inner map instead of the previous two
// consecutive accesses of s.M[key].)
func (s *SafeDoubleMap) Exists(key string, value string) bool {
	s.RLock()
	defer s.RUnlock()

	inner, exists := s.M[key]
	if !exists {
		return false
	}
	_, exists = inner[value]
	return exists
}

// Set inserts value under key, creating the inner set on first use.
func (s *SafeDoubleMap) Set(key string, value string) {
	s.Lock()
	defer s.Unlock()

	if _, exists := s.M[key]; !exists {
		s.M[key] = make(map[string]struct{})
	}
	s.M[key][value] = struct{}{}
}

// SetAll atomically replaces the whole two-level map.
func (s *SafeDoubleMap) SetAll(data map[string]map[string]struct{}) {
	s.Lock()
	defer s.Unlock()
	s.M = data
}

36
cache/res_tags.go vendored

@ -1,36 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// resource_ident -> tags_map
// When metric data is reported, the resource's own tags are attached to
// the metric points; this cache serves that lookup.
type ResTagsMap struct {
	sync.RWMutex
	Data map[string]ResourceAndTags
}

// ResourceAndTags bundles a resource row with its tag map.
type ResourceAndTags struct {
	Tags     map[string]string
	Resource models.Resource
}

// ResTags is the global resource-tags cache.
var ResTags = &ResTagsMap{Data: make(map[string]ResourceAndTags)}

// SetAll atomically swaps in a freshly built map.
func (r *ResTagsMap) SetAll(m map[string]ResourceAndTags) {
	r.Lock()
	defer r.Unlock()
	r.Data = m
}

// Get returns the entry for one resource ident and whether it is cached.
func (r *ResTagsMap) Get(key string) (ResourceAndTags, bool) {
	r.RLock()
	defer r.RUnlock()
	value, exists := r.Data[key]
	return value, exists
}

48
cache/user.go vendored

@ -1,48 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// UserMap caches users by id.
type UserMap struct {
	sync.RWMutex
	Data map[int64]*models.User
}

// UserCache is the global user cache.
var UserCache = &UserMap{Data: make(map[int64]*models.User)}

// GetBy returns the cached user with the given id, or nil.
func (s *UserMap) GetBy(id int64) *models.User {
	s.RLock()
	defer s.RUnlock()
	return s.Data[id]
}

// GetByIds returns the cached users for ids, skipping unknown ids.
// Returns nil (not an empty slice) when none of the ids are cached,
// matching the original behavior.
func (s *UserMap) GetByIds(ids []int64) []*models.User {
	s.RLock()
	defer s.RUnlock()

	var users []*models.User
	for _, id := range ids {
		// Single map lookup instead of the previous nil-check + re-fetch pair.
		if u := s.Data[id]; u != nil {
			users = append(users, u)
		}
	}
	return users
}

// GetById is a duplicate of GetBy, kept for backward compatibility with
// existing callers.
func (s *UserMap) GetById(id int64) *models.User {
	s.RLock()
	defer s.RUnlock()
	return s.Data[id]
}

// SetAll atomically swaps in a freshly built user map.
func (s *UserMap) SetAll(users map[int64]*models.User) {
	s.Lock()
	defer s.Unlock()
	s.Data = users
}

@ -1,41 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// UserGroupMap caches user groups by id.
type UserGroupMap struct {
	sync.RWMutex
	Data map[int64]*models.UserGroup
}

// UserGroupCache is the global user-group cache.
var UserGroupCache = &UserGroupMap{Data: make(map[int64]*models.UserGroup)}

// GetBy returns the cached group with the given id, or nil.
func (s *UserGroupMap) GetBy(id int64) *models.UserGroup {
	s.RLock()
	defer s.RUnlock()
	return s.Data[id]
}

// GetByIds returns the cached groups for ids, skipping unknown ids.
// Returns nil (not an empty slice) when none are cached, matching the
// original behavior.
func (s *UserGroupMap) GetByIds(ids []int64) []*models.UserGroup {
	s.RLock()
	defer s.RUnlock()

	var userGroups []*models.UserGroup
	for _, id := range ids {
		// Single map lookup instead of the previous nil-check + re-fetch pair.
		if g := s.Data[id]; g != nil {
			userGroups = append(userGroups, g)
		}
	}
	return userGroups
}

// SetAll atomically swaps in a freshly built group map.
func (s *UserGroupMap) SetAll(userGroups map[int64]*models.UserGroup) {
	s.Lock()
	defer s.Unlock()
	s.Data = userGroups
}

@ -1,38 +0,0 @@
package cache
import (
"sync"
)
// UserGroupMemberMap caches group membership: group id -> set of user ids.
type UserGroupMemberMap struct {
	sync.RWMutex
	Data map[int64]map[int64]struct{}
}

// UserGroupMember is the global membership cache (groupid -> userid set).
var UserGroupMember = &UserGroupMemberMap{Data: make(map[int64]map[int64]struct{})}

// Get returns the user-id set of one group and whether the group is cached.
func (m *UserGroupMemberMap) Get(id int64) (map[int64]struct{}, bool) {
	m.RLock()
	defer m.RUnlock()
	members, ok := m.Data[id]
	return members, ok
}

// Exists reports whether user uid belongs to group gid.
func (m *UserGroupMemberMap) Exists(gid, uid int64) bool {
	m.RLock()
	defer m.RUnlock()
	if members, ok := m.Data[gid]; ok {
		_, in := members[uid]
		return in
	}
	return false
}

// SetAll atomically swaps in a freshly built membership map.
func (m *UserGroupMemberMap) SetAll(data map[int64]map[int64]struct{}) {
	m.Lock()
	defer m.Unlock()
	m.Data = data
}

@ -1,186 +0,0 @@
3.1.1
影响模块n9e-job
更新内容job模块之前给监控用的callback地址method误设置为了get是不对的改成了post
3.1.2
影响模块n9e-rdb
更新内容:子节点修改的时候,不允许修改为租户节点
3.1.3
影响模块n9e-monapi
更新内容对于P2、P3的告警会发送重复的两条
3.1.4
影响模块n9e-index n9e-judge n9e-monapi n9e-rdb n9e-transfer n9e-tsdb
更新内容把hbs的逻辑从monapi挪到rdb拆分监控的权限点
3.1.5
影响模块n9e-monapi
更新内容清理策略的时候会空指针node删除了策略还在此时会复现
3.1.6
影响模块n9e-ams etc/gop.yml
更新内容主机设备增加了扩展字段的管理用于维护一些位置信息、过保信息增加了新的sqlsql/n9e_ams_3.1.6.sql
3.2.0
影响模块n9e-agent etc/agent.yml
更新内容agent支持metrics指标采集能力这个版本是为商业版本服务的开源用户无需更新
3.3.0
影响模块n9e-rdb n9e-transfer n9e-judge n9e-ams n9e-monapi sql/n9e_rdb_3.3.0.sql etc/*.tpl
更新内容增强安全性密码复杂度、cookie处理优化等支持M3DB作为存储后端如果要尝试M3需要修改transfer、monapi配置文件修复告警引擎与条件串数的问题为主机设备增加自定义字段的能力
3.3.1
影响模块n9e-job n9e-rdb n9e-agent n9e-ams n9e-judge
更新内容修复job模块的一个调度bugrdb支持根据org搜索useragent在fields变化时及时感知fields和host扩展字段联动解决上个版本引入的judge处理nodata的问题
3.4.0
升级内容:
- 增强了安全性引入了session机制写入cookie的内容从user.uuid变更为随机session.id
- 修复部分sql注入漏洞
- 告警引擎函数优化all、c_avg_rate_abs等
- 告警消息内容优化可以展示设备名称和设备备注感谢冯骐的PR
- 增加了大盘导入导出功能
升级方法:
- 除了agent、tsdb、index的二进制不用升级其他所有模块的二进制都要升级
- job ams monapi rdb 四个模块的配置文件中的cookieName全部换成ecmc-sid
- rdb的配置文件发生了较大变化需要对照升级
- sql目录下有几个3.4.0的sql需要导入
3.4.1
升级内容:
- 修复日志监控采集策略配置了tag但是无法编辑的问题
升级方法:
- 更新monapi的二进制即可
3.5.0
升级内容:
- 引入了组件监控模块prober内置了mysql、redis、mongo监控采集能力
- 引入了内置监控大盘和内置告警策略,可以在任意节点一键导入内置大盘和策略
升级方法:
- n9e-monapi n9e-rdb n9e-transfer n9e-ams n9e-job 的二进制要升级
- n9e-agent也可以升级解决了进程监控的性能问题如果不在意可以不升级
- n9e-prober 模块需要新部署
- sql目录下有个3.5.0的sql patch文件需要导入
- etc目录下新增了screen、alert两个目录需要拷贝到生产环境
- etc目录下新增了plugins目录需要随着prober模块走
- etc/address.yml里增加prober的配置
3.5.1
升级内容:
- monapi里的alarmEnabled默认值设置为true
- agent进程采集忽略EOF日志
- agent增加一个接口获取endpoint
- agent日志监控支持一种新的日志时间格式
- 修复组件监控调整采集频率不生效的问题
升级方法:
- 替换n9e-monapi n9e-prober n9e-agent二进制升级pub下的前端资源文件
3.5.2
升级内容:
- prober模板支持匿名结构体结构体嵌套
- prober插件添加了对TLS的支持
- 修复prober上报没有port的问题
升级方法:
- 替换n9e-prober n9e-monapi二进制升级pub下的前端资源文件
3.6.0
升级内容:
- prober模块支持nginx、elasticsearch、prometheus的监控采集prometheus转换时姑且干掉了 Histogram 和 Summary
- 告警消息中节点挂载关系做了去重处理
升级方法:
- 替换n9e-prober n9e-monapi二进制
3.7.0
升级内容:
- 调整session清理频率
- 新增zookeeper、tengine、rabbitmq、haproxy、ping、telnet相关采集工具
- bugfix集群部署的时候多个redis实例judge只能识别最后一个实例的问题
升级方法:
- sql/n9e_rdb-v3.7.0.sql 有个新的表结构,需要导入一下
- 替换n9e-rdb n9e-prober n9e-judge n9e-monapi二进制前端没有升级
- 将etc/plugins里zookeeper.yml,tengine.yml等新增的yml文件复制到配置文件里
3.7.1
升级内容:
- prober采集增加dryrun测试方法可以测试是否真的能采集到数据
- 增加dns_query插件对dns做监控
- 内置大盘增加n9e内置模块大盘
- 如果存储使用m3支持在transfer配置一次查询每条线最多返回的原始点数
- 日志监控可以把最后一条日志放到extra字段报警的时候可以展示需要升级n9e-agent n9e-monapi
- 修复agent对进程监控采集的bug进程cpu使用采集的不准确
- 修改告警策略配置多个团队的时候不生效的问题
- monapi支持一个新的timestamp格式
升级方法:
- sql/n9e_mon-v3.7.1.sql变更了表结构需要执行一下
- 将etc/plugins里的dns_query.yml放到生产环境的etc/plugins目录下
- 将etc/screen/n9e_modules放到生产环境的etc/screen目录下
- 替换n9e-rdb n9e-prober n9e-monapi n9e-transfer n9e-agent二进制
3.8.0
升级内容:
- monapi优化告警策略中用户信息补全逻辑
- rdb新增接口,查询项目下用户拥有的资源权限点
- transfer查询索引接口支持指定时间范围
- prober去掉组件采集默认的白名单设置
升级方法:
- 替换n9e-rdb n9e-prober n9e-monapi n9e-transfer二进制
- 将etc/password-changed-email.tpl放到生产环境的etc目录下
4.0.0
升级内容:
- 服务端模块合并为一个模块
- agentd和server的调用全部走rpc
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 使用新的etc替换掉原来的etc
- 使用etc/nginx.conf替换原来的nginx.conf
- n9e-prober替换旧的n9e-prober
- n9e-agentd替换n9e-agent
- n9e-server替换n9e-rdb、n9e-ams、n9e-job、n9e-monapi、n9e-transfer、n9e-judge
4.0.1
升级内容:
- 修复消息通知的问题
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 将 *.tpl 文件放到 etc/tpl 下
- 替换etc/server.yml
- 替换n9e-server
4.0.2
升级内容:
- 优化告警接收人补全逻辑
- 增加postgresql监控插件
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 替换n9e-server n9e-prober
4.0.3
升级内容:
- 修复nodata恢复告警重复问题
升级方法:
- 替换n9e-server
5.0.0-rc1
升级内容:
- 发布v5预览版
部署方式:
- 见文档 https://n9e.didiyun.com/docs/install/
5.0.0-rc2
升级内容:
- 修复若干问题
- 新增告警策略,监控大盘导入、导出和内置模板功能
- 新增概览页面
部署方式:
- 见文档 https://n9e.didiyun.com/docs/install/

@ -1,176 +0,0 @@
package config
import (
"bytes"
"fmt"
"net"
"os"
"strings"
"github.com/spf13/viper"
"github.com/toolkits/pkg/file"
"github.com/didi/nightingale/v5/backend"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/pkg/i18n"
"github.com/didi/nightingale/v5/pkg/iconf"
"github.com/didi/nightingale/v5/pkg/ilog"
)
// ConfigStruct is the full server configuration, populated from the
// server yml file by Parse.
type ConfigStruct struct {
	Logger         ilog.Config         `yaml:"logger"`
	HTTP           httpSection         `yaml:"http"`
	RPC            rpcSection          `yaml:"rpc"`
	LDAP           models.LdapSection  `yaml:"ldap"`
	MySQL          models.MysqlSection `yaml:"mysql"`
	Heartbeat      heartbeatSection    `yaml:"heartbeat"`
	I18N           i18n.Config         `yaml:"i18n"`
	Judge          judgeSection        `yaml:"judge"`
	Alert          alertSection        `yaml:"alert"`
	Trans          transSection        `yaml:"trans"`
	ContactKeys    []contactKey        `yaml:"contactKeys"`
	NotifyChannels []string            `yaml:"notifyChannels"`
	Tpl            tplSection          `yaml:"tpl"`
}

// tplSection locates the built-in alert-rule and dashboard templates.
type tplSection struct {
	AlertRulePath string `yaml:"alertRulePath"`
	DashboardPath string `yaml:"dashboardPath"`
}

// alertSection configures alert notification dispatch.
type alertSection struct {
	NotifyScriptPath        string `yaml:"notifyScriptPath"`
	NotifyScriptConcurrency int    `yaml:"notifyScriptConcurrency"`
	MutedAlertPersist       bool   `yaml:"mutedAlertPersist"`
}

// transSection configures the metric-transfer backend.
type transSection struct {
	Enable  bool                   `yaml:"enable"`
	Backend backend.BackendSection `yaml:"backend"`
}

// judgeSection tunes the judge client: batch size, timeouts (ms per the
// defaults in Parse) and connection-pool limits.
type judgeSection struct {
	ReadBatch   int `yaml:"readBatch"`
	ConnTimeout int `yaml:"connTimeout"`
	CallTimeout int `yaml:"callTimeout"`
	WriterNum   int `yaml:"writerNum"`
	ConnMax     int `yaml:"connMax"`
	ConnIdle    int `yaml:"connIdle"`
}

// heartbeatSection controls how the server advertises itself.
type heartbeatSection struct {
	IP        string `yaml:"ip"`       // empty means auto-detect via GetOutboundIP
	LocalAddr string `yaml:"-"`        // derived in Parse: IP + RPC listen port
	Interval  int64  `yaml:"interval"` // milliseconds (printed as "%dms" in Parse)
}

// httpSection configures the HTTP listener and session cookies.
type httpSection struct {
	Mode           string `yaml:"mode"`
	Access         bool   `yaml:"access"`
	Listen         string `yaml:"listen"`
	Pprof          bool   `yaml:"pprof"`
	CookieName     string `yaml:"cookieName"`
	CookieDomain   string `yaml:"cookieDomain"`
	CookieSecure   bool   `yaml:"cookieSecure"`
	CookieHttpOnly bool   `yaml:"cookieHttpOnly"`
	CookieMaxAge   int    `yaml:"cookieMaxAge"`
	CookieSecret   string `yaml:"cookieSecret"`
	CsrfSecret     string `yaml:"csrfSecret"`
}

// rpcSection configures the RPC listener address.
type rpcSection struct {
	Listen string `yaml:"listen"`
}

// contactKey describes one configurable user contact-field entry.
type contactKey struct {
	Label string `yaml:"label" json:"label"`
	Key   string `yaml:"key" json:"key"`
}

// Config is the process-wide configuration, set once by Parse.
var Config *ConfigStruct
// Parse locates the server yml config, loads it through viper with the
// defaults below, and fills the package-level Config. It also resolves the
// heartbeat IP and LocalAddr. Returns an error when the file is missing or
// unreadable; exits the process when no outbound IP can be detected.
func Parse() error {
	ymlFile := iconf.GetYmlFile("server")
	if ymlFile == "" {
		return fmt.Errorf("configuration file of server not found")
	}

	bs, err := file.ReadBytes(ymlFile)
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}

	viper.SetConfigType("yaml")
	err = viper.ReadConfig(bytes.NewBuffer(bs))
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}

	// default value settings
	viper.SetDefault("i18n.lang", "zh")
	viper.SetDefault("heartbeat.interval", 1000)
	viper.SetDefault("judge.readBatch", 2000)
	viper.SetDefault("judge.connTimeout", 2000)
	viper.SetDefault("judge.callTimeout", 5000)
	viper.SetDefault("judge.writerNum", 256)
	viper.SetDefault("judge.connMax", 2560)
	viper.SetDefault("judge.connIdle", 256)
	viper.SetDefault("alert.notifyScriptPath", "./etc/script/notify.py")
	viper.SetDefault("alert.notifyScriptConcurrency", 200)
	viper.SetDefault("alert.mutedAlertPersist", true)
	viper.SetDefault("trans.backend.prometheus.lookbackDeltaMinute", 2)
	viper.SetDefault("trans.backend.prometheus.maxConcurrentQuery", 30)
	viper.SetDefault("trans.backend.prometheus.maxSamples", 50000000)
	viper.SetDefault("trans.backend.prometheus.maxFetchAllSeriesLimitMinute", 5)
	viper.SetDefault("trans.backend.prometheus.slowLogRecordSecond", 3)
	viper.SetDefault("trans.backend.prometheus.defaultFetchSeriesQl", `{__name__=~"system.*"}`)
	viper.SetDefault("tpl.alertRulePath", "./etc/alert_rule")
	viper.SetDefault("tpl.dashboardPath", "./etc/dashboard")

	err = viper.Unmarshal(&Config)
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}

	fmt.Println("config.file:", ymlFile)

	if Config.Heartbeat.IP == "" {
		// auto detect
		Config.Heartbeat.IP = fmt.Sprint(GetOutboundIP())
		if Config.Heartbeat.IP == "" {
			fmt.Println("heartbeat ip auto got is blank")
			os.Exit(1)
		}
	}

	// The user may pin heartbeat.ip in the config, e.g. on hosts without a
	// network where auto-detection fails with
	// "auto get outbound ip fail: dial udp 8.8.8.8:80: connect: network is unreachable",
	// so LocalAddr must be derived here in either case.
	port := strings.Split(Config.RPC.Listen, ":")[1]
	Config.Heartbeat.LocalAddr = Config.Heartbeat.IP + ":" + port

	// Normally this will not be 127.0.0.1, but a single-host deployment on a
	// machine with no network (local debugging) can legitimately produce it.
	// if Config.Heartbeat.IP == "127.0.0.1" {
	// 	fmt.Println("heartbeat ip is 127.0.0.1 and it is useless, so, exit")
	// 	os.Exit(1)
	// }

	fmt.Println("heartbeat.ip:", Config.Heartbeat.IP)
	fmt.Printf("heartbeat.interval: %dms\n", Config.Heartbeat.Interval)

	return nil
}
// GetOutboundIP returns this machine's preferred outbound IP by opening a
// UDP "connection" to a public address (UDP dial sends no packets) and
// reading the local address the kernel picked. Exits the process on failure.
func GetOutboundIP() net.IP {
	conn, err := net.Dial("udp", "8.8.8.8:80")
	if err != nil {
		fmt.Println("auto get outbound ip fail:", err)
		os.Exit(1)
	}
	defer conn.Close()

	return conn.LocalAddr().(*net.UDPAddr).IP
}

@ -1,6 +0,0 @@
package config
// EndpointName is the name the server uses when it periodically heartbeats
// itself into the database.
const EndpointName = "server_rpc"

// Version is stamped at build time via -ldflags "-X ...config.Version=...".
var Version = "not specified"

@ -1,71 +0,0 @@
package config
import "github.com/didi/nightingale/v5/pkg/i18n"
var (
dict = map[string]string{
"Login fail, check your username and password": "登录失败,请检查您的用户名和密码",
"Internal server error, try again later please": "系统内部错误,请稍后再试",
"Each user has at most two tokens": "每个用户至多创建两个密钥",
"No such token": "密钥不存在",
"Username is blank": "用户名不能为空",
"Username has invalid characters": "用户名含有非法字符",
"Nickname has invalid characters": "用户昵称含有非法字符",
"Phone invalid": "手机号格式有误",
"Email invalid": "邮箱格式有误",
"Incorrect old password": "旧密码错误",
"Username %s already exists": "用户名(%s)已存在",
"No such user": "用户不存在",
"UserGroup %s already exists": "用户组(%s)已存在",
"Group name has invalid characters": "分组名称含有非法字符",
"Group note has invalid characters": "分组备注含有非法字符",
"No such user group": "用户组不存在",
"Classpath path has invalid characters": "机器分组路径含有非法字符",
"Classpath note has invalid characters": "机器分组路径备注含有非法字符",
"There are still resources under the classpath": "机器分组路径下仍然挂有资源",
"There are still collect rules under the classpath": "机器分组路径下仍然存在采集策略",
"No such classpath": "机器分组路径不存在",
"Classpath %s already exists": "机器分组路径(%s)已存在",
"Preset classpath %s cannot delete": "内置机器分组(%s)不允许删除",
"No such mute config": "此屏蔽配置不存在",
"DashboardGroup name has invalid characters": "大盘分组名称含有非法字符",
"DashboardGroup name is blank": "大盘分组名称为空",
"DashboardGroup %s already exists": "大盘分组(%s)已存在",
"No such dashboard group": "大盘分组不存在",
"Dashboard name has invalid characters": "大盘名称含有非法字符",
"Dashboard %s already exists": "监控大盘(%s)已存在",
"ChartGroup name has invalid characters": "图表分组名称含有非法字符",
"No such dashboard": "监控大盘不存在",
"No such chart group": "图表分组不存在",
"No such chart": "图表不存在",
"There are still dashboards under the group": "分组下面仍然存在监控大盘,请先从组内移出",
"AlertRuleGroup name has invalid characters": "告警规则分组含有非法字符",
"AlertRuleGroup %s already exists": "告警规则分组(%s)已存在",
"There are still alert rules under the group": "分组下面仍然存在告警规则",
"AlertRule name has invalid characters": "告警规则含有非法字符",
"No such alert rule": "告警规则不存在",
"No such alert rule group": "告警规则分组不存在",
"No such alert event": "告警事件不存在",
"Alert rule %s already exists": "告警规则(%s)已存在",
"No such collect rule": "采集规则不存在",
"Decoded metric description empty": "导入的指标释义列表为空",
"User disabled": "用户已被禁用",
"Tags(%s) invalid": "标签(%s)格式不合法",
"Resource filter(Func:%s)'s param invalid": "资源过滤条件(函数:%s)参数不合法(为空或包含空格都不合法)",
"Tags filter(Func:%s)'s param invalid": "标签过滤条件(函数:%s)参数不合法(为空或包含空格都不合法)",
"Regexp: %s cannot be compiled": "正则表达式(%s)不合法,无法编译",
"AppendTags(%s) invalid": "附件标签(%s)格式不合法",
"Regexp %s matching failed": "正则表达式 %s 匹配失败",
"Regexp %s matched, but cannot get substring()": "主正则 %s 匹配成功,但无法匹配到子串",
"TagKey or TagValue contains illegal characters[:,/=\r\n\t]": "标签KEY或者标签值包含非法字符串[:,/=\r\n\t]",
"Resource cannot delete in preset classpath": "预置分组不能删除资源",
"No such resource %s": "不存在该资源(%s)",
}
langDict = map[string]map[string]string{
"zh": dict,
}
)
// init registers the built-in "zh" dictionary with the i18n package so
// server messages can be localized.
func init() {
	i18n.DictRegister(langDict)
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save