forked from pneymrl2f/nightingale
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
177 lines
4.8 KiB
177 lines
4.8 KiB
// Copyright (c) 2017 Uber Technologies, Inc.
|
|
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
package tchannel
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/net/context"
|
|
)
|
|
|
|
// Defaults for active health checks and the capacity of the health check
// history kept per healthHistory.
const (
	// _defaultHealthCheckTimeout is the timeout used for a health check when
	// HealthCheckOptions.Timeout is not specified.
	_defaultHealthCheckTimeout = 1 * time.Second

	// _defaultHealthCheckFailuresToClose is the number of consecutive failed
	// health checks used when HealthCheckOptions.FailuresToClose is not
	// specified.
	_defaultHealthCheckFailuresToClose = 5

	// _healthHistorySize is the number of results a healthHistory retains.
	_healthHistorySize = 256
)
|
|
|
|
// HealthCheckOptions configures active TChannel health checks. The checks are
// implemented with TChannel ping messages and verify the health of the TCP
// connection (similar to TCP keep-alives); they are not intended to report
// application level health.
type HealthCheckOptions struct {
	// Interval is the period between health checks. If this is zero, active
	// health checks are disabled.
	Interval time.Duration

	// Timeout is the timeout to use for a single health check.
	// If no value is specified, it defaults to time.Second.
	Timeout time.Duration

	// FailuresToClose is the number of consecutive health check failures
	// that will cause the connection to be closed.
	// If no value is specified, it defaults to 5.
	FailuresToClose int
}
|
|
|
|
// healthHistory is a fixed-size circular buffer of health check results. The
// embedded RWMutex guards every field below.
type healthHistory struct {
	sync.RWMutex

	// states is the backing storage for the recorded results.
	states []bool

	// insertAt is the index in states where the next result is written;
	// total is the number of results recorded so far.
	insertAt, total int
}
|
|
|
|
func newHealthHistory() *healthHistory {
|
|
return &healthHistory{
|
|
states: make([]bool, _healthHistorySize),
|
|
}
|
|
}
|
|
|
|
func (hh *healthHistory) add(b bool) {
|
|
hh.Lock()
|
|
defer hh.Unlock()
|
|
|
|
hh.states[hh.insertAt] = b
|
|
hh.insertAt = (hh.insertAt + 1) % _healthHistorySize
|
|
hh.total++
|
|
}
|
|
|
|
func (hh *healthHistory) asBools() []bool {
|
|
hh.RLock()
|
|
defer hh.RUnlock()
|
|
|
|
if hh.total < _healthHistorySize {
|
|
return append([]bool(nil), hh.states[:hh.total]...)
|
|
}
|
|
|
|
states := hh.states
|
|
copyStates := make([]bool, 0, _healthHistorySize)
|
|
copyStates = append(copyStates, states[hh.insertAt:]...)
|
|
copyStates = append(copyStates, states[:hh.insertAt]...)
|
|
return copyStates
|
|
}
|
|
|
|
func (hco HealthCheckOptions) enabled() bool {
|
|
return hco.Interval > 0
|
|
}
|
|
|
|
func (hco HealthCheckOptions) withDefaults() HealthCheckOptions {
|
|
if hco.Timeout == 0 {
|
|
hco.Timeout = _defaultHealthCheckTimeout
|
|
}
|
|
if hco.FailuresToClose == 0 {
|
|
hco.FailuresToClose = _defaultHealthCheckFailuresToClose
|
|
}
|
|
return hco
|
|
}
|
|
|
|
// healthCheck will do periodic pings on the connection to check the state of the connection.
|
|
// We accept connID on the stack so can more easily debug panics or leaked goroutines.
|
|
func (c *Connection) healthCheck(connID uint32) {
|
|
defer close(c.healthCheckDone)
|
|
|
|
opts := c.opts.HealthChecks
|
|
|
|
ticker := c.timeTicker(opts.Interval)
|
|
defer ticker.Stop()
|
|
|
|
consecutiveFailures := 0
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
case <-c.healthCheckCtx.Done():
|
|
return
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(c.healthCheckCtx, opts.Timeout)
|
|
err := c.ping(ctx)
|
|
cancel()
|
|
c.healthCheckHistory.add(err == nil)
|
|
if err == nil {
|
|
if c.log.Enabled(LogLevelDebug) {
|
|
c.log.Debug("Performed successful active health check.")
|
|
}
|
|
consecutiveFailures = 0
|
|
continue
|
|
}
|
|
|
|
// If the health check failed because the connection closed or health
|
|
// checks were stopped, we don't need to log or close the connection.
|
|
if GetSystemErrorCode(err) == ErrCodeCancelled || err == ErrInvalidConnectionState {
|
|
c.log.WithFields(ErrField(err)).Debug("Health checker stopped.")
|
|
return
|
|
}
|
|
|
|
consecutiveFailures++
|
|
c.log.WithFields(LogFields{
|
|
{"consecutiveFailures", consecutiveFailures},
|
|
ErrField(err),
|
|
{"failuresToClose", opts.FailuresToClose},
|
|
}...).Warn("Failed active health check.")
|
|
|
|
if consecutiveFailures >= opts.FailuresToClose {
|
|
c.close(LogFields{
|
|
{"reason", "health check failure"},
|
|
ErrField(err),
|
|
}...)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *Connection) stopHealthCheck() {
|
|
// Health checks are not enabled.
|
|
if c.healthCheckDone == nil {
|
|
return
|
|
}
|
|
|
|
// Best effort check to see if health checks were stopped.
|
|
if c.healthCheckCtx.Err() != nil {
|
|
return
|
|
}
|
|
c.log.Debug("Stopping health checks.")
|
|
c.healthCheckQuit()
|
|
<-c.healthCheckDone
|
|
}
|