feat(healthcheck): HEALTH_TARGET_ADDRESS -> HEALTH_TARGET_ADDRESSES
- Specify fallback addresses - Default changed from cloudflare.com:443 to cloudflare.com:443,github.com:443 - Startup check dials each of the specified addresses in parallel, under a global 6s timeout - Full periodic check cycles through the addresses, moving on to the next one each time a try fails and it retries
This commit is contained in:
@@ -163,7 +163,7 @@ ENV VPN_SERVICE_PROVIDER=pia \
|
|||||||
LOG_LEVEL=info \
|
LOG_LEVEL=info \
|
||||||
# Health
|
# Health
|
||||||
HEALTH_SERVER_ADDRESS=127.0.0.1:9999 \
|
HEALTH_SERVER_ADDRESS=127.0.0.1:9999 \
|
||||||
HEALTH_TARGET_ADDRESS=cloudflare.com:443 \
|
HEALTH_TARGET_ADDRESSES=cloudflare.com:443,github.com:443 \
|
||||||
HEALTH_ICMP_TARGET_IP=1.1.1.1 \
|
HEALTH_ICMP_TARGET_IP=1.1.1.1 \
|
||||||
HEALTH_RESTART_VPN=on \
|
HEALTH_RESTART_VPN=on \
|
||||||
# DNS
|
# DNS
|
||||||
|
|||||||
@@ -17,10 +17,11 @@ type Health struct {
|
|||||||
// for the health check server.
|
// for the health check server.
|
||||||
// It cannot be the empty string in the internal state.
|
// It cannot be the empty string in the internal state.
|
||||||
ServerAddress string
|
ServerAddress string
|
||||||
// TargetAddress is the address (host or host:port)
|
// TargetAddresses are the addresses (host or host:port)
|
||||||
// to TCP TLS dial to periodically for the health check.
|
// to TCP TLS dial to periodically for the health check.
|
||||||
// It cannot be the empty string in the internal state.
|
// Addresses after the first one are used as fallbacks for retries.
|
||||||
TargetAddress string
|
// It cannot be empty in the internal state.
|
||||||
|
TargetAddresses []string
|
||||||
// ICMPTargetIP is the IP address to use for ICMP echo requests
|
// ICMPTargetIP is the IP address to use for ICMP echo requests
|
||||||
// in the health checker. It can be set to an unspecified address (0.0.0.0)
|
// in the health checker. It can be set to an unspecified address (0.0.0.0)
|
||||||
// such that the VPN server IP is used, which is also the default behavior.
|
// such that the VPN server IP is used, which is also the default behavior.
|
||||||
@@ -41,10 +42,10 @@ func (h Health) Validate() (err error) {
|
|||||||
|
|
||||||
func (h *Health) copy() (copied Health) {
|
func (h *Health) copy() (copied Health) {
|
||||||
return Health{
|
return Health{
|
||||||
ServerAddress: h.ServerAddress,
|
ServerAddress: h.ServerAddress,
|
||||||
TargetAddress: h.TargetAddress,
|
TargetAddresses: h.TargetAddresses,
|
||||||
ICMPTargetIP: h.ICMPTargetIP,
|
ICMPTargetIP: h.ICMPTargetIP,
|
||||||
RestartVPN: gosettings.CopyPointer(h.RestartVPN),
|
RestartVPN: gosettings.CopyPointer(h.RestartVPN),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,14 +54,14 @@ func (h *Health) copy() (copied Health) {
|
|||||||
// settings.
|
// settings.
|
||||||
func (h *Health) OverrideWith(other Health) {
|
func (h *Health) OverrideWith(other Health) {
|
||||||
h.ServerAddress = gosettings.OverrideWithComparable(h.ServerAddress, other.ServerAddress)
|
h.ServerAddress = gosettings.OverrideWithComparable(h.ServerAddress, other.ServerAddress)
|
||||||
h.TargetAddress = gosettings.OverrideWithComparable(h.TargetAddress, other.TargetAddress)
|
h.TargetAddresses = gosettings.OverrideWithSlice(h.TargetAddresses, other.TargetAddresses)
|
||||||
h.ICMPTargetIP = gosettings.OverrideWithComparable(h.ICMPTargetIP, other.ICMPTargetIP)
|
h.ICMPTargetIP = gosettings.OverrideWithComparable(h.ICMPTargetIP, other.ICMPTargetIP)
|
||||||
h.RestartVPN = gosettings.OverrideWithPointer(h.RestartVPN, other.RestartVPN)
|
h.RestartVPN = gosettings.OverrideWithPointer(h.RestartVPN, other.RestartVPN)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Health) SetDefaults() {
|
func (h *Health) SetDefaults() {
|
||||||
h.ServerAddress = gosettings.DefaultComparable(h.ServerAddress, "127.0.0.1:9999")
|
h.ServerAddress = gosettings.DefaultComparable(h.ServerAddress, "127.0.0.1:9999")
|
||||||
h.TargetAddress = gosettings.DefaultComparable(h.TargetAddress, "cloudflare.com:443")
|
h.TargetAddresses = gosettings.DefaultSlice(h.TargetAddresses, []string{"cloudflare.com:443", "github.com:443"})
|
||||||
h.ICMPTargetIP = gosettings.DefaultComparable(h.ICMPTargetIP, netip.IPv4Unspecified()) // use the VPN server IP
|
h.ICMPTargetIP = gosettings.DefaultComparable(h.ICMPTargetIP, netip.IPv4Unspecified()) // use the VPN server IP
|
||||||
h.RestartVPN = gosettings.DefaultPointer(h.RestartVPN, true)
|
h.RestartVPN = gosettings.DefaultPointer(h.RestartVPN, true)
|
||||||
}
|
}
|
||||||
@@ -72,7 +73,10 @@ func (h Health) String() string {
|
|||||||
func (h Health) toLinesNode() (node *gotree.Node) {
|
func (h Health) toLinesNode() (node *gotree.Node) {
|
||||||
node = gotree.New("Health settings:")
|
node = gotree.New("Health settings:")
|
||||||
node.Appendf("Server listening address: %s", h.ServerAddress)
|
node.Appendf("Server listening address: %s", h.ServerAddress)
|
||||||
node.Appendf("Target address: %s", h.TargetAddress)
|
targetAddrs := node.Appendf("Target addresses:")
|
||||||
|
for _, targetAddr := range h.TargetAddresses {
|
||||||
|
targetAddrs.Append(targetAddr)
|
||||||
|
}
|
||||||
icmpTarget := "VPN server IP"
|
icmpTarget := "VPN server IP"
|
||||||
if !h.ICMPTargetIP.IsUnspecified() {
|
if !h.ICMPTargetIP.IsUnspecified() {
|
||||||
icmpTarget = h.ICMPTargetIP.String()
|
icmpTarget = h.ICMPTargetIP.String()
|
||||||
@@ -84,8 +88,8 @@ func (h Health) toLinesNode() (node *gotree.Node) {
|
|||||||
|
|
||||||
func (h *Health) Read(r *reader.Reader) (err error) {
|
func (h *Health) Read(r *reader.Reader) (err error) {
|
||||||
h.ServerAddress = r.String("HEALTH_SERVER_ADDRESS")
|
h.ServerAddress = r.String("HEALTH_SERVER_ADDRESS")
|
||||||
h.TargetAddress = r.String("HEALTH_TARGET_ADDRESS",
|
h.TargetAddresses = r.CSV("HEALTH_TARGET_ADDRESSES",
|
||||||
reader.RetroKeys("HEALTH_ADDRESS_TO_PING"))
|
reader.RetroKeys("HEALTH_ADDRESS_TO_PING", "HEALTH_TARGET_ADDRESS"))
|
||||||
h.ICMPTargetIP, err = r.NetipAddr("HEALTH_ICMP_TARGET_IP")
|
h.ICMPTargetIP, err = r.NetipAddr("HEALTH_ICMP_TARGET_IP")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|||||||
@@ -57,7 +57,9 @@ func Test_Settings_String(t *testing.T) {
|
|||||||
| └── Log level: INFO
|
| └── Log level: INFO
|
||||||
├── Health settings:
|
├── Health settings:
|
||||||
| ├── Server listening address: 127.0.0.1:9999
|
| ├── Server listening address: 127.0.0.1:9999
|
||||||
| ├── Target address: cloudflare.com:443
|
| ├── Target addresses:
|
||||||
|
| | ├── cloudflare.com:443
|
||||||
|
| | └── github.com:443
|
||||||
| ├── ICMP target IP: VPN server IP
|
| ├── ICMP target IP: VPN server IP
|
||||||
| └── Restart VPN on healthcheck failure: yes
|
| └── Restart VPN on healthcheck failure: yes
|
||||||
├── Shadowsocks server settings:
|
├── Shadowsocks server settings:
|
||||||
|
|||||||
@@ -16,13 +16,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Checker struct {
|
type Checker struct {
|
||||||
tlsDialAddr string
|
tlsDialAddrs []string
|
||||||
dialer *net.Dialer
|
dialer *net.Dialer
|
||||||
echoer *icmp.Echoer
|
echoer *icmp.Echoer
|
||||||
dnsClient *dns.Client
|
dnsClient *dns.Client
|
||||||
logger Logger
|
logger Logger
|
||||||
icmpTarget netip.Addr
|
icmpTarget netip.Addr
|
||||||
configMutex sync.Mutex
|
configMutex sync.Mutex
|
||||||
|
|
||||||
icmpNotPermitted bool
|
icmpNotPermitted bool
|
||||||
smallCheckName string
|
smallCheckName string
|
||||||
@@ -45,13 +45,13 @@ func NewChecker(logger Logger) *Checker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetConfig sets the TCP+TLS dial address and the ICMP echo IP address
|
// SetConfig sets the TCP+TLS dial addresses and the ICMP echo IP address
|
||||||
// to target by the [Checker].
|
// to target by the [Checker].
|
||||||
// This function MUST be called before calling [Checker.Start].
|
// This function MUST be called before calling [Checker.Start].
|
||||||
func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) {
|
func (c *Checker) SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr) {
|
||||||
c.configMutex.Lock()
|
c.configMutex.Lock()
|
||||||
defer c.configMutex.Unlock()
|
defer c.configMutex.Unlock()
|
||||||
c.tlsDialAddr = tlsDialAddr
|
c.tlsDialAddrs = tlsDialAddrs
|
||||||
c.icmpTarget = icmpTarget
|
c.icmpTarget = icmpTarget
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,17 +63,11 @@ func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) {
|
|||||||
// It returns an error if the initial TCP+TLS check fails.
|
// It returns an error if the initial TCP+TLS check fails.
|
||||||
// The Checker has to be ultimately stopped by calling [Checker.Stop].
|
// The Checker has to be ultimately stopped by calling [Checker.Stop].
|
||||||
func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) {
|
func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) {
|
||||||
if c.tlsDialAddr == "" || c.icmpTarget.IsUnspecified() {
|
if len(c.tlsDialAddrs) == 0 || c.icmpTarget.IsUnspecified() {
|
||||||
panic("call Checker.SetConfig with non empty values before Checker.Start")
|
panic("call Checker.SetConfig with non empty values before Checker.Start")
|
||||||
}
|
}
|
||||||
|
|
||||||
// connection isn't under load yet when the checker starts, so a short
|
err = c.startupCheck(ctx)
|
||||||
// 6 seconds timeout suffices and provides quick enough feedback that
|
|
||||||
// the new connection is not working.
|
|
||||||
const timeout = 6 * time.Second
|
|
||||||
tcpTLSCheckCtx, tcpTLSCheckCancel := context.WithTimeout(ctx, timeout)
|
|
||||||
err = tcpTLSCheck(tcpTLSCheckCtx, c.dialer, c.tlsDialAddr)
|
|
||||||
tcpTLSCheckCancel()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("startup check: %w", err)
|
return nil, fmt.Errorf("startup check: %w", err)
|
||||||
}
|
}
|
||||||
@@ -123,6 +117,7 @@ func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error)
|
|||||||
func (c *Checker) Stop() error {
|
func (c *Checker) Stop() error {
|
||||||
c.stop()
|
c.stop()
|
||||||
<-c.done
|
<-c.done
|
||||||
|
c.tlsDialAddrs = nil
|
||||||
c.icmpTarget = netip.Addr{}
|
c.icmpTarget = netip.Addr{}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -143,7 +138,7 @@ func (c *Checker) smallPeriodicCheck(ctx context.Context) error {
|
|||||||
15 * time.Second,
|
15 * time.Second,
|
||||||
30 * time.Second,
|
30 * time.Second,
|
||||||
}
|
}
|
||||||
check := func(ctx context.Context) error {
|
check := func(ctx context.Context, _ int) error {
|
||||||
if c.icmpNotPermitted {
|
if c.icmpNotPermitted {
|
||||||
return c.dnsClient.Check(ctx)
|
return c.dnsClient.Check(ctx)
|
||||||
}
|
}
|
||||||
@@ -163,8 +158,9 @@ func (c *Checker) fullPeriodicCheck(ctx context.Context) error {
|
|||||||
// 20s timeout in case the connection is under stress
|
// 20s timeout in case the connection is under stress
|
||||||
// See https://github.com/qdm12/gluetun/issues/2270
|
// See https://github.com/qdm12/gluetun/issues/2270
|
||||||
tryTimeouts := []time.Duration{10 * time.Second, 15 * time.Second, 30 * time.Second}
|
tryTimeouts := []time.Duration{10 * time.Second, 15 * time.Second, 30 * time.Second}
|
||||||
check := func(ctx context.Context) error {
|
check := func(ctx context.Context, try int) error {
|
||||||
return tcpTLSCheck(ctx, c.dialer, c.tlsDialAddr)
|
tlsDialAddr := c.tlsDialAddrs[try%len(c.tlsDialAddrs)]
|
||||||
|
return tcpTLSCheck(ctx, c.dialer, tlsDialAddr)
|
||||||
}
|
}
|
||||||
return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check)
|
return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check)
|
||||||
}
|
}
|
||||||
@@ -226,7 +222,7 @@ func makeAddressToDial(address string) (addressToDial string, err error) {
|
|||||||
var ErrAllCheckTriesFailed = errors.New("all check tries failed")
|
var ErrAllCheckTriesFailed = errors.New("all check tries failed")
|
||||||
|
|
||||||
func withRetries(ctx context.Context, tryTimeouts []time.Duration,
|
func withRetries(ctx context.Context, tryTimeouts []time.Duration,
|
||||||
logger Logger, checkName string, check func(ctx context.Context) error,
|
logger Logger, checkName string, check func(ctx context.Context, try int) error,
|
||||||
) error {
|
) error {
|
||||||
maxTries := len(tryTimeouts)
|
maxTries := len(tryTimeouts)
|
||||||
type errData struct {
|
type errData struct {
|
||||||
@@ -237,7 +233,7 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration,
|
|||||||
for i, timeout := range tryTimeouts {
|
for i, timeout := range tryTimeouts {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
checkCtx, cancel := context.WithTimeout(ctx, timeout)
|
checkCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||||
err := check(checkCtx)
|
err := check(checkCtx, i)
|
||||||
cancel()
|
cancel()
|
||||||
switch {
|
switch {
|
||||||
case err == nil:
|
case err == nil:
|
||||||
@@ -256,3 +252,48 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration,
|
|||||||
}
|
}
|
||||||
return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", "))
|
return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", "))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Checker) startupCheck(ctx context.Context) error {
|
||||||
|
// connection isn't under load yet when the checker starts, so a short
|
||||||
|
// 6 seconds timeout suffices and provides quick enough feedback that
|
||||||
|
// the new connection is not working. However, since the addresses to dial
|
||||||
|
// may be multiple, we run the check in parallel. If any succeeds, the check passes.
|
||||||
|
// This is to prevent false negatives at startup, if one of the addresses is down
|
||||||
|
// for external reasons.
|
||||||
|
const timeout = 6 * time.Second
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||||
|
defer cancel()
|
||||||
|
errCh := make(chan error)
|
||||||
|
|
||||||
|
for _, address := range c.tlsDialAddrs {
|
||||||
|
go func(addr string) {
|
||||||
|
err := tcpTLSCheck(ctx, c.dialer, addr)
|
||||||
|
errCh <- err
|
||||||
|
}(address)
|
||||||
|
}
|
||||||
|
|
||||||
|
errs := make([]error, 0, len(c.tlsDialAddrs))
|
||||||
|
success := false
|
||||||
|
for range c.tlsDialAddrs {
|
||||||
|
err := <-errCh
|
||||||
|
if err == nil {
|
||||||
|
success = true
|
||||||
|
cancel()
|
||||||
|
continue
|
||||||
|
} else if success {
|
||||||
|
continue // ignore canceled errors after success
|
||||||
|
}
|
||||||
|
|
||||||
|
c.logger.Debugf("startup check parallel attempt failed: %s", err)
|
||||||
|
errs = append(errs, err)
|
||||||
|
}
|
||||||
|
if success {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
errStrings := make([]string, len(errs))
|
||||||
|
for i, err := range errs {
|
||||||
|
errStrings[i] = fmt.Sprintf("parallel attempt %d/%d failed: %s", i+1, len(errs), err)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", "))
|
||||||
|
}
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ func Test_Checker_fullcheck(t *testing.T) {
|
|||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
dialer := &net.Dialer{}
|
dialer := &net.Dialer{}
|
||||||
const address = "cloudflare.com:443"
|
addresses := []string{"badaddress:9876", "cloudflare.com:443", "google.com:443"}
|
||||||
|
|
||||||
checker := &Checker{
|
checker := &Checker{
|
||||||
dialer: dialer,
|
dialer: dialer,
|
||||||
tlsDialAddr: address,
|
tlsDialAddrs: addresses,
|
||||||
}
|
}
|
||||||
|
|
||||||
canceledCtx, cancel := context.WithCancel(context.Background())
|
canceledCtx, cancel := context.WithCancel(context.Background())
|
||||||
@@ -52,8 +52,8 @@ func Test_Checker_fullcheck(t *testing.T) {
|
|||||||
|
|
||||||
dialer := &net.Dialer{}
|
dialer := &net.Dialer{}
|
||||||
checker := &Checker{
|
checker := &Checker{
|
||||||
dialer: dialer,
|
dialer: dialer,
|
||||||
tlsDialAddr: listeningAddress.String(),
|
tlsDialAddrs: []string{listeningAddress.String()},
|
||||||
}
|
}
|
||||||
|
|
||||||
err = checker.fullPeriodicCheck(ctx)
|
err = checker.fullPeriodicCheck(ctx)
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ type CmdStarter interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HealthChecker interface {
|
type HealthChecker interface {
|
||||||
SetConfig(tlsDialAddr string, icmpTarget netip.Addr)
|
SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr)
|
||||||
Start(ctx context.Context) (runError <-chan error, err error)
|
Start(ctx context.Context) (runError <-chan error, err error)
|
||||||
Stop() error
|
Stop() error
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ func (l *Loop) onTunnelUp(ctx, loopCtx context.Context, data tunnelUpData) {
|
|||||||
if icmpTarget.IsUnspecified() {
|
if icmpTarget.IsUnspecified() {
|
||||||
icmpTarget = data.serverIP
|
icmpTarget = data.serverIP
|
||||||
}
|
}
|
||||||
l.healthChecker.SetConfig(l.healthSettings.TargetAddress, icmpTarget)
|
l.healthChecker.SetConfig(l.healthSettings.TargetAddresses, icmpTarget)
|
||||||
|
|
||||||
healthErrCh, err := l.healthChecker.Start(ctx)
|
healthErrCh, err := l.healthChecker.Start(ctx)
|
||||||
l.healthServer.SetError(err)
|
l.healthServer.SetError(err)
|
||||||
|
|||||||
Reference in New Issue
Block a user