feat(healthcheck): HEALTH_TARGET_ADDRESS -> HEALTH_TARGET_ADDRESSES

- Specify fallback addresses
- Default changed from cloudflare.com:443 to cloudflare.com:443,github.com:443
- Startup check dials every specified address in parallel under a shared 6s timeout and passes as soon as any dial succeeds
- Full periodic check cycles through the addresses, moving on to the next address each time a try fails and is retried
This commit is contained in:
Quentin McGaw
2025-11-19 15:41:21 +00:00
parent 03f1fea123
commit 482421dda3
7 changed files with 91 additions and 44 deletions

View File

@@ -163,7 +163,7 @@ ENV VPN_SERVICE_PROVIDER=pia \
LOG_LEVEL=info \ LOG_LEVEL=info \
# Health # Health
HEALTH_SERVER_ADDRESS=127.0.0.1:9999 \ HEALTH_SERVER_ADDRESS=127.0.0.1:9999 \
HEALTH_TARGET_ADDRESS=cloudflare.com:443 \ HEALTH_TARGET_ADDRESSES=cloudflare.com:443,github.com:443 \
HEALTH_ICMP_TARGET_IP=1.1.1.1 \ HEALTH_ICMP_TARGET_IP=1.1.1.1 \
HEALTH_RESTART_VPN=on \ HEALTH_RESTART_VPN=on \
# DNS # DNS

View File

@@ -17,10 +17,11 @@ type Health struct {
// for the health check server. // for the health check server.
// It cannot be the empty string in the internal state. // It cannot be the empty string in the internal state.
ServerAddress string ServerAddress string
// TargetAddress is the address (host or host:port) // TargetAddresses are the addresses (host or host:port)
// to TCP TLS dial to periodically for the health check. // to TCP TLS dial to periodically for the health check.
// It cannot be the empty string in the internal state. // Addresses after the first one are used as fallbacks for retries.
TargetAddress string // It cannot be empty in the internal state.
TargetAddresses []string
// ICMPTargetIP is the IP address to use for ICMP echo requests // ICMPTargetIP is the IP address to use for ICMP echo requests
// in the health checker. It can be set to an unspecified address (0.0.0.0) // in the health checker. It can be set to an unspecified address (0.0.0.0)
// such that the VPN server IP is used, which is also the default behavior. // such that the VPN server IP is used, which is also the default behavior.
@@ -41,10 +42,10 @@ func (h Health) Validate() (err error) {
func (h *Health) copy() (copied Health) { func (h *Health) copy() (copied Health) {
return Health{ return Health{
ServerAddress: h.ServerAddress, ServerAddress: h.ServerAddress,
TargetAddress: h.TargetAddress, TargetAddresses: h.TargetAddresses,
ICMPTargetIP: h.ICMPTargetIP, ICMPTargetIP: h.ICMPTargetIP,
RestartVPN: gosettings.CopyPointer(h.RestartVPN), RestartVPN: gosettings.CopyPointer(h.RestartVPN),
} }
} }
@@ -53,14 +54,14 @@ func (h *Health) copy() (copied Health) {
// settings. // settings.
func (h *Health) OverrideWith(other Health) { func (h *Health) OverrideWith(other Health) {
h.ServerAddress = gosettings.OverrideWithComparable(h.ServerAddress, other.ServerAddress) h.ServerAddress = gosettings.OverrideWithComparable(h.ServerAddress, other.ServerAddress)
h.TargetAddress = gosettings.OverrideWithComparable(h.TargetAddress, other.TargetAddress) h.TargetAddresses = gosettings.OverrideWithSlice(h.TargetAddresses, other.TargetAddresses)
h.ICMPTargetIP = gosettings.OverrideWithComparable(h.ICMPTargetIP, other.ICMPTargetIP) h.ICMPTargetIP = gosettings.OverrideWithComparable(h.ICMPTargetIP, other.ICMPTargetIP)
h.RestartVPN = gosettings.OverrideWithPointer(h.RestartVPN, other.RestartVPN) h.RestartVPN = gosettings.OverrideWithPointer(h.RestartVPN, other.RestartVPN)
} }
func (h *Health) SetDefaults() { func (h *Health) SetDefaults() {
h.ServerAddress = gosettings.DefaultComparable(h.ServerAddress, "127.0.0.1:9999") h.ServerAddress = gosettings.DefaultComparable(h.ServerAddress, "127.0.0.1:9999")
h.TargetAddress = gosettings.DefaultComparable(h.TargetAddress, "cloudflare.com:443") h.TargetAddresses = gosettings.DefaultSlice(h.TargetAddresses, []string{"cloudflare.com:443", "github.com:443"})
h.ICMPTargetIP = gosettings.DefaultComparable(h.ICMPTargetIP, netip.IPv4Unspecified()) // use the VPN server IP h.ICMPTargetIP = gosettings.DefaultComparable(h.ICMPTargetIP, netip.IPv4Unspecified()) // use the VPN server IP
h.RestartVPN = gosettings.DefaultPointer(h.RestartVPN, true) h.RestartVPN = gosettings.DefaultPointer(h.RestartVPN, true)
} }
@@ -72,7 +73,10 @@ func (h Health) String() string {
func (h Health) toLinesNode() (node *gotree.Node) { func (h Health) toLinesNode() (node *gotree.Node) {
node = gotree.New("Health settings:") node = gotree.New("Health settings:")
node.Appendf("Server listening address: %s", h.ServerAddress) node.Appendf("Server listening address: %s", h.ServerAddress)
node.Appendf("Target address: %s", h.TargetAddress) targetAddrs := node.Appendf("Target addresses:")
for _, targetAddr := range h.TargetAddresses {
targetAddrs.Append(targetAddr)
}
icmpTarget := "VPN server IP" icmpTarget := "VPN server IP"
if !h.ICMPTargetIP.IsUnspecified() { if !h.ICMPTargetIP.IsUnspecified() {
icmpTarget = h.ICMPTargetIP.String() icmpTarget = h.ICMPTargetIP.String()
@@ -84,8 +88,8 @@ func (h Health) toLinesNode() (node *gotree.Node) {
func (h *Health) Read(r *reader.Reader) (err error) { func (h *Health) Read(r *reader.Reader) (err error) {
h.ServerAddress = r.String("HEALTH_SERVER_ADDRESS") h.ServerAddress = r.String("HEALTH_SERVER_ADDRESS")
h.TargetAddress = r.String("HEALTH_TARGET_ADDRESS", h.TargetAddresses = r.CSV("HEALTH_TARGET_ADDRESSES",
reader.RetroKeys("HEALTH_ADDRESS_TO_PING")) reader.RetroKeys("HEALTH_ADDRESS_TO_PING", "HEALTH_TARGET_ADDRESS"))
h.ICMPTargetIP, err = r.NetipAddr("HEALTH_ICMP_TARGET_IP") h.ICMPTargetIP, err = r.NetipAddr("HEALTH_ICMP_TARGET_IP")
if err != nil { if err != nil {
return err return err

View File

@@ -57,7 +57,9 @@ func Test_Settings_String(t *testing.T) {
| └── Log level: INFO | └── Log level: INFO
├── Health settings: ├── Health settings:
| ├── Server listening address: 127.0.0.1:9999 | ├── Server listening address: 127.0.0.1:9999
| ├── Target address: cloudflare.com:443 | ├── Target addresses:
| | ├── cloudflare.com:443
| | └── github.com:443
| ├── ICMP target IP: VPN server IP | ├── ICMP target IP: VPN server IP
| └── Restart VPN on healthcheck failure: yes | └── Restart VPN on healthcheck failure: yes
├── Shadowsocks server settings: ├── Shadowsocks server settings:

View File

@@ -16,13 +16,13 @@ import (
) )
type Checker struct { type Checker struct {
tlsDialAddr string tlsDialAddrs []string
dialer *net.Dialer dialer *net.Dialer
echoer *icmp.Echoer echoer *icmp.Echoer
dnsClient *dns.Client dnsClient *dns.Client
logger Logger logger Logger
icmpTarget netip.Addr icmpTarget netip.Addr
configMutex sync.Mutex configMutex sync.Mutex
icmpNotPermitted bool icmpNotPermitted bool
smallCheckName string smallCheckName string
@@ -45,13 +45,13 @@ func NewChecker(logger Logger) *Checker {
} }
} }
// SetConfig sets the TCP+TLS dial address and the ICMP echo IP address // SetConfig sets the TCP+TLS dial addresses and the ICMP echo IP address
// to target by the [Checker]. // to target by the [Checker].
// This function MUST be called before calling [Checker.Start]. // This function MUST be called before calling [Checker.Start].
func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) { func (c *Checker) SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr) {
c.configMutex.Lock() c.configMutex.Lock()
defer c.configMutex.Unlock() defer c.configMutex.Unlock()
c.tlsDialAddr = tlsDialAddr c.tlsDialAddrs = tlsDialAddrs
c.icmpTarget = icmpTarget c.icmpTarget = icmpTarget
} }
@@ -63,17 +63,11 @@ func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) {
// It returns an error if the initial TCP+TLS check fails. // It returns an error if the initial TCP+TLS check fails.
// The Checker has to be ultimately stopped by calling [Checker.Stop]. // The Checker has to be ultimately stopped by calling [Checker.Stop].
func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) { func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) {
if c.tlsDialAddr == "" || c.icmpTarget.IsUnspecified() { if len(c.tlsDialAddrs) == 0 || c.icmpTarget.IsUnspecified() {
panic("call Checker.SetConfig with non empty values before Checker.Start") panic("call Checker.SetConfig with non empty values before Checker.Start")
} }
// connection isn't under load yet when the checker starts, so a short err = c.startupCheck(ctx)
// 6 seconds timeout suffices and provides quick enough feedback that
// the new connection is not working.
const timeout = 6 * time.Second
tcpTLSCheckCtx, tcpTLSCheckCancel := context.WithTimeout(ctx, timeout)
err = tcpTLSCheck(tcpTLSCheckCtx, c.dialer, c.tlsDialAddr)
tcpTLSCheckCancel()
if err != nil { if err != nil {
return nil, fmt.Errorf("startup check: %w", err) return nil, fmt.Errorf("startup check: %w", err)
} }
@@ -123,6 +117,7 @@ func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error)
func (c *Checker) Stop() error { func (c *Checker) Stop() error {
c.stop() c.stop()
<-c.done <-c.done
c.tlsDialAddrs = nil
c.icmpTarget = netip.Addr{} c.icmpTarget = netip.Addr{}
return nil return nil
} }
@@ -143,7 +138,7 @@ func (c *Checker) smallPeriodicCheck(ctx context.Context) error {
15 * time.Second, 15 * time.Second,
30 * time.Second, 30 * time.Second,
} }
check := func(ctx context.Context) error { check := func(ctx context.Context, _ int) error {
if c.icmpNotPermitted { if c.icmpNotPermitted {
return c.dnsClient.Check(ctx) return c.dnsClient.Check(ctx)
} }
@@ -163,8 +158,9 @@ func (c *Checker) fullPeriodicCheck(ctx context.Context) error {
// 20s timeout in case the connection is under stress // 20s timeout in case the connection is under stress
// See https://github.com/qdm12/gluetun/issues/2270 // See https://github.com/qdm12/gluetun/issues/2270
tryTimeouts := []time.Duration{10 * time.Second, 15 * time.Second, 30 * time.Second} tryTimeouts := []time.Duration{10 * time.Second, 15 * time.Second, 30 * time.Second}
check := func(ctx context.Context) error { check := func(ctx context.Context, try int) error {
return tcpTLSCheck(ctx, c.dialer, c.tlsDialAddr) tlsDialAddr := c.tlsDialAddrs[try%len(c.tlsDialAddrs)]
return tcpTLSCheck(ctx, c.dialer, tlsDialAddr)
} }
return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check) return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check)
} }
@@ -226,7 +222,7 @@ func makeAddressToDial(address string) (addressToDial string, err error) {
var ErrAllCheckTriesFailed = errors.New("all check tries failed") var ErrAllCheckTriesFailed = errors.New("all check tries failed")
func withRetries(ctx context.Context, tryTimeouts []time.Duration, func withRetries(ctx context.Context, tryTimeouts []time.Duration,
logger Logger, checkName string, check func(ctx context.Context) error, logger Logger, checkName string, check func(ctx context.Context, try int) error,
) error { ) error {
maxTries := len(tryTimeouts) maxTries := len(tryTimeouts)
type errData struct { type errData struct {
@@ -237,7 +233,7 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration,
for i, timeout := range tryTimeouts { for i, timeout := range tryTimeouts {
start := time.Now() start := time.Now()
checkCtx, cancel := context.WithTimeout(ctx, timeout) checkCtx, cancel := context.WithTimeout(ctx, timeout)
err := check(checkCtx) err := check(checkCtx, i)
cancel() cancel()
switch { switch {
case err == nil: case err == nil:
@@ -256,3 +252,48 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration,
} }
return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", ")) return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", "))
} }
// startupCheck runs one TCP+TLS check per configured dial address, all
// concurrently, and reports success if at least one of them succeeds.
//
// The connection is not under load yet when the checker starts, so a short
// 6 seconds timeout shared by all the dials is enough and gives quick feedback
// that a new connection is not working. Dialing every address in parallel
// avoids a false negative at startup when one of the addresses is down for
// reasons unrelated to the connection.
//
// It returns nil on the first success. If every attempt fails, it returns an
// error wrapping ErrAllCheckTriesFailed listing each recorded failure.
func (c *Checker) startupCheck(ctx context.Context) error {
	const timeout = 6 * time.Second
	checkCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	addresses := c.tlsDialAddrs

	// The channel is unbuffered: each goroutine blocks on its send until the
	// collection loop below receives it, and that loop receives exactly one
	// result per address, so no goroutine is left behind.
	results := make(chan error)
	for _, address := range addresses {
		go func(address string) {
			results <- tcpTLSCheck(checkCtx, c.dialer, address)
		}(address)
	}

	failures := make([]error, 0, len(addresses))
	passed := false
	for pending := len(addresses); pending > 0; pending-- {
		err := <-results
		switch {
		case err == nil:
			passed = true
			// Cancel the shared context so the dials still in flight
			// can stop early instead of running until the timeout.
			cancel()
		case passed:
			// Errors arriving after a success are expected
			// (canceled dials) and are deliberately ignored.
		default:
			c.logger.Debugf("startup check parallel attempt failed: %s", err)
			failures = append(failures, err)
		}
	}

	if passed {
		return nil
	}

	messages := make([]string, 0, len(failures))
	for i, failure := range failures {
		message := fmt.Sprintf("parallel attempt %d/%d failed: %s", i+1, len(failures), failure)
		messages = append(messages, message)
	}
	return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(messages, ", "))
}

View File

@@ -18,11 +18,11 @@ func Test_Checker_fullcheck(t *testing.T) {
t.Parallel() t.Parallel()
dialer := &net.Dialer{} dialer := &net.Dialer{}
const address = "cloudflare.com:443" addresses := []string{"badaddress:9876", "cloudflare.com:443", "google.com:443"}
checker := &Checker{ checker := &Checker{
dialer: dialer, dialer: dialer,
tlsDialAddr: address, tlsDialAddrs: addresses,
} }
canceledCtx, cancel := context.WithCancel(context.Background()) canceledCtx, cancel := context.WithCancel(context.Background())
@@ -52,8 +52,8 @@ func Test_Checker_fullcheck(t *testing.T) {
dialer := &net.Dialer{} dialer := &net.Dialer{}
checker := &Checker{ checker := &Checker{
dialer: dialer, dialer: dialer,
tlsDialAddr: listeningAddress.String(), tlsDialAddrs: []string{listeningAddress.String()},
} }
err = checker.fullPeriodicCheck(ctx) err = checker.fullPeriodicCheck(ctx)

View File

@@ -101,7 +101,7 @@ type CmdStarter interface {
} }
type HealthChecker interface { type HealthChecker interface {
SetConfig(tlsDialAddr string, icmpTarget netip.Addr) SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr)
Start(ctx context.Context) (runError <-chan error, err error) Start(ctx context.Context) (runError <-chan error, err error)
Stop() error Stop() error
} }

View File

@@ -35,7 +35,7 @@ func (l *Loop) onTunnelUp(ctx, loopCtx context.Context, data tunnelUpData) {
if icmpTarget.IsUnspecified() { if icmpTarget.IsUnspecified() {
icmpTarget = data.serverIP icmpTarget = data.serverIP
} }
l.healthChecker.SetConfig(l.healthSettings.TargetAddress, icmpTarget) l.healthChecker.SetConfig(l.healthSettings.TargetAddresses, icmpTarget)
healthErrCh, err := l.healthChecker.Start(ctx) healthErrCh, err := l.healthChecker.Start(ctx)
l.healthServer.SetError(err) l.healthServer.SetError(err)