diff --git a/Dockerfile b/Dockerfile index 5395c553..056322eb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -163,7 +163,7 @@ ENV VPN_SERVICE_PROVIDER=pia \ LOG_LEVEL=info \ # Health HEALTH_SERVER_ADDRESS=127.0.0.1:9999 \ - HEALTH_TARGET_ADDRESS=cloudflare.com:443 \ + HEALTH_TARGET_ADDRESSES=cloudflare.com:443,github.com:443 \ HEALTH_ICMP_TARGET_IP=1.1.1.1 \ HEALTH_RESTART_VPN=on \ # DNS diff --git a/internal/configuration/settings/health.go b/internal/configuration/settings/health.go index 6f0b6763..6554acbc 100644 --- a/internal/configuration/settings/health.go +++ b/internal/configuration/settings/health.go @@ -17,10 +17,11 @@ type Health struct { // for the health check server. // It cannot be the empty string in the internal state. ServerAddress string - // TargetAddress is the address (host or host:port) + // TargetAddresses are the addresses (host or host:port) // to TCP TLS dial to periodically for the health check. - // It cannot be the empty string in the internal state. - TargetAddress string + // Addresses after the first one are used as fallbacks for retries. + // It cannot be empty in the internal state. + TargetAddresses []string // ICMPTargetIP is the IP address to use for ICMP echo requests // in the health checker. It can be set to an unspecified address (0.0.0.0) // such that the VPN server IP is used, which is also the default behavior. @@ -41,10 +42,10 @@ func (h Health) Validate() (err error) { func (h *Health) copy() (copied Health) { return Health{ - ServerAddress: h.ServerAddress, - TargetAddress: h.TargetAddress, - ICMPTargetIP: h.ICMPTargetIP, - RestartVPN: gosettings.CopyPointer(h.RestartVPN), + ServerAddress: h.ServerAddress, + TargetAddresses: h.TargetAddresses, + ICMPTargetIP: h.ICMPTargetIP, + RestartVPN: gosettings.CopyPointer(h.RestartVPN), } } @@ -53,14 +54,14 @@ func (h *Health) copy() (copied Health) { // settings. func (h *Health) OverrideWith(other Health) { h.ServerAddress = gosettings.OverrideWithComparable(h.ServerAddress, other.ServerAddress) - h.TargetAddress = gosettings.OverrideWithComparable(h.TargetAddress, other.TargetAddress) + h.TargetAddresses = gosettings.OverrideWithSlice(h.TargetAddresses, other.TargetAddresses) h.ICMPTargetIP = gosettings.OverrideWithComparable(h.ICMPTargetIP, other.ICMPTargetIP) h.RestartVPN = gosettings.OverrideWithPointer(h.RestartVPN, other.RestartVPN) } func (h *Health) SetDefaults() { h.ServerAddress = gosettings.DefaultComparable(h.ServerAddress, "127.0.0.1:9999") - h.TargetAddress = gosettings.DefaultComparable(h.TargetAddress, "cloudflare.com:443") + h.TargetAddresses = gosettings.DefaultSlice(h.TargetAddresses, []string{"cloudflare.com:443", "github.com:443"}) h.ICMPTargetIP = gosettings.DefaultComparable(h.ICMPTargetIP, netip.IPv4Unspecified()) // use the VPN server IP h.RestartVPN = gosettings.DefaultPointer(h.RestartVPN, true) } @@ -72,7 +73,10 @@ func (h Health) String() string { func (h Health) toLinesNode() (node *gotree.Node) { node = gotree.New("Health settings:") node.Appendf("Server listening address: %s", h.ServerAddress) - node.Appendf("Target address: %s", h.TargetAddress) + targetAddrs := node.Appendf("Target addresses:") + for _, targetAddr := range h.TargetAddresses { + targetAddrs.Append(targetAddr) + } icmpTarget := "VPN server IP" if !h.ICMPTargetIP.IsUnspecified() { icmpTarget = h.ICMPTargetIP.String() @@ -84,8 +88,8 @@ func (h Health) toLinesNode() (node *gotree.Node) { func (h *Health) Read(r *reader.Reader) (err error) { h.ServerAddress = r.String("HEALTH_SERVER_ADDRESS") - h.TargetAddress = r.String("HEALTH_TARGET_ADDRESS", - reader.RetroKeys("HEALTH_ADDRESS_TO_PING")) + h.TargetAddresses = r.CSV("HEALTH_TARGET_ADDRESSES", + reader.RetroKeys("HEALTH_ADDRESS_TO_PING", "HEALTH_TARGET_ADDRESS")) h.ICMPTargetIP, err = r.NetipAddr("HEALTH_ICMP_TARGET_IP") if err != nil { return err diff --git a/internal/configuration/settings/settings_test.go b/internal/configuration/settings/settings_test.go index 907655c5..231fc2e6 100644 --- a/internal/configuration/settings/settings_test.go +++ b/internal/configuration/settings/settings_test.go @@ -57,7 +57,9 @@ func Test_Settings_String(t *testing.T) { | └── Log level: INFO ├── Health settings: | ├── Server listening address: 127.0.0.1:9999 -| ├── Target address: cloudflare.com:443 +| ├── Target addresses: +| | ├── cloudflare.com:443 +| | └── github.com:443 | ├── ICMP target IP: VPN server IP | └── Restart VPN on healthcheck failure: yes ├── Shadowsocks server settings: diff --git a/internal/healthcheck/checker.go b/internal/healthcheck/checker.go index 74fc754b..0d6622d8 100644 --- a/internal/healthcheck/checker.go +++ b/internal/healthcheck/checker.go @@ -16,13 +16,13 @@ import ( ) type Checker struct { - tlsDialAddr string - dialer *net.Dialer - echoer *icmp.Echoer - dnsClient *dns.Client - logger Logger - icmpTarget netip.Addr - configMutex sync.Mutex + tlsDialAddrs []string + dialer *net.Dialer + echoer *icmp.Echoer + dnsClient *dns.Client + logger Logger + icmpTarget netip.Addr + configMutex sync.Mutex icmpNotPermitted bool smallCheckName string @@ -45,13 +45,13 @@ func NewChecker(logger Logger) *Checker { } } -// SetConfig sets the TCP+TLS dial address and the ICMP echo IP address +// SetConfig sets the TCP+TLS dial addresses and the ICMP echo IP address // to target by the [Checker]. // This function MUST be called before calling [Checker.Start]. -func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) { +func (c *Checker) SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr) { c.configMutex.Lock() defer c.configMutex.Unlock() - c.tlsDialAddr = tlsDialAddr + c.tlsDialAddrs = tlsDialAddrs c.icmpTarget = icmpTarget } @@ -63,17 +63,11 @@ func (c *Checker) SetConfig(tlsDialAddr string, icmpTarget netip.Addr) { // It returns an error if the initial TCP+TLS check fails. // The Checker has to be ultimately stopped by calling [Checker.Stop]. func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) { - if c.tlsDialAddr == "" || c.icmpTarget.IsUnspecified() { + if len(c.tlsDialAddrs) == 0 || c.icmpTarget.IsUnspecified() { panic("call Checker.SetConfig with non empty values before Checker.Start") } - // connection isn't under load yet when the checker starts, so a short - // 6 seconds timeout suffices and provides quick enough feedback that - // the new connection is not working. - const timeout = 6 * time.Second - tcpTLSCheckCtx, tcpTLSCheckCancel := context.WithTimeout(ctx, timeout) - err = tcpTLSCheck(tcpTLSCheckCtx, c.dialer, c.tlsDialAddr) - tcpTLSCheckCancel() + err = c.startupCheck(ctx) if err != nil { return nil, fmt.Errorf("startup check: %w", err) } @@ -123,6 +117,7 @@ func (c *Checker) Start(ctx context.Context) (runError <-chan error, err error) func (c *Checker) Stop() error { c.stop() <-c.done + c.tlsDialAddrs = nil c.icmpTarget = netip.Addr{} return nil } @@ -143,7 +138,7 @@ func (c *Checker) smallPeriodicCheck(ctx context.Context) error { 15 * time.Second, 30 * time.Second, } - check := func(ctx context.Context) error { + check := func(ctx context.Context, _ int) error { if c.icmpNotPermitted { return c.dnsClient.Check(ctx) } @@ -163,8 +158,9 @@ func (c *Checker) fullPeriodicCheck(ctx context.Context) error { // 20s timeout in case the connection is under stress // See https://github.com/qdm12/gluetun/issues/2270 tryTimeouts := []time.Duration{10 * time.Second, 15 * time.Second, 30 * time.Second} - check := func(ctx context.Context) error { - return tcpTLSCheck(ctx, c.dialer, c.tlsDialAddr) + check := func(ctx context.Context, try int) error { + tlsDialAddr := c.tlsDialAddrs[try%len(c.tlsDialAddrs)] + return tcpTLSCheck(ctx, c.dialer, tlsDialAddr) } return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check) } @@ -226,7 +222,7 @@ func makeAddressToDial(address string) (addressToDial string, err error) { var ErrAllCheckTriesFailed = errors.New("all check tries failed") func withRetries(ctx context.Context, tryTimeouts []time.Duration, - logger Logger, checkName string, check func(ctx context.Context) error, + logger Logger, checkName string, check func(ctx context.Context, try int) error, ) error { maxTries := len(tryTimeouts) type errData struct { @@ -237,7 +233,7 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration, for i, timeout := range tryTimeouts { start := time.Now() checkCtx, cancel := context.WithTimeout(ctx, timeout) - err := check(checkCtx) + err := check(checkCtx, i) cancel() switch { case err == nil: @@ -256,3 +252,48 @@ func withRetries(ctx context.Context, tryTimeouts []time.Duration, } return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", ")) } + +func (c *Checker) startupCheck(ctx context.Context) error { + // connection isn't under load yet when the checker starts, so a short + // 6 seconds timeout suffices and provides quick enough feedback that + // the new connection is not working. However, since the addresses to dial + // may be multiple, we run the check in parallel. If any succeeds, the check passes. + // This is to prevent false negatives at startup, if one of the addresses is down + // for external reasons. + const timeout = 6 * time.Second + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + errCh := make(chan error) + + for _, address := range c.tlsDialAddrs { + go func(addr string) { + err := tcpTLSCheck(ctx, c.dialer, addr) + errCh <- err + }(address) + } + + errs := make([]error, 0, len(c.tlsDialAddrs)) + success := false + for range c.tlsDialAddrs { + err := <-errCh + if err == nil { + success = true + cancel() + continue + } else if success { + continue // ignore canceled errors after success + } + + c.logger.Debugf("startup check parallel attempt failed: %s", err) + errs = append(errs, err) + } + if success { + return nil + } + + errStrings := make([]string, len(errs)) + for i, err := range errs { + errStrings[i] = fmt.Sprintf("parallel attempt %d/%d failed: %s", i+1, len(errs), err) + } + return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", ")) +} diff --git a/internal/healthcheck/checker_test.go b/internal/healthcheck/checker_test.go index 911bbe27..4f9419ed 100644 --- a/internal/healthcheck/checker_test.go +++ b/internal/healthcheck/checker_test.go @@ -18,11 +18,11 @@ func Test_Checker_fullcheck(t *testing.T) { t.Parallel() dialer := &net.Dialer{} - const address = "cloudflare.com:443" + addresses := []string{"badaddress:9876", "cloudflare.com:443", "google.com:443"} checker := &Checker{ - dialer: dialer, - tlsDialAddr: address, + dialer: dialer, + tlsDialAddrs: addresses, } canceledCtx, cancel := context.WithCancel(context.Background()) @@ -52,8 +52,8 @@ func Test_Checker_fullcheck(t *testing.T) { dialer := &net.Dialer{} checker := &Checker{ - dialer: dialer, - tlsDialAddr: listeningAddress.String(), + dialer: dialer, + tlsDialAddrs: []string{listeningAddress.String()}, } err = checker.fullPeriodicCheck(ctx) diff --git a/internal/vpn/interfaces.go b/internal/vpn/interfaces.go index 12b880d9..ea346e05 100644 --- a/internal/vpn/interfaces.go +++ b/internal/vpn/interfaces.go @@ -101,7 +101,7 @@ type CmdStarter interface { } type HealthChecker interface { - SetConfig(tlsDialAddr string, icmpTarget netip.Addr) + SetConfig(tlsDialAddrs []string, icmpTarget netip.Addr) Start(ctx context.Context) (runError <-chan error, err error) Stop() error } diff --git a/internal/vpn/tunnelup.go b/internal/vpn/tunnelup.go index 009b9f3d..8b5da91a 100644 --- a/internal/vpn/tunnelup.go +++ b/internal/vpn/tunnelup.go @@ -35,7 +35,7 @@ func (l *Loop) onTunnelUp(ctx, loopCtx context.Context, data tunnelUpData) { if icmpTarget.IsUnspecified() { icmpTarget = data.serverIP } - l.healthChecker.SetConfig(l.healthSettings.TargetAddress, icmpTarget) + l.healthChecker.SetConfig(l.healthSettings.TargetAddresses, icmpTarget) healthErrCh, err := l.healthChecker.Start(ctx) l.healthServer.SetError(err)