feat(healthcheck): log duration for each failed attempt

This commit is contained in:
Quentin McGaw
2025-11-15 16:45:03 +00:00
parent 6023eb1878
commit 113c113615

View File

@@ -131,9 +131,7 @@ func (c *Checker) smallPeriodicCheck(ctx context.Context) error {
c.configMutex.Lock() c.configMutex.Lock()
ip := c.icmpTarget ip := c.icmpTarget
c.configMutex.Unlock() c.configMutex.Unlock()
const maxTries = 3 tryTimeouts := []time.Duration{10 * time.Second, 20 * time.Second, 30 * time.Second}
const timeout = 10 * time.Second
const extraTryTime = 10 * time.Second // 10s added for each subsequent retry
check := func(ctx context.Context) error { check := func(ctx context.Context) error {
if c.icmpNotPermitted { if c.icmpNotPermitted {
return c.dnsClient.Check(ctx) return c.dnsClient.Check(ctx)
@@ -147,19 +145,17 @@ func (c *Checker) smallPeriodicCheck(ctx context.Context) error {
} }
return err return err
} }
return withRetries(ctx, maxTries, timeout, extraTryTime, c.logger, c.smallCheckName, check) return withRetries(ctx, tryTimeouts, c.logger, c.smallCheckName, check)
} }
func (c *Checker) fullPeriodicCheck(ctx context.Context) error { func (c *Checker) fullPeriodicCheck(ctx context.Context) error {
const maxTries = 2
// 20s timeout in case the connection is under stress // 20s timeout in case the connection is under stress
// See https://github.com/qdm12/gluetun/issues/2270 // See https://github.com/qdm12/gluetun/issues/2270
const timeout = 20 * time.Second tryTimeouts := []time.Duration{20 * time.Second, 30 * time.Second}
const extraTryTime = 10 * time.Second // 10s added for each subsequent retry
check := func(ctx context.Context) error { check := func(ctx context.Context) error {
return tcpTLSCheck(ctx, c.dialer, c.tlsDialAddr) return tcpTLSCheck(ctx, c.dialer, c.tlsDialAddr)
} }
return withRetries(ctx, maxTries, timeout, extraTryTime, c.logger, "TCP+TLS dial", check) return withRetries(ctx, tryTimeouts, c.logger, "TCP+TLS dial", check)
} }
func tcpTLSCheck(ctx context.Context, dialer *net.Dialer, targetAddress string) error { func tcpTLSCheck(ctx context.Context, dialer *net.Dialer, targetAddress string) error {
@@ -218,13 +214,17 @@ func makeAddressToDial(address string) (addressToDial string, err error) {
var ErrAllCheckTriesFailed = errors.New("all check tries failed") var ErrAllCheckTriesFailed = errors.New("all check tries failed")
func withRetries(ctx context.Context, maxTries uint, tryTimeout, extraTryTime time.Duration, func withRetries(ctx context.Context, tryTimeouts []time.Duration,
logger Logger, checkName string, check func(ctx context.Context) error, logger Logger, checkName string, check func(ctx context.Context) error,
) error { ) error {
try := uint(0) maxTries := len(tryTimeouts)
var errs []error type errData struct {
for { err error
timeout := tryTimeout + time.Duration(try)*extraTryTime //nolint:gosec duration time.Duration
}
errs := make([]errData, maxTries)
for i, timeout := range tryTimeouts {
start := time.Now()
checkCtx, cancel := context.WithTimeout(ctx, timeout) checkCtx, cancel := context.WithTimeout(ctx, timeout)
err := check(checkCtx) err := check(checkCtx)
cancel() cancel()
@@ -234,17 +234,14 @@ func withRetries(ctx context.Context, maxTries uint, tryTimeout, extraTryTime ti
case ctx.Err() != nil: case ctx.Err() != nil:
return fmt.Errorf("%s: %w", checkName, ctx.Err()) return fmt.Errorf("%s: %w", checkName, ctx.Err())
} }
logger.Debugf("%s attempt %d/%d failed: %s", checkName, try+1, maxTries, err) logger.Debugf("%s attempt %d/%d failed: %s", checkName, i+1, maxTries, err)
errs = append(errs, err) errs[i].err = err
try++ errs[i].duration = time.Since(start)
if try < maxTries {
continue
}
errStrings := make([]string, len(errs))
for i, err := range errs {
errStrings[i] = fmt.Sprintf("attempt %d: %s", i+1, err.Error())
}
return fmt.Errorf("%w: after %d %s attempts (%s)",
ErrAllCheckTriesFailed, maxTries, checkName, strings.Join(errStrings, "; "))
} }
errStrings := make([]string, len(errs))
for i, err := range errs {
errStrings[i] = fmt.Sprintf("attempt %d (%s): %s", i+1, err.duration, err.err)
}
return fmt.Errorf("%w: %s", ErrAllCheckTriesFailed, strings.Join(errStrings, ", "))
} }