From f0c27ffbb2059ebc18e9f851190e9d743b99da48 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Thu, 20 Nov 2025 16:37:08 -0800
Subject: [PATCH] fix: create fresh timeout context for each filer retry
 attempt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeout context was created once at function start and reused across
all retry attempts, causing subsequent retries to run with progressively
shorter (or expired) deadlines.

Problem flow:

  Line 244: timeoutCtx, cancel := context.WithTimeout(ctx, 5s)
            defer cancel()

  Retry 1, filer 0: client.LookupVolume(timeoutCtx, ...) ← 5s available ✅
  Retry 1, filer 1: client.LookupVolume(timeoutCtx, ...) ← 3s left
  Retry 1, filer 2: client.LookupVolume(timeoutCtx, ...) ← 0.5s left
  Retry 2, filer 0: client.LookupVolume(timeoutCtx, ...) ← EXPIRED! ❌

Result: Retries always fail with DeadlineExceeded, defeating the purpose
of retrying at all.

Fix: Moved context.WithTimeout inside the per-filer loop, creating a fresh
timeout context for each attempt:

  for x := 0; x < n; x++ {
      timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
      err := pb.WithGrpcFilerClient(..., func(client) {
          resp, err := client.LookupVolume(timeoutCtx, ...)
          ...
      })
      cancel() // Clean up immediately after call
  }

Benefits:
- Each filer attempt gets the full fc.grpcTimeout (default 5s)
- Retries actually have time to complete
- No context leaks (cancel is called after each attempt)
- More predictable timeout behavior

Example with fix:

  Retry 1, filer 0: fresh 5s timeout ✅
  Retry 1, filer 1: fresh 5s timeout ✅
  Retry 2, filer 0: fresh 5s timeout ✅

  Total max time: 3 retries × 3 filers × 5s = 45s (plus backoff)

Note: The outer ctx (from the caller) still provides overall cancellation
if the caller cancels or times out the entire operation.
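For illustration only, here is a minimal standalone sketch of the same
per-attempt timeout pattern outside the SeaweedFS codebase. attemptLookup,
lookupWithRetries, and their signatures are hypothetical stand-ins for the
filer lookup call, not code from this repository:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// attemptLookup stands in for one gRPC call such as client.LookupVolume.
// It respects the per-attempt deadline carried by ctx.
func attemptLookup(ctx context.Context, filer string) error {
	select {
	case <-time.After(200 * time.Millisecond): // simulated filer latency
		return nil
	case <-ctx.Done():
		return fmt.Errorf("lookup on %s: %w", filer, ctx.Err())
	}
}

// lookupWithRetries creates a fresh timeout context for every attempt,
// so each filer gets the full per-attempt budget instead of a shrinking one.
func lookupWithRetries(ctx context.Context, filers []string, perAttempt time.Duration, maxRetries int) error {
	var lastErr error
	for retry := 0; retry < maxRetries; retry++ {
		for _, filer := range filers {
			// Derived from the caller's ctx, so overall cancellation still applies.
			attemptCtx, cancel := context.WithTimeout(ctx, perAttempt)
			err := attemptLookup(attemptCtx, filer)
			cancel() // release the timer right away; no defer inside the loop
			if err == nil {
				return nil
			}
			lastErr = err
		}
	}
	return errors.Join(errors.New("all attempts failed"), lastErr)
}

func main() {
	err := lookupWithRetries(context.Background(), []string{"filer-0", "filer-1"}, 5*time.Second, 3)
	fmt.Println("result:", err)
}

Calling cancel() at the end of each iteration, rather than deferring it,
keeps the number of live contexts bounded regardless of how many attempts
the loop makes.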
---
 weed/wdclient/filer_client.go | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go
index 7821bf068..eb6a5b4fe 100644
--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -26,8 +26,8 @@ const (
 
 // filerHealth tracks the health status of a filer
 type filerHealth struct {
-	failureCount      int32 // atomic: consecutive failures
-	lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
+	failureCount      int32 // atomic: consecutive failures
+	lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
 }
 
 // FilerClient provides volume location services by querying a filer
@@ -198,12 +198,12 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	health := fc.filerHealth[index]
 	failureCount := atomic.LoadInt32(&health.failureCount)
-
+
 	// Allow up to 2 failures before skipping
 	if failureCount < 3 {
 		return false
 	}
-
+
 	// Re-check unhealthy filers every 30 seconds
 	lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
 	if lastFailureNs == 0 {
@@ -213,7 +213,7 @@ func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	if time.Since(lastFailureTime) > 30*time.Second {
 		return false // Time to re-check
 	}
-
+
 	return true // Skip this unhealthy filer
 }
 
@@ -240,10 +240,6 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 	fc := p.filerClient
 	result := make(map[string][]Location)
 
-	// Create a timeout context for the gRPC call
-	timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
-	defer cancel()
-
 	// Convert grpcTimeout to milliseconds for the signature parameter
 	timeoutMs := int32(fc.grpcTimeout.Milliseconds())
 
@@ -272,6 +268,9 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 
 		filerAddress := fc.filerAddresses[i]
 
+		// Create a fresh timeout context for each filer attempt
+		// This ensures each retry gets the full grpcTimeout, not a diminishing deadline
+		timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
 		err := pb.WithGrpcFilerClient(false, timeoutMs, filerAddress, fc.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
 			resp, err := client.LookupVolume(timeoutCtx, &filer_pb.LookupVolumeRequest{
 				VolumeIds: volumeIds,
@@ -313,6 +312,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 			return nil
 		})
+		cancel() // Clean up timeout context immediately after call returns
 		if err != nil {
 			glog.V(1).Infof("FilerClient: filer %s lookup failed (attempt %d/%d, retry %d/%d): %v",
 				filerAddress, x+1, n, retry+1, maxRetries, err)
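For reviewers: the unhealthy-filer skip logic that appears (whitespace-only
changes) in the hunks above boils down to the following reduced standalone
sketch. It is not the full SeaweedFS file; recordFailure and recordSuccess
are hypothetical helpers added only to show how the two fields would be
maintained around each lookup attempt:

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// filerHealth mirrors the two fields shown in the diff: a consecutive-failure
// counter and the last failure time, both accessed atomically.
type filerHealth struct {
	failureCount      int32 // atomic: consecutive failures
	lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
}

// shouldSkip follows the decision visible in shouldSkipUnhealthyFiler:
// allow up to 2 consecutive failures, then skip the filer, but re-check it
// after 30 seconds so a recovered filer comes back into rotation.
func (h *filerHealth) shouldSkip() bool {
	if atomic.LoadInt32(&h.failureCount) < 3 {
		return false
	}
	lastFailureNs := atomic.LoadInt64(&h.lastFailureTimeNs)
	if lastFailureNs == 0 {
		return false
	}
	if time.Since(time.Unix(0, lastFailureNs)) > 30*time.Second {
		return false // time to re-check
	}
	return true
}

// recordFailure and recordSuccess are hypothetical helpers, not part of the diff.
func (h *filerHealth) recordFailure() {
	atomic.AddInt32(&h.failureCount, 1)
	atomic.StoreInt64(&h.lastFailureTimeNs, time.Now().UnixNano())
}

func (h *filerHealth) recordSuccess() {
	atomic.StoreInt32(&h.failureCount, 0)
}

func main() {
	h := &filerHealth{}
	h.recordFailure()
	h.recordFailure()
	fmt.Println(h.shouldSkip()) // false: only 2 consecutive failures
	h.recordFailure()
	fmt.Println(h.shouldSkip()) // true: 3 recent consecutive failures
	h.recordSuccess()
	fmt.Println(h.shouldSkip()) // false: counter reset after a success
}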