fix: create fresh timeout context for each filer retry attempt

The timeout context was created once at function start and reused across all retry attempts, so every later attempt ran against the same, progressively shorter (or already expired) deadline.

Problem flow:

    Line 244:
        timeoutCtx, cancel := context.WithTimeout(ctx, 5s)
        defer cancel()

    Retry 1, filer 0: client.LookupVolume(timeoutCtx, ...) ← 5s available ✅
    Retry 1, filer 1: client.LookupVolume(timeoutCtx, ...) ← 3s left
    Retry 1, filer 2: client.LookupVolume(timeoutCtx, ...) ← 0.5s left
    Retry 2, filer 0: client.LookupVolume(timeoutCtx, ...) ← EXPIRED! ❌

Result: once the shared deadline has passed, every remaining attempt fails with DeadlineExceeded, defeating the purpose of retries.

Fix: moved context.WithTimeout inside the per-filer loop, creating a fresh timeout context for each attempt:

    for x := 0; x < n; x++ {
        timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
        err := pb.WithGrpcFilerClient(..., func(client) {
            resp, err := client.LookupVolume(timeoutCtx, ...)
            ...
        })
        cancel() // Clean up immediately after call
    }

Benefits:
- Each filer attempt gets the full fc.grpcTimeout (default 5s)
- Retries actually have time to complete
- No context leaks (cancel is called after each attempt)
- More predictable timeout behavior

Example with fix:

    Retry 1, filer 0: fresh 5s timeout ✅
    Retry 1, filer 1: fresh 5s timeout ✅
    Retry 2, filer 0: fresh 5s timeout ✅

Total max time: 3 retries × 3 filers × 5s = 45s (plus backoff)

Note: The outer ctx (from caller) still provides overall cancellation if the caller cancels or times out the entire operation.
2 weeks ago · f0c27ffbb2
1 changed files with 9 additions and 9 deletions
--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -240,10 +240,6 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 	fc := p.filerClient
 	result := make(map[string][]Location)

-	// Create a timeout context for the gRPC call
-	timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
-	defer cancel()
-
 	// Convert grpcTimeout to milliseconds for the signature parameter
 	timeoutMs := int32(fc.grpcTimeout.Milliseconds())

@@ -272,6 +268,9 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s

 			filerAddress := fc.filerAddresses[i]

+			// Create a fresh timeout context for each filer attempt
+			// This ensures each retry gets the full grpcTimeout, not a diminishing deadline
+			timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
 			err := pb.WithGrpcFilerClient(false, timeoutMs, filerAddress, fc.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
 				resp, err := client.LookupVolume(timeoutCtx, &filer_pb.LookupVolumeRequest{
 					VolumeIds: volumeIds,
@@ -313,6 +312,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s

 				return nil
 			})
+			cancel() // Clean up timeout context immediately after call returns

 			if err != nil {
 				glog.V(1).Infof("FilerClient: filer %s lookup failed (attempt %d/%d, retry %d/%d): %v", filerAddress, x+1, n, retry+1, maxRetries, err)