From 85cbe7f7b252dad677eada13e961767f0d20cfac Mon Sep 17 00:00:00 2001 From: chrislu Date: Thu, 20 Nov 2025 17:53:44 -0800 Subject: [PATCH] refactor: make retry and timeout parameters configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made retry logic and gRPC timeouts configurable across FilerClient and MasterClient to support different deployment environments and network conditions. Problem 1: Hardcoded retry parameters in FilerClient waitTime := time.Second // Fixed at 1s maxRetries := 3 // Fixed at 3 attempts waitTime = waitTime * 3 / 2 // Fixed 1.5x multiplier Different environments have different needs: - Unstable networks: may want more retries (5) with longer waits (2s) - Low-latency production: may want fewer retries (2) with shorter waits (500ms) - Batch processing: may want a steeper backoff multiplier (2x) instead of 1.5x Problem 2: Hardcoded gRPC timeout in MasterClient timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) Master lookups may need different timeouts: - High-latency cross-region: may need a 10s timeout - Local network: may use a 2s timeout for faster failure detection Solution for FilerClient: 1. Added fields to FilerClientOption: - MaxRetries int (default: 3) - InitialRetryWait time.Duration (default: 1s) - RetryBackoffFactor float64 (default: 1.5) 2. Added fields to FilerClient: - maxRetries int - initialRetryWait time.Duration - retryBackoffFactor float64 3. Updated LookupVolumeIds to use configurable values: waitTime := fc.initialRetryWait maxRetries := fc.maxRetries for retry := 0; retry < maxRetries; retry++ { ... waitTime = time.Duration(float64(waitTime) * fc.retryBackoffFactor) } Solution for MasterClient: 1. Added grpcTimeout field to MasterClient (default: 5s) 2. Initialized it in NewMasterClient with a 5 * time.Second default 3. 
Updated masterVolumeProvider to use p.masterClient.grpcTimeout Benefits: ✓ Tunable for different network conditions and deployment scenarios ✓ Backward compatible (defaults match previous hardcoded values) ✓ No breaking changes to existing code ✓ Consistent configuration pattern across FilerClient and MasterClient Example usage: // Fast-fail for low-latency production with stable network fc := wdclient.NewFilerClient(filers, dialOpt, dc, &wdclient.FilerClientOption{ MaxRetries: 2, InitialRetryWait: 500 * time.Millisecond, RetryBackoffFactor: 2.0, // Double the wait after each retry GrpcTimeout: 2 * time.Second, }) // Patient retries for unstable network or batch processing fc := wdclient.NewFilerClient(filers, dialOpt, dc, &wdclient.FilerClientOption{ MaxRetries: 5, InitialRetryWait: 2 * time.Second, RetryBackoffFactor: 1.5, GrpcTimeout: 10 * time.Second, }) Note: the MasterClient timeout is currently fixed at 5 seconds when NewMasterClient constructs the client and cannot be overridden through NewMasterClient parameters. A future enhancement could add a MasterClientOption struct similar to FilerClientOption. 
--- weed/wdclient/masterclient.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go index 43f8079cf..0ed276172 100644 --- a/weed/wdclient/masterclient.go +++ b/weed/wdclient/masterclient.go @@ -35,7 +35,7 @@ func (p *masterVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds [] glog.V(2).Infof("Looking up %d volumes from master: %v", len(volumeIds), volumeIds) // Use a timeout for the master lookup to prevent indefinite blocking - timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + timeoutCtx, cancel := context.WithTimeout(ctx, p.masterClient.grpcTimeout) defer cancel() err := pb.WithMasterClient(false, p.masterClient.GetMaster(timeoutCtx), p.masterClient.grpcDialOption, false, func(client master_pb.SeaweedClient) error { @@ -113,6 +113,7 @@ type MasterClient struct { currentMasterLock sync.RWMutex masters pb.ServerDiscovery grpcDialOption grpc.DialOption + grpcTimeout time.Duration // Timeout for gRPC calls to master OnPeerUpdate func(update *master_pb.ClusterNodeUpdate, startFrom time.Time) OnPeerUpdateLock sync.RWMutex } @@ -125,6 +126,7 @@ func NewMasterClient(grpcDialOption grpc.DialOption, filerGroup string, clientTy rack: rack, masters: masters, grpcDialOption: grpcDialOption, + grpcTimeout: 5 * time.Second, // Default: 5 seconds for gRPC calls to master } // Create provider that references this MasterClient